Skip to content

Commit 7e7a907

Browse files
Merging slight inferring improvements into stable (#10156)
1 parent 89ce2f9 commit 7e7a907

File tree

4 files changed

+31
-24
lines changed

4 files changed

+31
-24
lines changed

ydb/core/external_sources/object_storage/inference/arrow_inferencinator.cpp

+7-7
Original file line numberDiff line numberDiff line change
@@ -184,14 +184,14 @@ std::variant<ArrowFields, TString> InferCsvTypes(std::shared_ptr<arrow::io::Rand
184184
.Value(&reader);
185185

186186
if (!readerStatus.ok()) {
187-
return TString{TStringBuilder{} << "couldn't open csv/tsv file, check format and compression params: " << readerStatus.ToString()};
187+
return TString{TStringBuilder{} << "couldn't open csv/tsv file, check format and compression parameters: " << readerStatus.ToString()};
188188
}
189189

190190
std::shared_ptr<arrow::Table> table;
191191
auto tableRes = reader->Read().Value(&table);
192192

193193
if (!tableRes.ok()) {
194-
return TStringBuilder{} << "couldn't parse csv/tsv file, check format and compression params: " << tableRes.ToString();
194+
return TStringBuilder{} << "couldn't parse csv/tsv file, check format and compression parameters: " << tableRes.ToString();
195195
}
196196

197197
return table->fields();
@@ -202,19 +202,19 @@ std::variant<ArrowFields, TString> InferParquetTypes(std::shared_ptr<arrow::io::
202202
builder.properties(parquet::ArrowReaderProperties(false));
203203
auto openStatus = builder.Open(std::move(file));
204204
if (!openStatus.ok()) {
205-
return TStringBuilder{} << "couldn't open parquet file, check format params: " << openStatus.ToString();
205+
return TStringBuilder{} << "couldn't open parquet file, check format parameters: " << openStatus.ToString();
206206
}
207207

208208
std::unique_ptr<parquet::arrow::FileReader> reader;
209209
auto readerStatus = builder.Build(&reader);
210210
if (!readerStatus.ok()) {
211-
return TStringBuilder{} << "couldn't read parquet file, check format params: " << readerStatus.ToString();
211+
return TStringBuilder{} << "couldn't read parquet file, check format parameters: " << readerStatus.ToString();
212212
}
213213

214214
std::shared_ptr<arrow::Schema> schema;
215215
auto schemaRes = reader->GetSchema(&schema);
216216
if (!schemaRes.ok()) {
217-
return TStringBuilder{} << "couldn't parse parquet file, check format params: " << schemaRes.ToString();
217+
return TStringBuilder{} << "couldn't parse parquet file, check format parameters: " << schemaRes.ToString();
218218
}
219219

220220
return schema->fields();
@@ -235,14 +235,14 @@ std::variant<ArrowFields, TString> InferJsonTypes(std::shared_ptr<arrow::io::Ran
235235
).Value(&reader);
236236

237237
if (!readerStatus.ok()) {
238-
return TString{TStringBuilder{} << "couldn't open json file, check format and compression params: " << readerStatus.ToString()};
238+
return TString{TStringBuilder{} << "couldn't open json file, check format and compression parameters: " << readerStatus.ToString()};
239239
}
240240

241241
std::shared_ptr<arrow::Table> table;
242242
auto tableRes = reader->Read().Value(&table);
243243

244244
if (!tableRes.ok()) {
245-
return TString{TStringBuilder{} << "couldn't parse json file, check format and compression params: " << tableRes.ToString()};
245+
return TString{TStringBuilder{} << "couldn't parse json file, check format and compression parameters: " << tableRes.ToString()};
246246
}
247247

248248
return table->fields();

ydb/core/external_sources/object_storage/inference/infer_config.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ std::shared_ptr<FormatConfig> MakeFormatConfig(const THashMap<TString, TString>&
6060
EFileFormat format;
6161
if (auto formatPtr = params.FindPtr("format"); formatPtr) {
6262
format = ConvertFileFormat(*formatPtr);
63+
} else {
64+
throw yexception() << "format unspecified, use format parameter with type inferring";
6365
}
6466

6567
if (auto delimiter = params.FindPtr("csvdelimiter"); delimiter) {

ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp

+21-16
Original file line numberDiff line numberDiff line change
@@ -126,11 +126,11 @@
126126
LOG_TRACE_S(GetActorContext(), NKikimrServices::KQP_COMPUTE, "TS3ReadCoroImpl: " << SelfActorId << ", CA: " << ComputeActorId << ", TxId: " << TxId \
127127
<< " [" << Path << "]. RETRY{ Offset: " << RetryStuff->Offset << ", Delay: " << RetryStuff->NextRetryDelay << ", RequestId: " << RetryStuff->RequestId << "}. " << stream)
128128
129-
#define THROW_ARROW_NOT_OK(status) \
129+
#define THROW_ARROW_NOT_OK(code, status) \
130130
do \
131131
{ \
132132
if (::arrow::Status _s = (status); !_s.ok()) \
133-
throw yexception() << _s.ToString(); \
133+
ythrow TCodeLineException(code) << _s.ToString(); \
134134
} while (false)
135135
136136
namespace NYql::NDq {
@@ -575,7 +575,7 @@ class TS3ReadCoroImpl : public TActorCoroImpl {
575575
void HandleEvent(TEvS3Provider::TEvReadResult2::THandle& event) {
576576
577577
if (event.Get()->Failure) {
578-
throw yexception() << event.Get()->Issues.ToOneLineString();
578+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR) << event.Get()->Issues.ToOneLineString();
579579
}
580580
auto readyRange = event.Get()->ReadRange;
581581
LOG_CORO_D("Download FINISHED [" << readyRange.Offset << "-" << readyRange.Length << "], cookie: " << event.Cookie);
@@ -773,9 +773,7 @@ class TS3ReadCoroImpl : public TActorCoroImpl {
773773
if (StopIfConsumedEnough(numRows)) {
774774
isCancelled = true;
775775
}
776-
if (!status.ok()) {
777-
throw yexception() << status.ToString();
778-
}
776+
ThrowParquetNotOk(status);
779777
SourceContext->UpdateProgress(downloadedBytes, decodedBytes, table->num_rows());
780778
if (RawInflightSize) {
781779
RawInflightSize->Sub(downloadedBytes);
@@ -862,9 +860,7 @@ class TS3ReadCoroImpl : public TActorCoroImpl {
862860
if (StopIfConsumedEnough(numRows)) {
863861
isCancelled = true;
864862
}
865-
if (!status.ok()) {
866-
throw yexception() << status.ToString();
867-
}
863+
ThrowParquetNotOk(status);
868864
SourceContext->UpdateProgress(downloadedBytes, decodedBytes, table->num_rows());
869865
if (isCancelled) {
870866
LOG_CORO_D("RunCoroBlockArrowParserOverFile - STOPPED ON SATURATION");
@@ -1177,6 +1173,11 @@ class TS3ReadCoroImpl : public TActorCoroImpl {
11771173
// Stop any activity instantly
11781174
RetryStuff->Cancel();
11791175
return;
1176+
} catch (const TCodeLineException& err) {
1177+
LOG_CORO_E(err.what());
1178+
Issues.AddIssue(err.GetRawMessage());
1179+
FatalCode = static_cast<NYql::NDqProto::StatusIds::StatusCode>(err.Code);
1180+
RetryStuff->Cancel();
11801181
} catch (const std::exception& err) {
11811182
Issues.AddIssue(TIssue(err.what()));
11821183
FatalCode = NYql::NDqProto::StatusIds::INTERNAL_ERROR;
@@ -2029,11 +2030,11 @@ NDB::DataTypePtr MetaToClickHouse(const TType* type, NSerialization::TSerializat
20292030
return std::make_shared<const NDB::DataTypeDecimal<NDB::Decimal128>>(precision, scale);
20302031
}
20312032
default:
2032-
throw yexception() << "Unsupported data slot in MetaToClickHouse: " << slot;
2033+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR) << "Unsupported data slot in MetaToClickHouse: " << slot;
20332034
}
20342035
}
20352036
default:
2036-
throw yexception() << "Unsupported type kind in MetaToClickHouse: " << type->GetKindAsStr();
2037+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR) << "Unsupported type kind in MetaToClickHouse: " << type->GetKindAsStr();
20372038
}
20382039
return nullptr;
20392040
}
@@ -2104,17 +2105,18 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor(
21042105
if (hasDirectories) {
21052106
auto pathPatternValue = settings.find("pathpattern");
21062107
if (pathPatternValue == settings.cend()) {
2107-
ythrow yexception() << "'pathpattern' must be configured for directory listing";
2108+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR)
2109+
<< "'pathpattern' must be configured for directory listing";
21082110
}
21092111
pathPattern = pathPatternValue->second;
21102112

21112113
auto pathPatternVariantValue = settings.find("pathpatternvariant");
21122114
if (pathPatternVariantValue == settings.cend()) {
2113-
ythrow yexception()
2115+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR)
21142116
<< "'pathpatternvariant' must be configured for directory listing";
21152117
}
21162118
if (!TryFromString(pathPatternVariantValue->second, pathPatternVariant)) {
2117-
ythrow yexception()
2119+
ythrow TCodeLineException(NYql::NDqProto::StatusIds::INTERNAL_ERROR)
21182120
<< "Unknown 'pathpatternvariant': " << pathPatternVariantValue->second;
21192121
}
21202122
}
@@ -2197,13 +2199,16 @@ std::pair<NYql::NDq::IDqComputeActorAsyncInput*, IActor*> CreateS3ReadActor(
21972199
std::shared_ptr<arrow::DataType> dataType;
21982200

21992201
YQL_ENSURE(ConvertArrowType(memberType, dataType, true), "Unsupported arrow type");
2200-
THROW_ARROW_NOT_OK(builder.AddField(std::make_shared<arrow::Field>(std::string(memberName), dataType, memberType->IsOptional())));
2202+
THROW_ARROW_NOT_OK(
2203+
NYql::NDqProto::StatusIds::INTERNAL_ERROR,
2204+
builder.AddField(std::make_shared<arrow::Field>(std::string(memberName), dataType, memberType->IsOptional()))
2205+
);
22012206
readSpec->ColumnReorder.push_back(i);
22022207
readSpec->RowSpec.emplace(memberName, memberType);
22032208
}
22042209

22052210
auto res = builder.Finish();
2206-
THROW_ARROW_NOT_OK(res.status());
2211+
THROW_ARROW_NOT_OK(NYql::NDqProto::StatusIds::INTERNAL_ERROR, res.status());
22072212
readSpec->ArrowSchema = std::move(res).ValueOrDie();
22082213
} else {
22092214
readSpec->CHColumns.resize(structType->GetMembersCount());

ydb/tests/fq/s3/test_s3_0.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ def test_inference_file_error(self, kikimr, s3, client, unique_prefix):
339339

340340
query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id
341341
client.wait_query_status(query_id, fq.QueryMeta.FAILED)
342-
assert "couldn\\'t open csv/tsv file, check format and compression params:" in str(
342+
assert "couldn\\'t open csv/tsv file, check format and compression parameters:" in str(
343343
client.describe_query(query_id).result
344344
)
345345

0 commit comments

Comments
 (0)