Skip to content

Commit d5bcd8a

Browse files
authored
Merge ce819a2 into d4c13e3
2 parents d4c13e3 + ce819a2 commit d5bcd8a

File tree

13 files changed

+186
-14
lines changed

13 files changed

+186
-14
lines changed

ydb/core/external_sources/object_storage.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ struct TObjectStorageExternalSource : public IExternalSource {
6767
}
6868
}
6969

70-
if (auto issues = Validate(schema, objectStorage, PathsLimit)) {
70+
if (auto issues = Validate(schema, objectStorage, PathsLimit, general.location())) {
7171
ythrow TExternalSourceException() << issues.ToString();
7272
}
7373

@@ -136,9 +136,15 @@ struct TObjectStorageExternalSource : public IExternalSource {
136136
}
137137

138138
template<typename TScheme, typename TObjectStorage>
139-
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) {
139+
static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) {
140140
NYql::TIssues issues;
141-
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting()));
141+
if (TString errorString; !NYql::NS3::ValidateWildcards(location, errorString)) {
142+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString));
143+
}
144+
if (objectStorage.partitioned_by_size() && NYql::NS3::HasWildcards(location)) {
145+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards"));
146+
}
147+
issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location));
142148
issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by()));
143149
if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) {
144150
try {
@@ -160,11 +166,17 @@ struct TObjectStorageExternalSource : public IExternalSource {
160166
return issues;
161167
}
162168

163-
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting) {
169+
static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map<TString, TString>& formatSetting, const TString& location) {
164170
NYql::TIssues issues;
165171
issues.AddIssues(ValidateDateFormatSetting(formatSetting));
166172
for (const auto& [key, value]: formatSetting) {
167173
if (key == "file_pattern"sv) {
174+
if (TString errorString; !NYql::NS3::ValidateWildcards(value, errorString)) {
175+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << errorString));
176+
}
177+
if (value && !location.EndsWith("/")) {
178+
issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern"));
179+
}
168180
continue;
169181
}
170182

@@ -631,8 +643,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
631643
return MakeIntrusive<TObjectStorageExternalSource>(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer, allowLocalFiles);
632644
}
633645

634-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) {
635-
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit);
646+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) {
647+
return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location);
636648
}
637649

638650
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings) {

ydb/core/external_sources/object_storage.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector<TRegEx
1616
bool enableInfer,
1717
bool allowLocalFiles);
1818

19-
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit);
19+
NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location);
2020

2121
NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map<TString, TString>& formatSetting, bool matchAllSettings = false);
2222

ydb/core/external_sources/object_storage_ut.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) {
2929
general.mutable_attributes()->insert({"projection.h", "b"});
3030
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified");
3131
}
32+
33+
Y_UNIT_TEST(WildcardsValidation) {
34+
auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false);
35+
NKikimrExternalSources::TSchema schema;
36+
37+
{ // location
38+
NKikimrExternalSources::TGeneral general;
39+
general.set_location("}");
40+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '}' contains invalid wildcard:");
41+
}
42+
43+
{ // file pattern
44+
NKikimrExternalSources::TGeneral general;
45+
general.mutable_attributes()->insert({"file_pattern", "}"});
46+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '}' contains invalid wildcard:");
47+
general.set_location("/test_file");
48+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern");
49+
}
50+
51+
{ // partitioned by
52+
NKikimrExternalSources::TGeneral general;
53+
general.set_location("*");
54+
general.mutable_attributes()->insert({"partitioned_by", "[year]"});
55+
UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards");
56+
}
57+
}
3258
}
3359

3460
} // NKikimr

ydb/core/fq/libs/control_plane_storage/request_validators.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t maxSize, const TSet<FederatedQ
121121
case FederatedQuery::BindingSetting::kObjectStorage:
122122
const FederatedQuery::ObjectStorageBinding objectStorage = setting.object_storage();
123123
for (const auto& subset: objectStorage.subset()) {
124-
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit));
124+
issues.AddIssues(NKikimr::NExternalSource::Validate(subset.schema(), subset, pathsLimit, subset.path_pattern()));
125125
}
126126
break;
127127
}

ydb/core/kqp/gateway/utils/scheme_helpers.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ void FillCreateExternalTableColumnDesc(NKikimrSchemeOp::TExternalTableDescriptio
8888
columnDesc.SetNotNull(columnIt->second.NotNull);
8989
}
9090
NKikimrExternalSources::TGeneral general;
91+
general.set_location(settings.Location);
9192
auto& attributes = *general.mutable_attributes();
9293
for (const auto& [key, value]: settings.SourceTypeParameters) {
9394
attributes.insert({key, value});

ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
3535
const TString externalDataSourceName = "/Root/external_data_source";
3636
const TString externalTableName = "/Root/test_binding_resolve";
3737
const TString bucket = "test_bucket1";
38-
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object";
38+
const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "*?{}") << "_object";
3939

4040
CreateBucketWithObject(bucket, object, TEST_CONTENT);
4141

@@ -2143,7 +2143,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
21432143

21442144
Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) {
21452145
const TString externalDataSourceName = "/Root/external_data_source";
2146-
const TString bucket = "test_bucket1";
2146+
const TString bucket = "test_bucket12";
21472147

21482148
CreateBucketWithObject(bucket, "test_object", "");
21492149

@@ -2181,6 +2181,60 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) {
21812181
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
21822182
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString());
21832183
}
2184+
2185+
Y_UNIT_TEST(TestWildcardValidation) {
2186+
const TString bucket = "test_bucket13";
2187+
2188+
CreateBucket(bucket);
2189+
2190+
auto kikimr = NTestUtils::MakeKikimrRunner();
2191+
2192+
auto tc = kikimr->GetTableClient();
2193+
auto session = tc.CreateSession().GetValueSync().GetSession();
2194+
const TString query = fmt::format(R"(
2195+
CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH (
2196+
SOURCE_TYPE="ObjectStorage",
2197+
LOCATION="{location}",
2198+
AUTH_METHOD="NONE"
2199+
);)",
2200+
"location"_a = GetBucketLocation(bucket)
2201+
);
2202+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
2203+
UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString());
2204+
2205+
auto db = kikimr->GetQueryClient();
2206+
2207+
{ // path validation
2208+
const TString sql = R"(
2209+
SELECT * FROM `/Root/external_data_source`.`/}` WITH (
2210+
SCHEMA = (data String),
2211+
FORMAT = "csv_with_names"
2212+
))";
2213+
2214+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
2215+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
2216+
2217+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
2218+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
2219+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/}' contains invalid wildcard:");
2220+
}
2221+
2222+
{ // file pattern validation
2223+
const TString sql = R"(
2224+
SELECT * FROM `/Root/external_data_source`.`/` WITH (
2225+
SCHEMA = (data String),
2226+
FORMAT = "csv_with_names",
2227+
FILE_PATTERN = "}"
2228+
))";
2229+
2230+
auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync();
2231+
UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString());
2232+
2233+
NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver());
2234+
UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString());
2235+
UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '}' contains invalid wildcard:");
2236+
}
2237+
}
21842238
}
21852239

21862240
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) {
215215
};
216216
TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query");
217217
}
218+
219+
Y_UNIT_TEST(ExternalTableDdlLocationValidation) {
220+
auto kikimr = NTestUtils::MakeKikimrRunner();
221+
auto db = kikimr->GetTableClient();
222+
auto session = db.CreateSession().GetValueSync().GetSession();
223+
auto query = TStringBuilder() << R"(
224+
CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH (
225+
SOURCE_TYPE="ObjectStorage",
226+
LOCATION="my-bucket",
227+
AUTH_METHOD="NONE"
228+
);
229+
CREATE EXTERNAL TABLE `/Root/ExternalTable` (
230+
Key Uint64,
231+
Value String
232+
) WITH (
233+
DATA_SOURCE="/Root/ExternalDataSource",
234+
LOCATION="}"
235+
);)";
236+
auto result = session.ExecuteSchemeQuery(query).GetValueSync();
237+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR);
238+
UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '}' contains invalid wildcard:");
239+
}
218240
}
219241

220242
} // namespace NKikimr::NKqp

ydb/core/protos/external_sources.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ message TSchema {
1111

1212
message TGeneral {
1313
map<string, string> attributes = 1 [(Ydb.size).le = 100];
14+
optional string location = 2;
1415
}
1516

1617
message TObjectStorage {

ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ std::pair<TPathFilter, TEarlyStopChecker> MakeFilterRegexp(const TString& regex,
4747
} else {
4848
re = std::make_shared<RE2>(re2::StringPiece(regex), RE2::Options());
4949
}
50+
Y_ENSURE(re->ok());
5051

5152
const size_t numGroups = re->NumberOfCapturingGroups();
5253
YQL_CLOG(DEBUG, ProviderS3)

ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,23 @@ TString RegexFromWildcards(const std::string_view& pattern) {
4747
const auto& escaped = EscapeRegex(pattern);
4848
TStringBuilder result;
4949
bool slash = false;
50-
bool group = false;
50+
size_t groupCount = 0;
5151

5252
for (const char& c : escaped) {
5353
switch (c) {
5454
case '{':
5555
result << "(?:";
56-
group = true;
56+
groupCount++;
5757
slash = false;
5858
break;
5959
case '}':
60+
Y_ENSURE(groupCount, "Unexpected group end");
6061
result << ')';
61-
group = false;
62+
groupCount--;
6263
slash = false;
6364
break;
6465
case ',':
65-
if (group)
66+
if (groupCount)
6667
result << '|';
6768
else
6869
result << "\\,";
@@ -89,7 +90,29 @@ TString RegexFromWildcards(const std::string_view& pattern) {
8990
break;
9091
}
9192
}
93+
Y_ENSURE(!groupCount, "Found " << groupCount << " unterminated groups");
94+
Y_ENSURE(!slash, "Expected symbol after slash");
9295
return result;
9396
}
9497

98+
bool ValidateWildcards(const std::string_view& pattern, TString& errorString) {
99+
size_t groupCount = 0;
100+
for (size_t i = 0; i < pattern.size(); ++i) {
101+
if (pattern[i] == '{') {
102+
groupCount++;
103+
} else if (pattern[i] == '}') {
104+
if (!groupCount) {
105+
errorString = TStringBuilder() << "found unexpected group end at position " << i;
106+
return false;
107+
}
108+
groupCount--;
109+
}
110+
}
111+
if (groupCount) {
112+
errorString = TStringBuilder() << "found " << groupCount << " unterminated groups";
113+
return false;
114+
}
115+
return true;
116+
}
117+
95118
}

ydb/library/yql/providers/s3/object_listers/yql_s3_path.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ TString EscapeRegex(const TString& str);
1919
TString EscapeRegex(const std::string_view& str);
2020

2121
TString RegexFromWildcards(const std::string_view& pattern);
22+
bool ValidateWildcards(const std::string_view& pattern, TString& errorString);
2223

2324
}

ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,28 @@ Y_UNIT_TEST_SUITE(TPathTests) {
2828
UNIT_ASSERT_VALUES_EQUAL(NormalizePath("/a/b/c/"), "a/b/c/");
2929
UNIT_ASSERT_VALUES_EQUAL(NormalizePath("///a/b/c///"), "a/b/c/");
3030
}
31+
32+
void TestRegexFromWildcardsSuccess(const TString& wildcards, const TString& expectedRegex) {
33+
TString errorString;
34+
UNIT_ASSERT_C(ValidateWildcards(wildcards, errorString), errorString);
35+
UNIT_ASSERT_VALUES_EQUAL(RegexFromWildcards(wildcards), expectedRegex);
36+
}
37+
38+
void TestRegexFromWildcardsFail(const TString& wildcards, const TString& expectedException, const TString& expectedError) {
39+
TString errorString;
40+
UNIT_ASSERT(!ValidateWildcards(wildcards, errorString));
41+
UNIT_ASSERT_STRING_CONTAINS(errorString, expectedError);
42+
UNIT_ASSERT_EXCEPTION_CONTAINS(RegexFromWildcards(wildcards), yexception, expectedException);
43+
}
44+
45+
Y_UNIT_TEST(TestRegexFromWildcards) {
46+
TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text");
47+
TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str");
48+
TestRegexFromWildcardsSuccess("many_{{alt1,alt2},{alt3,{alt4}},alt5,{}}_alts", "many_(?:(?:alt1|alt2)|(?:alt3|(?:alt4))|alt5|(?:))_alts");
49+
50+
TestRegexFromWildcardsFail("hello{}}world", "Unexpected group end", "found unexpected group end at position 7");
51+
TestRegexFromWildcardsFail("hello{{{}world", "Found 2 unterminated groups", "found 2 unterminated groups");
52+
}
3153
}
3254

3355
}

ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,10 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase {
732732
if (!FindFilePattern(settings, ctx, filePattern)) {
733733
return false;
734734
}
735+
if (TString errorString; !NS3::ValidateWildcards(filePattern, errorString)) {
736+
ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "File pattern '" << filePattern << "' contains invalid wildcard: " << errorString));
737+
return false;
738+
}
735739
const TString effectiveFilePattern = filePattern ? filePattern : "*";
736740

737741
TVector<TString> paths;
@@ -763,6 +767,11 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase {
763767
}
764768

765769
for (const auto& path : paths) {
770+
if (TString errorString; !NS3::ValidateWildcards(path, errorString)) {
771+
ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "Path '" << path << "' contains invalid wildcard: " << errorString));
772+
return false;
773+
}
774+
766775
// each path in CONCAT() can generate multiple list requests for explicit partitioning
767776
TVector<TListRequest> reqs;
768777

0 commit comments

Comments
 (0)