Skip to content

Commit 42b4fda

Browse files
authored
new converters have been added (#5147)
1 parent a3bfa1d commit 42b4fda

File tree

2 files changed

+515
-172
lines changed

2 files changed

+515
-172
lines changed

ydb/library/yql/providers/s3/actors/yql_arrow_column_converters.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,37 @@ std::shared_ptr<arrow::Array> ArrowTypeAsYqlTimestamp(const std::shared_ptr<arro
182182
return builder.Build(true).make_array();
183183
}
184184

185+
template <bool isOptional, typename TArrowType>
186+
std::shared_ptr<arrow::Array> ArrowTypeAsYqlString(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, ui64 multiplier, const TString& format = {}) {
187+
::NYql::NUdf::TStringArrayBuilder<arrow::BinaryType, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length());
188+
::NYql::NUdf::TFixedSizeBlockReader<TArrowType, isOptional> reader;
189+
for (i64 i = 0; i < value->length(); ++i) {
190+
const NUdf::TBlockItem item = reader.GetItem(*value->data(), i);
191+
if constexpr (isOptional) {
192+
if (!item) {
193+
builder.Add(item);
194+
continue;
195+
}
196+
} else if (!item) {
197+
throw parquet::ParquetException(TStringBuilder() << "null value for timestamp could not be represented in non-optional type");
198+
}
199+
200+
const TArrowType baseValue = item.As<TArrowType>();
201+
if (baseValue < 0 && baseValue > static_cast<i64>(::NYql::NUdf::MAX_TIMESTAMP)) {
202+
throw parquet::ParquetException(TStringBuilder() << "timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << "]: " << baseValue);
203+
}
204+
205+
if (static_cast<ui64>(baseValue) > ::NYql::NUdf::MAX_TIMESTAMP / multiplier) {
206+
throw parquet::ParquetException(TStringBuilder() << "timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << "] after transformation: " << baseValue);
207+
}
208+
209+
const ui64 v = baseValue * multiplier;
210+
TString result = format ? TInstant::FromValue(v).FormatGmTime(format.c_str()) : TInstant::FromValue(v).ToString();
211+
builder.Add(NUdf::TBlockItem(NUdf::TStringRef(result.c_str(), result.Size())));
212+
}
213+
return builder.Build(true).make_array();
214+
}
215+
185216
template <bool isOptional>
186217
std::shared_ptr<arrow::Array> ArrowStringAsYqlTimestamp(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) {
187218
::NYql::NUdf::TFixedSizeArrayBuilder<ui64, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length());
@@ -375,6 +406,30 @@ TColumnConverter ArrowTimestampAsYqlTimestamp(const std::shared_ptr<arrow::DataT
375406
};
376407
}
377408

409+
TColumnConverter ArrowTimestampAsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
410+
return [targetType, isOptional, multiplier=GetMultiplierForTimestamp(timeUnit)](const std::shared_ptr<arrow::Array>& value) {
411+
return isOptional
412+
? ArrowTypeAsYqlString<true, i64>(targetType, value, multiplier)
413+
: ArrowTypeAsYqlString<false, i64>(targetType, value, multiplier);
414+
};
415+
}
416+
417+
TColumnConverter ArrowDate64AsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
418+
return [targetType, isOptional, multiplier=GetMultiplierForDatetime(dateUnit)](const std::shared_ptr<arrow::Array>& value) {
419+
return isOptional
420+
? ArrowTypeAsYqlString<true, i64>(targetType, value, multiplier, "%Y-%m-%d")
421+
: ArrowTypeAsYqlString<false, i64>(targetType, value, multiplier, "%Y-%m-%d");
422+
};
423+
}
424+
425+
TColumnConverter ArrowDate32AsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
426+
return [targetType, isOptional, multiplier=GetMultiplierForTimestamp(dateUnit)](const std::shared_ptr<arrow::Array>& value) {
427+
return isOptional
428+
? ArrowTypeAsYqlString<true, i32>(targetType, value, multiplier, "%Y-%m-%d")
429+
: ArrowTypeAsYqlString<false, i32>(targetType, value, multiplier, "%Y-%m-%d");
430+
};
431+
}
432+
378433
TColumnConverter ArrowDate32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit unit) {
379434
if (unit == arrow::DateUnit::MILLI) {
380435
throw parquet::ParquetException(TStringBuilder() << "millisecond accuracy does not fit into the date");
@@ -459,6 +514,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
459514
return ArrowDate32AsYqlDatetime(targetType, isOptional, dateType.unit());
460515
case NUdf::EDataSlot::Timestamp:
461516
return ArrowDate32AsYqlTimestamp(targetType, isOptional, dateType.unit());
517+
case NUdf::EDataSlot::String:
518+
case NUdf::EDataSlot::Utf8:
519+
return ArrowDate32AsYqlString(targetType, isOptional, dateType.unit());
462520
default:
463521
return {};
464522
}
@@ -471,6 +529,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
471529
return ArrowDate64AsYqlDatetime(targetType, isOptional, dateType.unit());
472530
case NUdf::EDataSlot::Timestamp:
473531
return ArrowDate64AsYqlTimestamp(targetType, isOptional, dateType.unit());
532+
case NUdf::EDataSlot::String:
533+
case NUdf::EDataSlot::Utf8:
534+
return ArrowDate64AsYqlString(targetType, isOptional, dateType.unit());
474535
default:
475536
return {};
476537
}
@@ -482,10 +543,14 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
482543
return ArrowTimestampAsYqlDatetime(targetType, isOptional, timestampType.unit());
483544
case NUdf::EDataSlot::Timestamp:
484545
return ArrowTimestampAsYqlTimestamp(targetType, isOptional, timestampType.unit());
546+
case NUdf::EDataSlot::String:
547+
case NUdf::EDataSlot::Utf8:
548+
return ArrowTimestampAsYqlString(targetType, isOptional, timestampType.unit());
485549
default:
486550
return {};
487551
}
488552
}
553+
case arrow::Type::STRING:
489554
case arrow::Type::BINARY: {
490555
switch (slotItem) {
491556
case NUdf::EDataSlot::Datetime:

0 commit comments

Comments
 (0)