@@ -182,6 +182,37 @@ std::shared_ptr<arrow::Array> ArrowTypeAsYqlTimestamp(const std::shared_ptr<arro
182
182
return builder.Build (true ).make_array ();
183
183
}
184
184
185
+ template <bool isOptional, typename TArrowType>
186
+ std::shared_ptr<arrow::Array> ArrowTypeAsYqlString (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, ui64 multiplier, const TString& format = {}) {
187
+ ::NYql::NUdf::TStringArrayBuilder<arrow::BinaryType, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
188
+ ::NYql::NUdf::TFixedSizeBlockReader<TArrowType, isOptional> reader;
189
+ for (i64 i = 0 ; i < value->length (); ++i) {
190
+ const NUdf::TBlockItem item = reader.GetItem (*value->data (), i);
191
+ if constexpr (isOptional) {
192
+ if (!item) {
193
+ builder.Add (item);
194
+ continue ;
195
+ }
196
+ } else if (!item) {
197
+ throw parquet::ParquetException (TStringBuilder () << " null value for timestamp could not be represented in non-optional type" );
198
+ }
199
+
200
+ const TArrowType baseValue = item.As <TArrowType>();
201
+ if (baseValue < 0 && baseValue > static_cast <i64>(::NYql::NUdf::MAX_TIMESTAMP)) {
202
+ throw parquet::ParquetException (TStringBuilder () << " timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << " ]: " << baseValue);
203
+ }
204
+
205
+ if (static_cast <ui64>(baseValue) > ::NYql::NUdf::MAX_TIMESTAMP / multiplier) {
206
+ throw parquet::ParquetException (TStringBuilder () << " timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << " ] after transformation: " << baseValue);
207
+ }
208
+
209
+ const ui64 v = baseValue * multiplier;
210
+ TString result = format ? TInstant::FromValue (v).FormatGmTime (format.c_str ()) : TInstant::FromValue (v).ToString ();
211
+ builder.Add (NUdf::TBlockItem (NUdf::TStringRef (result.c_str (), result.Size ())));
212
+ }
213
+ return builder.Build (true ).make_array ();
214
+ }
215
+
185
216
template <bool isOptional>
186
217
std::shared_ptr<arrow::Array> ArrowStringAsYqlTimestamp (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) {
187
218
::NYql::NUdf::TFixedSizeArrayBuilder<ui64, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
@@ -375,6 +406,30 @@ TColumnConverter ArrowTimestampAsYqlTimestamp(const std::shared_ptr<arrow::DataT
375
406
};
376
407
}
377
408
409
+ TColumnConverter ArrowTimestampAsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
410
+ return [targetType, isOptional, multiplier=GetMultiplierForTimestamp (timeUnit)](const std::shared_ptr<arrow::Array>& value) {
411
+ return isOptional
412
+ ? ArrowTypeAsYqlString<true , i64>(targetType, value, multiplier)
413
+ : ArrowTypeAsYqlString<false , i64>(targetType, value, multiplier);
414
+ };
415
+ }
416
+
417
+ TColumnConverter ArrowDate64AsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
418
+ return [targetType, isOptional, multiplier=GetMultiplierForDatetime (dateUnit)](const std::shared_ptr<arrow::Array>& value) {
419
+ return isOptional
420
+ ? ArrowTypeAsYqlString<true , i64>(targetType, value, multiplier, " %Y-%m-%d" )
421
+ : ArrowTypeAsYqlString<false , i64>(targetType, value, multiplier, " %Y-%m-%d" );
422
+ };
423
+ }
424
+
425
+ TColumnConverter ArrowDate32AsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
426
+ return [targetType, isOptional, multiplier=GetMultiplierForTimestamp (dateUnit)](const std::shared_ptr<arrow::Array>& value) {
427
+ return isOptional
428
+ ? ArrowTypeAsYqlString<true , i32>(targetType, value, multiplier, " %Y-%m-%d" )
429
+ : ArrowTypeAsYqlString<false , i32>(targetType, value, multiplier, " %Y-%m-%d" );
430
+ };
431
+ }
432
+
378
433
TColumnConverter ArrowDate32AsYqlDate (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit unit) {
379
434
if (unit == arrow::DateUnit::MILLI) {
380
435
throw parquet::ParquetException (TStringBuilder () << " millisecond accuracy does not fit into the date" );
@@ -459,6 +514,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
459
514
return ArrowDate32AsYqlDatetime (targetType, isOptional, dateType.unit ());
460
515
case NUdf::EDataSlot::Timestamp:
461
516
return ArrowDate32AsYqlTimestamp (targetType, isOptional, dateType.unit ());
517
+ case NUdf::EDataSlot::String:
518
+ case NUdf::EDataSlot::Utf8:
519
+ return ArrowDate32AsYqlString (targetType, isOptional, dateType.unit ());
462
520
default :
463
521
return {};
464
522
}
@@ -471,6 +529,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
471
529
return ArrowDate64AsYqlDatetime (targetType, isOptional, dateType.unit ());
472
530
case NUdf::EDataSlot::Timestamp:
473
531
return ArrowDate64AsYqlTimestamp (targetType, isOptional, dateType.unit ());
532
+ case NUdf::EDataSlot::String:
533
+ case NUdf::EDataSlot::Utf8:
534
+ return ArrowDate64AsYqlString (targetType, isOptional, dateType.unit ());
474
535
default :
475
536
return {};
476
537
}
@@ -482,10 +543,14 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
482
543
return ArrowTimestampAsYqlDatetime (targetType, isOptional, timestampType.unit ());
483
544
case NUdf::EDataSlot::Timestamp:
484
545
return ArrowTimestampAsYqlTimestamp (targetType, isOptional, timestampType.unit ());
546
+ case NUdf::EDataSlot::String:
547
+ case NUdf::EDataSlot::Utf8:
548
+ return ArrowTimestampAsYqlString (targetType, isOptional, timestampType.unit ());
485
549
default :
486
550
return {};
487
551
}
488
552
}
553
+ case arrow::Type::STRING:
489
554
case arrow::Type::BINARY: {
490
555
switch (slotItem) {
491
556
case NUdf::EDataSlot::Datetime:
0 commit comments