@@ -64,12 +64,16 @@ public sealed class Arguments : TransformInputBase
64
64
public const string LoaderSignature = "CharToken" ;
65
65
public const string UserName = "Character Tokenizer Transform" ;
66
66
67
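+ // Tracks whether the model originates from ver:0x00010001, whose vector inputs used <ETX><STX> rather than the UnitSeparator <US>; the flag itself is persisted from ver:0x00010002 onward.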
68
+ private readonly bool _isSeparatorStartEnd ;
69
+
67
70
// Builds the VersionInfo for this transform's serialized model: the "CHARTOKN" signature,
// the version currently written/readable, the oldest version that can still be read back,
// and the loader signature.
private static VersionInfo GetVersionInfo ( )
68
71
{
69
72
return new VersionInfo (
70
73
modelSignature : "CHARTOKN" ,
71
- verWrittenCur : 0x00010001 , // Initial
72
- verReadableCur : 0x00010001 ,
74
+ //verWrittenCur: 0x00010001, // Initial
75
+ verWrittenCur : 0x00010002 , // Updated to use UnitSeparator <US> character instead of using <ETX><STX> for vector inputs.
76
+ verReadableCur : 0x00010002 ,
73
77
verWeCanReadBack : 0x00010001 , // models written with the initial version remain loadable
74
78
loaderSignature : LoaderSignature ) ;
75
79
}
@@ -84,6 +88,7 @@ private static VersionInfo GetVersionInfo()
84
88
private volatile string _keyValuesStr ;
85
89
private volatile int [ ] _keyValuesBoundaries ;
86
90
91
+ private const ushort UnitSeparator = 0x1f ;
87
92
private const ushort TextStartMarker = 0x02 ;
88
93
private const ushort TextEndMarker = 0x03 ;
89
94
private const int TextMarkersCount = 2 ;
@@ -120,6 +125,8 @@ private CharTokenizeTransform(IHost host, ModelLoadContext ctx, IDataView input)
120
125
// byte: _useMarkerChars value.
121
126
_useMarkerChars = ctx . Reader . ReadBoolByte ( ) ;
122
127
128
+ _isSeparatorStartEnd = ctx . Header . ModelVerReadable < 0x00010002 || ctx . Reader . ReadBoolByte ( ) ;
129
+
123
130
_type = GetOutputColumnType ( ) ;
124
131
SetMetadata ( ) ;
125
132
}
@@ -145,6 +152,7 @@ public override void Save(ModelSaveContext ctx)
145
152
// byte: _useMarkerChars value.
146
153
SaveBase ( ctx ) ;
147
154
ctx . Writer . WriteBoolByte ( _useMarkerChars ) ;
155
+ ctx . Writer . WriteBoolByte ( _isSeparatorStartEnd ) ;
148
156
}
149
157
150
158
protected override ColumnType GetColumnTypeCore ( int iinfo )
@@ -399,8 +407,8 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)
399
407
400
408
var getSrc = GetSrcGetter < VBuffer < DvText > > ( input , iinfo ) ;
401
409
var src = default ( VBuffer < DvText > ) ;
402
- return
403
- ( ref VBuffer < ushort > dst ) =>
410
+
411
+ ValueGetter < VBuffer < ushort > > getterWithStartEndSep = ( ref VBuffer < ushort > dst ) =>
404
412
{
405
413
getSrc ( ref src ) ;
406
414
@@ -438,6 +446,67 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)
438
446
439
447
dst = new VBuffer < ushort > ( len , values , dst . Indices ) ;
440
448
} ;
449
+
450
// Getter for vector inputs: flattens every non-empty text slot of the source vector into a single
// ushort sequence. UnitSeparator is written before each non-empty slot whose index is greater than
// zero, and when _useMarkerChars is set the whole sequence is wrapped in TextStartMarker/TextEndMarker.
+ ValueGetter < VBuffer < ushort > > getterWithUnitSep = ( ref VBuffer < ushort > dst ) =>
451
+ {
452
+ getSrc ( ref src ) ;
453
+
454
+ int len = 0 ; // first pass: total number of output slots
455
+
456
+ for ( int i = 0 ; i < src . Count ; i ++ )
457
+ {
458
+ if ( src . Values [ i ] . HasChars )
459
+ {
460
+ len += src . Values [ i ] . Length ;
461
+
462
+ if ( i > 0 )
463
+ len += 1 ; // reserve one slot for the UnitSeparator written before this piece of text
464
+ }
465
+ }
466
+
467
+ if ( _useMarkerChars )
468
+ len += TextMarkersCount ; // one slot each for the start and end markers
469
+
470
+ var values = dst . Values ; // start from dst's existing buffer; replaced below when Utils.Size reports it smaller than len
471
+ if ( len > 0 )
472
+ {
473
+ if ( Utils . Size ( values ) < len )
474
+ values = new ushort [ len ] ;
475
+
476
+ int index = 0 ; // second pass: next write position in values
477
+
478
+ // VBuffer<DvText> can be the result of either concatenating text columns together
479
+ // or applying the word tokenizer before the char tokenizer in TextTransform.
480
+ //
481
+ // The VBuffer<DvText> is treated as a single text stream.
482
+ // Therefore, the start and end markers are written only once, i.e. at the start and at the end of the vector.
483
+ // UnitSeparator is written before every non-empty piece of text whose index is greater than zero; empty pieces are skipped entirely.
484
+ if ( _useMarkerChars )
485
+ values [ index ++ ] = TextStartMarker ;
486
+
487
+ for ( int i = 0 ; i < src . Count ; i ++ )
488
+ {
489
+ if ( ! src . Values [ i ] . HasChars )
490
+ continue ;
491
+
492
+ if ( i > 0 ) // same condition as in the length pass, so index and len stay in agreement
493
+ values [ index ++ ] = UnitSeparator ;
494
+
495
+ for ( int ich = 0 ; ich < src . Values [ i ] . Length ; ich ++ )
496
+ {
497
+ values [ index ++ ] = src . Values [ i ] [ ich ] ;
498
+ }
499
+ }
500
+
501
+ if ( _useMarkerChars )
502
+ values [ index ++ ] = TextEndMarker ;
503
+
504
+ Contracts . Assert ( index == len ) ; // every slot counted in the first pass must have been written
505
+ }
506
+
507
+ dst = new VBuffer < ushort > ( len , values , dst . Indices ) ;
508
+ } ;
509
+ return _isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep ;
441
510
}
442
511
}
443
512
}
0 commit comments