-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Fixed the TextTransform bug where chargrams were being computed differently when used with/without the word tokenizer. #548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
868a72f
f482eea
64390ee
2201433
d1dd161
f458cdb
90cdf2a
3ed6e87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,12 +64,16 @@ public sealed class Arguments : TransformInputBase | |
public const string LoaderSignature = "CharToken"; | ||
public const string UserName = "Character Tokenizer Transform"; | ||
|
||
// Keep track of the model that was saved with ver:0x00010001 | ||
private readonly bool _isSeparatorStartEnd; | ||
|
||
private static VersionInfo GetVersionInfo() | ||
{ | ||
return new VersionInfo( | ||
modelSignature: "CHARTOKN", | ||
verWrittenCur: 0x00010001, // Initial | ||
verReadableCur: 0x00010001, | ||
//verWrittenCur: 0x00010001, // Initial | ||
verWrittenCur: 0x00010002, // Updated to use UnitSeparator <US> character instead of using <ETX><STX> for vector inputs. | ||
verReadableCur: 0x00010002, | ||
verWeCanReadBack: 0x00010001, | ||
loaderSignature: LoaderSignature); | ||
} | ||
|
@@ -84,6 +88,7 @@ private static VersionInfo GetVersionInfo() | |
private volatile string _keyValuesStr; | ||
private volatile int[] _keyValuesBoundaries; | ||
|
||
private const ushort UnitSeparator = 0x1f; | ||
private const ushort TextStartMarker = 0x02; | ||
private const ushort TextEndMarker = 0x03; | ||
private const int TextMarkersCount = 2; | ||
|
@@ -102,6 +107,7 @@ public CharTokenizeTransform(IHostEnvironment env, Arguments args, IDataView inp | |
|
||
_type = GetOutputColumnType(); | ||
SetMetadata(); | ||
_isSeparatorStartEnd = false; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FYI there is no need for this. The default value of bool is false. #Resolved |
||
} | ||
|
||
private static ColumnType GetOutputColumnType() | ||
|
@@ -120,6 +126,8 @@ private CharTokenizeTransform(IHost host, ModelLoadContext ctx, IDataView input) | |
// byte: _useMarkerChars value. | ||
_useMarkerChars = ctx.Reader.ReadBoolByte(); | ||
|
||
_isSeparatorStartEnd = ctx.Header.ModelVerReadable < 0x00010002 || ctx.Reader.ReadBoolByte(); | ||
|
||
_type = GetOutputColumnType(); | ||
SetMetadata(); | ||
} | ||
|
@@ -145,6 +153,7 @@ public override void Save(ModelSaveContext ctx) | |
// byte: _useMarkerChars value. | ||
SaveBase(ctx); | ||
ctx.Writer.WriteBoolByte(_useMarkerChars); | ||
ctx.Writer.WriteBoolByte(_isSeparatorStartEnd); | ||
} | ||
|
||
protected override ColumnType GetColumnTypeCore(int iinfo) | ||
|
@@ -399,8 +408,8 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo) | |
|
||
var getSrc = GetSrcGetter<VBuffer<DvText>>(input, iinfo); | ||
var src = default(VBuffer<DvText>); | ||
return | ||
(ref VBuffer<ushort> dst) => | ||
|
||
ValueGetter<VBuffer<ushort>> getterWithStartEndSep = (ref VBuffer<ushort> dst) => | ||
{ | ||
getSrc(ref src); | ||
|
||
|
@@ -438,6 +447,67 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo) | |
|
||
dst = new VBuffer<ushort>(len, values, dst.Indices); | ||
}; | ||
|
||
ValueGetter < VBuffer<ushort> > getterWithUnitSep = (ref VBuffer<ushort> dst) => | ||
{ | ||
getSrc(ref src); | ||
|
||
int len = 0; | ||
|
||
for (int i = 0; i < src.Count; i++) | ||
{ | ||
if (src.Values[i].HasChars) | ||
{ | ||
len += src.Values[i].Length; | ||
|
||
if (i > 0) | ||
len += 1; // add UnitSeparator character to len that will be added | ||
} | ||
} | ||
|
||
if (_useMarkerChars) | ||
len += TextMarkersCount; | ||
|
||
var values = dst.Values; | ||
if (len > 0) | ||
{ | ||
if (Utils.Size(values) < len) | ||
values = new ushort[len]; | ||
|
||
int index = 0; | ||
|
||
// VBuffer<DvText> can be a result of either concatenating text columns together | ||
// or application of word tokenizer before char tokenizer in TextTransform. | ||
// | ||
// Considering VBuffer<DvText> as a single text stream. | ||
// Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector. | ||
// Insert UnitSeparator after every piece of text in the vector. | ||
if (_useMarkerChars) | ||
values[index++] = TextStartMarker; | ||
|
||
for (int i = 0; i < src.Count; i++) | ||
{ | ||
if (!src.Values[i].HasChars) | ||
continue; | ||
|
||
if (i > 0) | ||
values[index++] = UnitSeparator; | ||
|
||
for (int ich = 0; ich < src.Values[i].Length; ich++) | ||
{ | ||
values[index++] = src.Values[i][ich]; | ||
} | ||
} | ||
|
||
if (_useMarkerChars) | ||
values[index++] = TextEndMarker; | ||
|
||
Contracts.Assert(index == len); | ||
} | ||
|
||
dst = new VBuffer<ushort>(len, values, dst.Indices); | ||
}; | ||
return _isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -262,6 +262,30 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new ConcatTransform(h, new ConcatTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @justinormont, I moved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know if there are adverse effects of running If you make me a build, I'll run a manual regression test. Unfortunately we don't currently have running nightly regression tests to tell us if the text datasets decreased in their core metrics. #Resolved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't like this change. It's far from clear to me that if someone asks to remove stopwords that they meant for the content of stopwords to be retained in the chargrams. At the very least this ought to be a configurable option. In reply to: 204000786 [](ancestors = 204000786) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @TomFinley, Can you elaborate more on this? Your point is not clear to me. Don't you think it's good to have In reply to: 204201054 [](ancestors = 204201054,204000786) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I thought you were saying something else; for some reason I got confused. #Resolved |
||
{ | ||
var xfCols = new TextNormalizerCol[textCols.Length]; | ||
string[] dstCols = new string[textCols.Length]; | ||
for (int i = 0; i < textCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, textCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = textCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsWordTokenizationTransform) | ||
{ | ||
var xfCols = new DelimitedTokenizeTransform.Column[textCols.Length]; | ||
|
@@ -281,34 +305,6 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new DelimitedTokenizeTransform(h, new DelimitedTokenizeTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
{ | ||
string[] srcCols = wordTokCols == null ? textCols : wordTokCols; | ||
var xfCols = new TextNormalizerCol[srcCols.Length]; | ||
string[] dstCols = new string[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, srcCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = srcCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
if (wordTokCols != null) | ||
wordTokCols = dstCols; | ||
else | ||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsRemoveStopwordsTransform) | ||
{ | ||
Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text."); | ||
|
@@ -360,7 +356,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
if (tparams.CharExtractorFactory != null) | ||
{ | ||
{ | ||
var srcCols = wordTokCols ?? textCols; | ||
var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols; | ||
charTokCols = new string[srcCols.Length]; | ||
var xfCols = new CharTokenizeTransform.Column[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See other examples. You will see that we keep commented out the version strings from old versions, to track them and so we know why we bumped the version each time. This is important: we have live code in our deserializers that is meant to handle those version bumps, and for that reason we like to have documentation on what exactly changed, so that when we have tests on this or that version in our deserializers, we know why they changed. #Resolved