Skip to content

Commit ef32b4a

Browse files
committed
Merge branch 'master' of https://github.com/dotnet/machinelearning into dvtext
2 parents c0c0963 + 2f4e50d commit ef32b4a

File tree

8 files changed

+82
-205
lines changed

8 files changed

+82
-205
lines changed

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,23 @@ public sealed partial class TextLoader : IDataLoader
4040
/// </example>
4141
public sealed class Column
4242
{
43+
/// <summary>
/// Parameterless constructor with an empty body. It allows object-initializer
/// construction (<c>new TextLoader.Column() { Name = ..., Source = ... }</c>) to
/// coexist with the parameterized overloads declared on this class.
/// </summary>
public Column() { }
44+
45+
/// <summary>
/// Convenience overload for a column read from a single source position.
/// Delegates to the <c>Range[]</c> overload, passing a one-element array that
/// holds <c>new Range(index)</c>.
/// </summary>
/// <param name="name">Value forwarded as the column name.</param>
/// <param name="type">Value forwarded as the column type; may be null.</param>
/// <param name="index">Source index wrapped in a single <c>Range</c>.</param>
public Column(string name, DataKind? type, int index)
46+
: this(name, type, new[] { new Range(index) }) { }
47+
48+
/// <summary>
/// Creates a column from a name, an optional type, the source ranges and an
/// optional key range, storing each argument in the field of the same name.
/// </summary>
/// <param name="name">Stored in <c>Name</c>; validated with <c>Contracts.CheckValue</c>.</param>
/// <param name="type">Stored in <c>Type</c>; nullable and not validated here.</param>
/// <param name="source">Stored in <c>Source</c>; validated with <c>Contracts.CheckValue</c>.</param>
/// <param name="keyRange">Stored in <c>KeyRange</c>; defaults to null.</param>
public Column(string name, DataKind? type, Range[] source, KeyRange keyRange = null)
49+
{
50+
// Presumably CheckValue rejects null arguments - confirm against the Contracts helper.
Contracts.CheckValue(name, nameof(name));
51+
Contracts.CheckValue(source, nameof(source));
52+
// keyRange is optional (its default is null), hence the "OrNull" variant of the check.
Contracts.CheckValueOrNull(keyRange);
53+
54+
Name = name;
55+
Type = type;
56+
Source = source;
57+
KeyRange = keyRange;
58+
}
59+
4360
[Argument(ArgumentType.AtMostOnce, HelpText = "Name of the column")]
4461
public string Name;
4562

@@ -179,6 +196,20 @@ public bool IsValid()
179196

180197
public sealed class Range
181198
{
199+
/// <summary>
/// Parameterless constructor with an empty body. It allows object-initializer
/// construction, e.g. <c>new TextLoader.Range() { Min = 0, VariableEnd = true }</c>,
/// to coexist with the parameterized overloads declared on this class.
/// </summary>
public Range() { }
200+
201+
/// <summary>
/// Creates a range covering exactly one index by delegating to
/// <c>Range(int min, int max)</c> with <paramref name="index"/> as both bounds.
/// </summary>
/// <param name="index">Used as both the minimum and the maximum of the range.</param>
public Range(int index)
202+
: this(index, index) { }
203+
204+
/// <summary>
/// Creates a range from <paramref name="min"/> to <paramref name="max"/> and
/// stores the bounds in <c>Min</c> and <c>Max</c>.
/// </summary>
/// <param name="min">Lower bound; checked to be non-negative.</param>
/// <param name="max">Upper bound; checked to be greater than or equal to <paramref name="min"/>.</param>
public Range(int min, int max)
205+
{
206+
// Both bounds are validated before any field is assigned.
Contracts.CheckParam(min >= 0, nameof(min), "min must be non-negative.");
207+
Contracts.CheckParam(max >= min, nameof(max), "max must be greater than or equal to min.");
208+
209+
Min = min;
210+
Max = max;
211+
}
212+
182213
[Argument(ArgumentType.Required, HelpText = "First index in the range")]
183214
public int Min;
184215

src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -491,13 +491,13 @@ private TextLoader.Column GetColumn(string name, ColumnType type, int? start)
491491
{
492492
var key = type.ItemType.AsKey;
493493
if (!key.Contiguous)
494-
keyRange = new KeyRange() { Min = key.Min, Contiguous = false };
494+
keyRange = new KeyRange(key.Min, contiguous: false);
495495
else if (key.Count == 0)
496-
keyRange = new KeyRange() { Min = key.Min };
496+
keyRange = new KeyRange(key.Min);
497497
else
498498
{
499499
Contracts.Assert(key.Count >= 1);
500-
keyRange = new KeyRange() { Min = key.Min, Max = key.Min + (ulong)(key.Count - 1) };
500+
keyRange = new KeyRange(key.Min, key.Min + (ulong)(key.Count - 1));
501501
}
502502
kind = key.RawKind;
503503
}

src/Microsoft.ML.Data/Transforms/TermTransform.cs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -352,15 +352,7 @@ private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, Argu
352352
new TextLoader.Arguments()
353353
{
354354
Separator = "tab",
355-
Column = new[]
356-
{
357-
new TextLoader.Column()
358-
{
359-
Name ="Term",
360-
Type = DataKind.TX,
361-
Source = new[] { new TextLoader.Range() { Min = 0 } }
362-
}
363-
}
355+
Column = new[] { new TextLoader.Column("Term", DataKind.TX, 0) }
364356
},
365357
fileSource);
366358
src = "Term";

src/Microsoft.ML.Data/Utilities/TypeParsingUtils.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@ public static KeyType ConstructKeyType(DataKind? type, KeyRange range)
8585
/// </summary>
8686
public sealed class KeyRange
8787
{
88+
/// <summary>
/// Parameterless constructor with an empty body. It allows object-initializer
/// construction (<c>new KeyRange() { Min = ... }</c>) to coexist with the
/// parameterized overload declared on this class.
/// </summary>
public KeyRange() { }
89+
90+
/// <summary>
/// Creates a key range by storing the three arguments in <c>Min</c>,
/// <c>Max</c> and <c>Contiguous</c>.
/// </summary>
/// <param name="min">Stored in <c>Min</c>.</param>
/// <param name="max">Stored in <c>Max</c>; defaults to null.</param>
/// <param name="contiguous">Stored in <c>Contiguous</c>; defaults to true.</param>
public KeyRange(ulong min, ulong? max = null, bool contiguous = true)
91+
{
92+
// NOTE(review): no validation happens here; in particular max is not checked against min.
Min = min;
93+
Max = max;
94+
Contiguous = contiguous;
95+
}
96+
8897
[Argument(ArgumentType.AtMostOnce, HelpText = "First index in the range")]
8998
public ulong Min;
9099

test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs

Lines changed: 16 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -43,19 +43,9 @@ private IDataView GetBreastCancerDataView()
4343
{
4444
Column = new[]
4545
{
46-
new TextLoader.Column()
47-
{
48-
Name = "Label",
49-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} },
50-
Type = Runtime.Data.DataKind.R4
51-
},
52-
53-
new TextLoader.Column()
54-
{
55-
Name = "Features",
56-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 9} },
57-
Type = Runtime.Data.DataKind.R4
58-
}
46+
new TextLoader.Column("Label", DataKind.R4, 0),
47+
new TextLoader.Column("Features", DataKind.R4,
48+
new [] { new TextLoader.Range(1, 9) })
5949
}
6050
},
6151

@@ -74,31 +64,10 @@ private IDataView GetBreastCancerDataviewWithTextColumns()
7464
HasHeader = true,
7565
Column = new[]
7666
{
77-
new TextLoader.Column()
78-
{
79-
Name = "Label",
80-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }
81-
},
82-
83-
new TextLoader.Column()
84-
{
85-
Name = "F1",
86-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} },
87-
Type = Runtime.Data.DataKind.Text
88-
},
89-
90-
new TextLoader.Column()
91-
{
92-
Name = "F2",
93-
Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} },
94-
Type = Runtime.Data.DataKind.I4
95-
},
96-
97-
new TextLoader.Column()
98-
{
99-
Name = "Rest",
100-
Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} }
101-
}
67+
new TextLoader.Column("Label", type: null, 0),
68+
new TextLoader.Column("F1", DataKind.Text, 1),
69+
new TextLoader.Column("F2", DataKind.I4, 2),
70+
new TextLoader.Column("Rest", type: null, new [] { new TextLoader.Range(3, 9) })
10271
}
10372
},
10473

@@ -998,19 +967,8 @@ public void EntryPointPipelineEnsembleText()
998967
HasHeader = true,
999968
Column = new[]
1000969
{
1001-
new TextLoader.Column()
1002-
{
1003-
Name = "Label",
1004-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} },
1005-
Type = Runtime.Data.DataKind.TX
1006-
},
1007-
1008-
new TextLoader.Column()
1009-
{
1010-
Name = "Text",
1011-
Source = new [] { new TextLoader.Range() { Min = 3, Max = 3} },
1012-
Type = Runtime.Data.DataKind.TX
1013-
}
970+
new TextLoader.Column("Label", DataKind.TX, 0),
971+
new TextLoader.Column("Text", DataKind.TX, 3)
1014972
}
1015973
},
1016974

@@ -1222,19 +1180,8 @@ public void EntryPointMulticlassPipelineEnsemble()
12221180
{
12231181
Column = new[]
12241182
{
1225-
new TextLoader.Column()
1226-
{
1227-
Name = "Label",
1228-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} },
1229-
Type = Runtime.Data.DataKind.R4
1230-
},
1231-
1232-
new TextLoader.Column()
1233-
{
1234-
Name = "Features",
1235-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 4} },
1236-
Type = Runtime.Data.DataKind.R4
1237-
}
1183+
new TextLoader.Column("Label", DataKind.R4, 0),
1184+
new TextLoader.Column("Features", DataKind.R4, new [] { new TextLoader.Range(1, 4) })
12381185
}
12391186
},
12401187

@@ -3474,18 +3421,8 @@ public void EntryPointLinearPredictorSummary()
34743421
HasHeader = true,
34753422
Column = new[]
34763423
{
3477-
new TextLoader.Column()
3478-
{
3479-
Name = "Label",
3480-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} },
3481-
},
3482-
3483-
new TextLoader.Column()
3484-
{
3485-
Name = "Features",
3486-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 9} },
3487-
Type = Runtime.Data.DataKind.Num
3488-
}
3424+
new TextLoader.Column("Label", type: null, 0),
3425+
new TextLoader.Column("Features", DataKind.Num, new [] { new TextLoader.Range(1, 9) })
34893426
}
34903427
},
34913428

@@ -3561,12 +3498,7 @@ public void EntryPointPcaPredictorSummary()
35613498
HasHeader = false,
35623499
Column = new[]
35633500
{
3564-
new TextLoader.Column()
3565-
{
3566-
Name = "Features",
3567-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 784} },
3568-
Type = Runtime.Data.DataKind.R4
3569-
}
3501+
new TextLoader.Column("Features", DataKind.R4, new [] { new TextLoader.Range(1, 784) })
35703502
}
35713503
},
35723504

@@ -3774,12 +3706,8 @@ public void EntryPointWordEmbeddings()
37743706
SeparatorChars = new []{' '},
37753707
Column = new[]
37763708
{
3777-
new TextLoader.Column()
3778-
{
3779-
Name = "Text",
3780-
Source = new [] { new TextLoader.Range() { Min = 0, VariableEnd=true, ForceVector=true} },
3781-
Type = DataKind.Text
3782-
}
3709+
new TextLoader.Column("Text", DataKind.Text,
3710+
new [] { new TextLoader.Range() { Min = 0, VariableEnd=true, ForceVector=true} })
37833711
}
37843712
},
37853713
InputFile = inputFile,

test/Microsoft.ML.Tests/Scenarios/Api/SimpleTrainAndPredict.cs

Lines changed: 11 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -82,44 +82,18 @@ private static TextTransform.Arguments MakeSentimentTextTransformArgs(bool norma
8282

8383
private static TextLoader.Arguments MakeIrisTextLoaderArgs()
8484
{
85-
8685
return new TextLoader.Arguments()
8786
{
8887
Separator = "comma",
8988
HasHeader = true,
9089
Column = new[]
91-
{
92-
new TextLoader.Column()
93-
{
94-
Name = "SepalLength",
95-
Source = new [] { new TextLoader.Range() { Min=0, Max=0} },
96-
Type = DataKind.R4
97-
},
98-
new TextLoader.Column()
99-
{
100-
Name = "SepalWidth",
101-
Source = new [] { new TextLoader.Range() { Min=1, Max=1} },
102-
Type = DataKind.R4
103-
},
104-
new TextLoader.Column()
105-
{
106-
Name = "PetalLength",
107-
Source = new [] { new TextLoader.Range() { Min=2, Max=2} },
108-
Type = DataKind.R4
109-
},
110-
new TextLoader.Column()
111-
{
112-
Name = "PetalWidth",
113-
Source = new [] { new TextLoader.Range() { Min=3, Max=3} },
114-
Type = DataKind.R4
115-
},
116-
new TextLoader.Column()
117-
{
118-
Name = "Label",
119-
Source = new [] { new TextLoader.Range() { Min=4, Max=4} },
120-
Type = DataKind.Text
121-
}
122-
}
90+
{
91+
new TextLoader.Column("SepalLength", DataKind.R4, 0),
92+
new TextLoader.Column("SepalWidth", DataKind.R4, 1),
93+
new TextLoader.Column("PetalLength", DataKind.R4, 2),
94+
new TextLoader.Column("PetalWidth",DataKind.R4, 3),
95+
new TextLoader.Column("Label", DataKind.Text, 4)
96+
}
12397
};
12498
}
12599
private static TextLoader.Arguments MakeSentimentTextLoaderArgs()
@@ -129,21 +103,10 @@ private static TextLoader.Arguments MakeSentimentTextLoaderArgs()
129103
Separator = "tab",
130104
HasHeader = true,
131105
Column = new[]
132-
{
133-
new TextLoader.Column()
134-
{
135-
Name = "Label",
136-
Source = new [] { new TextLoader.Range() { Min=0, Max=0} },
137-
Type = DataKind.BL
138-
},
139-
140-
new TextLoader.Column()
141-
{
142-
Name = "SentimentText",
143-
Source = new [] { new TextLoader.Range() { Min=1, Max=1} },
144-
Type = DataKind.Text
145-
}
146-
}
106+
{
107+
new TextLoader.Column("Label", DataKind.BL, 0),
108+
new TextLoader.Column("SentimentText", DataKind.Text, 1)
109+
}
147110
};
148111
}
149112
}

test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -29,37 +29,13 @@ public void TrainAndPredictIrisModelUsingDirectInstantiationTest()
2929
new TextLoader.Arguments()
3030
{
3131
HasHeader = false,
32-
Column = new[] {
33-
new TextLoader.Column()
34-
{
35-
Name = "Label",
36-
Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} },
37-
Type = DataKind.R4
38-
},
39-
new TextLoader.Column()
40-
{
41-
Name = "SepalLength",
42-
Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} },
43-
Type = DataKind.R4
44-
},
45-
new TextLoader.Column()
46-
{
47-
Name = "SepalWidth",
48-
Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} },
49-
Type = DataKind.R4
50-
},
51-
new TextLoader.Column()
52-
{
53-
Name = "PetalLength",
54-
Source = new [] { new TextLoader.Range() { Min = 3, Max = 3} },
55-
Type = DataKind.R4
56-
},
57-
new TextLoader.Column()
58-
{
59-
Name = "PetalWidth",
60-
Source = new [] { new TextLoader.Range() { Min = 4, Max = 4} },
61-
Type = DataKind.R4
62-
}
32+
Column = new[]
33+
{
34+
new TextLoader.Column("Label", DataKind.R4, 0),
35+
new TextLoader.Column("SepalLength", DataKind.R4, 1),
36+
new TextLoader.Column("SepalWidth", DataKind.R4, 2),
37+
new TextLoader.Column("PetalLength", DataKind.R4, 3),
38+
new TextLoader.Column("PetalWidth", DataKind.R4, 4)
6339
}
6440
}, new MultiFileSource(dataPath));
6541

0 commit comments

Comments
 (0)