Skip to content

Commit a8f16d4

Browse files
committed
Address comments (restore the old logic and add tests)
1 parent 6e2ecd4 commit a8f16d4

File tree

6 files changed

+192
-14
lines changed

6 files changed

+192
-14
lines changed

src/Microsoft.ML.Data/Dirty/ChooseColumnsByIndexTransform.cs

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,27 @@ public sealed class Arguments
3232

3333
private sealed class Bindings
3434
{
35-
// A collection of source column indexes after removing those we want to drop. Specifically, j=_sources[i] means
36-
// that the i-th output column in the output schema is the j-th column in the input schema.
35+
/// <summary>
36+
/// A collection of source column indexes after removing those we want to drop. Specifically, j=_sources[i] means
37+
/// that the i-th output column in the output schema is the j-th column in the input schema.
38+
/// </summary>
3739
private readonly int[] _sources;
3840

39-
// Input schema of this transform. It's useful when determining column dependencies and other
40-
// relations between input and output schemas.
41+
/// <summary>
42+
/// Input schema of this transform. It's useful when determining column dependencies and other
43+
/// relations between input and output schemas.
44+
/// </summary>
4145
private readonly Schema _sourceSchema;
4246

43-
// Variable used for saving in backward-compatible format. It's not needed actually.
47+
/// <summary>
48+
/// Column indexes in the input schema, as specified by the user. <see cref="_sources"/> is computed from <see cref="_selectedColumnIndexes"/>
49+
/// and <see cref="_drop"/>.
50+
/// </summary>
51+
private readonly int[] _selectedColumnIndexes;
52+
53+
/// <summary>
54+
/// True if this transform drops the columns indexed by <see cref="_selectedColumnIndexes"/>; false if it keeps only those columns.
55+
/// </summary>
4456
private readonly bool _drop;
4557

4658
// This transform's output schema.
@@ -52,20 +64,34 @@ internal Bindings(Arguments args, Schema sourceSchema)
5264
Contracts.AssertValue(sourceSchema);
5365

5466
_sourceSchema = sourceSchema;
67+
68+
// Store user-specified arguments as the major state of this transform. Only the major states will
69+
// be saved and all other attributes can be reconstructed from them.
5570
_drop = args.Drop;
71+
_selectedColumnIndexes = args.Index;
72+
73+
// Compute the attributes actually used at runtime from those major states.
74+
ComputeSources(_drop, _selectedColumnIndexes, _sourceSchema, out _sources);
75+
76+
// All necessary fields in this class are set, so we can compute output schema now.
77+
OutputSchema = ComputeOutputSchema();
78+
}
5679

57-
if (args.Drop)
80+
/// <summary>
81+
/// Shared helper that computes <see cref="_sources"/> from the given parameters. It is called by both constructors.
82+
/// </summary>
83+
private static void ComputeSources(bool drop, int[] selectedColumnIndexes, Schema sourceSchema, out int[] sources)
84+
{
85+
// Compute the mapping (returned through the sources out parameter) from output column index to input column index.
86+
if (drop)
5887
// Drop columns indexed by selectedColumnIndexes
59-
_sources = Enumerable.Range(0, _sourceSchema.ColumnCount).Except(args.Index).ToArray();
88+
sources = Enumerable.Range(0, sourceSchema.ColumnCount).Except(selectedColumnIndexes).ToArray();
6089
else
6190
// Keep columns indexed by selectedColumnIndexes
62-
_sources = args.Index.ToArray();
91+
sources = selectedColumnIndexes;
6392

6493
// Make sure the output of this transform is meaningful.
65-
Contracts.Check(_sources.Length > 0, "Choose columns by index has no output column.");
66-
67-
// All necessary fields in this class are set, so we can compute output schema now.
68-
OutputSchema = ComputeOutputSchema();
94+
Contracts.Check(sources.Length > 0, "Choose columns by index has no output column.");
6995
}
7096

7197
/// <summary>
@@ -97,11 +123,16 @@ internal Bindings(ModelLoadContext ctx, Schema sourceSchema)
97123
Contracts.AssertValue(ctx);
98124
Contracts.AssertValue(sourceSchema);
99125

126+
_sourceSchema = sourceSchema;
127+
100128
// *** Binary format ***
101129
// bool (as byte): operation mode
102130
// int[]: selected source column indices
103131
_drop = ctx.Reader.ReadBoolByte();
104-
_sources = ctx.Reader.ReadIntArray();
132+
_selectedColumnIndexes = ctx.Reader.ReadIntArray();
133+
134+
// Compute the attributes actually used at runtime from those major states.
135+
ComputeSources(_drop, _selectedColumnIndexes, _sourceSchema, out _sources);
105136

106137
_sourceSchema = sourceSchema;
107138
OutputSchema = ComputeOutputSchema();
@@ -115,7 +146,7 @@ internal void Save(ModelSaveContext ctx)
115146
// bool (as byte): operation mode
116147
// int[]: selected source column indices
117148
ctx.Writer.WriteBoolByte(_drop);
118-
ctx.Writer.WriteIntArray(_sources);
149+
ctx.Writer.WriteIntArray(_selectedColumnIndexes);
119150
}
120151

121152
internal bool[] GetActive(Func<int, bool> predicate)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#@ TextLoader{
2+
#@ header+
3+
#@ sep=tab
4+
#@ col=Name:TX:0
5+
#@ col=Label:R4:1
6+
#@ }
7+
Name Label
8+
25 0
9+
38 0
10+
28 1
11+
44 1
12+
18 0
13+
34 0
14+
29 0
15+
63 1
16+
24 0
17+
55 0
18+
65 1
19+
36 0
20+
26 0
21+
58 0
22+
48 1
23+
43 1
24+
20 0
25+
43 0
26+
37 0
27+
40 1
28+
Wrote 20 rows of length 2
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#@ TextLoader{
2+
#@ header+
3+
#@ sep=tab
4+
#@ col=Name:TX:0
5+
#@ col=Label:R4:1
6+
#@ }
7+
Name Label
8+
25 0
9+
38 0
10+
28 1
11+
44 1
12+
18 0
13+
34 0
14+
29 0
15+
63 1
16+
24 0
17+
55 0
18+
65 1
19+
36 0
20+
26 0
21+
58 0
22+
48 1
23+
43 1
24+
20 0
25+
43 0
26+
37 0
27+
40 1
28+
Wrote 20 rows of length 2
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#@ TextLoader{
2+
#@ header+
3+
#@ sep=tab
4+
#@ col=Cat:TX:0-7
5+
#@ col=Num:R4:8-13
6+
#@ }
7+
Workclass education marital-status occupation relationship ethnicity sex native-country-region age fnlwgt education-num capital-gain capital-loss hours-per-week
8+
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States 25 226802 7 0 0 40
9+
Private HS-grad Married-civ-spouse Farming-fishing Husband White Male United-States 38 89814 9 0 0 50
10+
Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male United-States 28 336951 12 0 0 40
11+
Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male United-States 44 160323 10 7688 0 40
12+
? Some-college Never-married ? Own-child White Female United-States 18 103497 10 0 0 30
13+
Private 10th Never-married Other-service Not-in-family White Male United-States 34 198693 6 0 0 30
14+
? HS-grad Never-married ? Unmarried Black Male United-States 29 227026 9 0 0 40
15+
Self-emp-not-inc Prof-school Married-civ-spouse Prof-specialty Husband White Male United-States 63 104626 15 3103 0 32
16+
Private Some-college Never-married Other-service Unmarried White Female United-States 24 369667 10 0 0 40
17+
Private 7th-8th Married-civ-spouse Craft-repair Husband White Male United-States 55 104996 4 0 0 10
18+
Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male United-States 65 184454 9 6418 0 40
19+
Federal-gov Bachelors Married-civ-spouse Adm-clerical Husband White Male United-States 36 212465 13 0 0 40
20+
Private HS-grad Never-married Adm-clerical Not-in-family White Female United-States 26 82091 9 0 0 39
21+
? HS-grad Married-civ-spouse ? Husband White Male United-States 58 299831 9 0 0 35
22+
Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male United-States 48 279724 9 3103 0 48
23+
Private Masters Married-civ-spouse Exec-managerial Husband White Male United-States 43 346189 14 0 0 50
24+
State-gov Some-college Never-married Other-service Own-child White Male United-States 20 444554 10 0 0 25
25+
Private HS-grad Married-civ-spouse Adm-clerical Wife White Female United-States 43 128354 9 0 0 30
26+
Private HS-grad Widowed Machine-op-inspct Unmarried White Female United-States 37 60548 9 0 0 20
27+
Private Doctorate Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male ? 40 85019 16 0 0 45
28+
Wrote 20 rows of length 14
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#@ TextLoader{
2+
#@ header+
3+
#@ sep=tab
4+
#@ col=Cat:TX:0-7
5+
#@ col=Num:R4:8-13
6+
#@ }
7+
Workclass education marital-status occupation relationship ethnicity sex native-country-region age fnlwgt education-num capital-gain capital-loss hours-per-week
8+
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States 25 226802 7 0 0 40
9+
Private HS-grad Married-civ-spouse Farming-fishing Husband White Male United-States 38 89814 9 0 0 50
10+
Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male United-States 28 336951 12 0 0 40
11+
Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male United-States 44 160323 10 7688 0 40
12+
? Some-college Never-married ? Own-child White Female United-States 18 103497 10 0 0 30
13+
Private 10th Never-married Other-service Not-in-family White Male United-States 34 198693 6 0 0 30
14+
? HS-grad Never-married ? Unmarried Black Male United-States 29 227026 9 0 0 40
15+
Self-emp-not-inc Prof-school Married-civ-spouse Prof-specialty Husband White Male United-States 63 104626 15 3103 0 32
16+
Private Some-college Never-married Other-service Unmarried White Female United-States 24 369667 10 0 0 40
17+
Private 7th-8th Married-civ-spouse Craft-repair Husband White Male United-States 55 104996 4 0 0 10
18+
Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male United-States 65 184454 9 6418 0 40
19+
Federal-gov Bachelors Married-civ-spouse Adm-clerical Husband White Male United-States 36 212465 13 0 0 40
20+
Private HS-grad Never-married Adm-clerical Not-in-family White Female United-States 26 82091 9 0 0 39
21+
? HS-grad Married-civ-spouse ? Husband White Male United-States 58 299831 9 0 0 35
22+
Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male United-States 48 279724 9 3103 0 48
23+
Private Masters Married-civ-spouse Exec-managerial Husband White Male United-States 43 346189 14 0 0 50
24+
State-gov Some-college Never-married Other-service Own-child White Male United-States 20 444554 10 0 0 25
25+
Private HS-grad Married-civ-spouse Adm-clerical Wife White Female United-States 43 128354 9 0 0 30
26+
Private HS-grad Widowed Machine-op-inspct Unmarried White Female United-States 37 60548 9 0 0 20
27+
Private Doctorate Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male ? 40 85019 16 0 0 45
28+
Wrote 20 rows of length 14

test/Microsoft.ML.TestFramework/TestCommandBase.cs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2093,5 +2093,40 @@ public void Datatypes()
20932093
TestCore("savedata", intermediateData.Path, "loader=binary", "saver=text", textOutputPath.Arg("dout"));
20942094
Done();
20952095
}
2096+
2097+
[TestCategory("DataPipeSerialization")]
2098+
[Fact()]
2099+
public void SavePipeChooseColumnsByIndex()
2100+
{
2101+
string dataPath = GetDataPath("adult.tiny.with-schema.txt");
2102+
const string loaderArgs = "loader=text{header+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}";
2103+
2104+
OutputPath modelPath = ModelPath();
2105+
string extraArgs = "xf=ChooseColumnsByIndex{ind=3 ind=0}";
2106+
TestCore("showdata", dataPath, loaderArgs, extraArgs);
2107+
2108+
_step++;
2109+
2110+
TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), "");
2111+
Done();
2112+
}
2113+
2114+
[TestCategory("DataPipeSerialization")]
2115+
[Fact()]
2116+
public void SavePipeChooseColumnsByIndexDrop()
2117+
{
2118+
string dataPath = GetDataPath("adult.tiny.with-schema.txt");
2119+
const string loaderArgs = "loader=text{header+ col=Label:0 col=Cat:TX:1-8 col=Num:9-14 col=Name:TX:9}";
2120+
2121+
OutputPath modelPath = ModelPath();
2122+
2123+
string extraArgs = "xf=ChooseColumnsByIndex{ind=3 ind=0 drop+}";
2124+
TestCore("showdata", dataPath, loaderArgs, extraArgs);
2125+
2126+
_step++;
2127+
2128+
TestCore("showdata", dataPath, string.Format("in={{{0}}}", modelPath.Path), "");
2129+
Done();
2130+
}
20962131
}
20972132
}

0 commit comments

Comments
 (0)