@@ -7,44 +7,47 @@ namespace Microsoft.ML.Auto
7
7
internal static class ColumnInferenceApi
8
8
{
9
9
public static ColumnInferenceResult InferColumns ( MLContext context , string path , string label ,
10
- bool hasHeader , string separator , bool ? isQuoted , bool ? isSparse )
10
+ bool hasHeader , char ? separatorChar , bool ? allowQuotedStrings , bool ? supportSparse , bool trimWhitespace )
11
11
{
12
12
var sample = TextFileSample . CreateFromFullFile ( path ) ;
13
- Func < TextLoader , IDataView > createDataView = ( textLoader ) =>
13
+ var splitInference = InferSplit ( sample , separatorChar , allowQuotedStrings , supportSparse ) ;
14
+ var typeInference = InferColumnTypes ( context , sample , splitInference ) ;
15
+ var typedLoaderArgs = new TextLoader . Arguments
14
16
{
15
- return textLoader . Read ( path ) ;
17
+ Column = ColumnTypeInference . GenerateLoaderColumns ( typeInference . Columns ) ,
18
+ Separator = splitInference . Separator ,
19
+ AllowSparse = splitInference . AllowSparse ,
20
+ AllowQuoting = splitInference . AllowQuote ,
21
+ HasHeader = hasHeader ,
22
+ TrimWhitespace = trimWhitespace
16
23
} ;
17
- return InferColumns ( context , sample , createDataView , label , hasHeader , separator , isQuoted , isSparse ) ;
18
- }
24
+ var textLoader = context . Data . CreateTextReader ( typedLoaderArgs ) ;
25
+ var dataView = textLoader . Read ( path ) ;
19
26
20
- public static ColumnInferenceResult InferColumns ( MLContext context , IMultiStreamSource multiStreamSource ,
21
- string label , bool hasHeader , string separator , bool ? isQuoted , bool ? isSparse )
22
- {
23
- // heuristic: use first stream in multi-stream source to infer column types & split
24
- var stream = multiStreamSource . Open ( 0 ) ;
25
- var sample = TextFileSample . CreateFromFullStream ( stream ) ;
27
+ var purposeInferenceResult = PurposeInference . InferPurposes ( context , dataView , label ) ;
26
28
27
- Func < TextLoader , IDataView > createDataView = ( textLoader ) =>
28
- {
29
- return textLoader . Read ( multiStreamSource ) ;
30
- } ;
29
+ // infer column grouping and generate column names
30
+ var groupingResult = ColumnGroupingInference . InferGroupingAndNames ( context , hasHeader ,
31
+ typeInference . Columns , purposeInferenceResult ) ;
31
32
32
- return InferColumns ( context , sample , createDataView , label , hasHeader , separator , isQuoted , isSparse ) ;
33
+ // build result objects & return
34
+ var inferredColumns = groupingResult . Select ( c => ( c . GenerateTextLoaderColumn ( ) , c . Purpose ) ) . ToArray ( ) ;
35
+ return new ColumnInferenceResult ( inferredColumns , splitInference . AllowQuote , splitInference . AllowSparse , splitInference . Separator , hasHeader , trimWhitespace ) ;
33
36
}
34
37
35
- private static TextFileContents . ColumnSplitResult InferSplit ( TextFileSample sample , string separator , bool ? isQuoted , bool ? isSparse )
38
+ private static TextFileContents . ColumnSplitResult InferSplit ( TextFileSample sample , char ? separatorChar , bool ? allowQuotedStrings , bool ? supportSparse )
36
39
{
37
- var separatorCandidates = separator == null ? TextFileContents . DefaultSeparators : new string [ ] { separator } ;
40
+ var separatorCandidates = separatorChar == null ? TextFileContents . DefaultSeparators : new char [ ] { separatorChar . Value } ;
38
41
var splitInference = TextFileContents . TrySplitColumns ( sample , separatorCandidates ) ;
39
42
40
43
// respect passed-in overrides
41
- if ( isQuoted != null )
44
+ if ( allowQuotedStrings != null )
42
45
{
43
- splitInference . AllowQuote = isQuoted . Value ;
46
+ splitInference . AllowQuote = allowQuotedStrings . Value ;
44
47
}
45
- if ( isSparse != null )
48
+ if ( supportSparse != null )
46
49
{
47
- splitInference . AllowSparse = isSparse . Value ;
50
+ splitInference . AllowSparse = supportSparse . Value ;
48
51
}
49
52
50
53
if ( ! splitInference . IsSuccess )
@@ -75,33 +78,5 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
75
78
76
79
return typeInferenceResult ;
77
80
}
78
-
79
- private static ColumnInferenceResult InferColumns ( MLContext context ,
80
- TextFileSample sample , Func < TextLoader , IDataView > createDataView , string label ,
81
- bool hasHeader , string separator , bool ? isQuoted , bool ? isSparse )
82
- {
83
- var splitInference = InferSplit ( sample , separator , isQuoted , isSparse ) ;
84
- var typeInference = InferColumnTypes ( context , sample , splitInference ) ;
85
- var typedLoaderArgs = new TextLoader . Arguments
86
- {
87
- Column = ColumnTypeInference . GenerateLoaderColumns ( typeInference . Columns ) ,
88
- Separator = splitInference . Separator ,
89
- AllowSparse = splitInference . AllowSparse ,
90
- AllowQuoting = splitInference . AllowQuote ,
91
- HasHeader = hasHeader
92
- } ;
93
- var textLoader = context . Data . CreateTextReader ( typedLoaderArgs ) ;
94
- var dataView = createDataView ( textLoader ) ;
95
-
96
- var purposeInferenceResult = PurposeInference . InferPurposes ( context , dataView , label ) ;
97
-
98
- // infer column grouping and generate column names
99
- var groupingResult = ColumnGroupingInference . InferGroupingAndNames ( context , hasHeader ,
100
- typeInference . Columns , purposeInferenceResult ) ;
101
-
102
- // build result objects & return
103
- var inferredColumns = groupingResult . Select ( c => ( c . GenerateTextLoaderColumn ( ) , c . Purpose ) ) . ToArray ( ) ;
104
- return new ColumnInferenceResult ( inferredColumns , splitInference . AllowQuote , splitInference . AllowSparse , splitInference . Separator , hasHeader ) ;
105
- }
106
81
}
107
82
}
0 commit comments