Skip to content

Commit ae63aea

Browse files
committed
categorical update
1 parent 3be0ed5 commit ae63aea

File tree

6 files changed

+114
-65
lines changed

6 files changed

+114
-65
lines changed

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public static class ConversionsExtensionsCatalog
2424
/// </summary>
2525
/// <param name="catalog">The conversion transform's catalog.</param>
2626
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
27-
/// This column's data type will be a vector of <see cref="System.UInt32"/>, or a scalar <see cref="System.UInt32"/> based on whether the input column data types
27+
/// This column's data type will be a vector of keys, or a scalar key, based on whether the input column data types
2828
/// are vectors or scalars.</param>
2929
/// <param name="inputColumnName">Name of the column whose data will be hashed.
3030
/// If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.

src/Microsoft.ML.Data/Transforms/Hashing.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1113,7 +1113,7 @@ public override void Process()
11131113
/// | -- | -- |
11141114
/// | Does this estimator need to look at the data to train its parameters? | Yes, if the mapping of the hashes to the values is required. |
11151115
/// | Input column data type | Vector or scalar of numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [DateTime](xref:System.DateTime) and [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
1116-
/// | Output column data type | Vector or scalar [System.Int32](xref:System.Int32).|
1116+
/// | Output column data type | Vector or scalar [key](xref:Microsoft.ML.Data.KeyDataViewType)|
11171117
///
11181118
/// ]]></format>
11191119
/// </remarks>

src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ namespace Microsoft.ML.Transforms
2121
/// | | |
2222
/// | -- | -- |
2323
/// | Does this estimator need to look at the data to train its parameters? | Yes |
24-
/// | Input column data type | Scalar numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) or [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
25-
/// | Output column data type | [key](xref:Microsoft.ML.Data.KeyDataViewType)|
24+
/// | Input column data type | Scalar or vector of numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) and [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
25+
/// | Output column data type | Scalar or vector of [key](xref:Microsoft.ML.Data.KeyDataViewType)|
2626
///
2727
/// The ValueToKeyMappingEstimator builds up term vocabularies(dictionaries) mapping the input values to the keys on the dictionary.
2828
/// If multiple columns are used, each column builds/uses exactly one vocabulary.

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

+37-22
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,22 @@ namespace Microsoft.ML
1515
public static class CategoricalCatalog
1616
{
1717
/// <summary>
18-
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts the text input column specified by <paramref name="inputColumnName"/> into a column of one-hot encoded vectors named <paramref name="outputColumnName"/>.
18+
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts the input column specified by <paramref name="inputColumnName"/>
19+
/// into a column of one-hot encoded vectors named <paramref name="outputColumnName"/>.
1920
/// </summary>
20-
/// <param name="catalog">The transform catalog</param>
21+
/// <param name="catalog">The transform catalog.</param>
2122
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
22-
/// This column's data type will be a vector of floats for <paramref name="outputKind"/> Bag, Indicator, and Binary. For <paramref name="outputKind"/> Key, the data type will be a key in the case of a singleton input column or a vector of keys in the case of a vector input column.</param>
23-
/// <param name="inputColumnName">Name of column to convert to one-hot vectors. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
24-
/// This column's data type can be numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/></param>
23+
/// This column's data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
24+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
25+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, this column's data type will be a key in the case of a scalar input column
26+
/// or a vector of keys in the case of a vector input column.</param>
27+
/// <param name="inputColumnName">Name of column to convert to one-hot vectors. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/>
28+
/// will be used as source. This column's data type can be scalar or vector of numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.</param>
2529
/// <param name="outputKind">Output kind: Bag (multi-set vector), Indicator (indicator vector), Key (index), or Binary encoded indicator vector.</param>
2630
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
27-
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> choosen they will be in the order encountered.
28-
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
31+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/>
32+
/// is chosen, they will be in the order encountered. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>,
33+
/// items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
2934
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
3035
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
3136
/// <example>
@@ -45,16 +50,19 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
4550
new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData);
4651

4752
/// <summary>
48-
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts the input text column specified by <see cref="InputOutputColumnPair.InputColumnName"/> into a column of one-hot encoded vectors named <see cref="InputOutputColumnPair.OutputColumnName"/>.
53+
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts one or more input columns specified in <paramref name="columns"/>
54+
/// into as many columns of one-hot encoded vectors.
4955
/// </summary>
50-
/// <param name="catalog">The transform catalog</param>
51-
/// <param name="columns">The pairs of input and output columns. The data type of the input column can be numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.
52-
/// The data type of the output column will be a vector of floats for <paramref name="outputKind"/> Bag, Indicator, and Binary.
53-
/// For <paramref name="outputKind"/> Key, the data type of the output column will be a key in the case of a singleton input column or a vector of keys in the case of a vector input column.</param>
56+
/// <param name="catalog">The transform catalog.</param>
57+
/// <param name="columns">The pairs of input and output columns. The output columns' data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
58+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
59+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, the output columns' data type will be a key in the case of a scalar input column
60+
/// or a vector of keys in the case of a vector input column.</param>
5461
/// <param name="outputKind">Output kind: Bag (multi-set vector), Indicator (indicator vector), Key (index), or Binary encoded indicator vector.</param>
5562
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
56-
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> choosen they will be in the order encountered.
57-
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
63+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/>
64+
/// is chosen, they will be in the order encountered. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>,
65+
/// items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
5866
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
5967
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
6068
/// <example>
@@ -100,19 +108,24 @@ internal static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Ca
100108
=> new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData);
101109

102110
/// <summary>
103-
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts a text column specified by <paramref name="inputColumnName"/> into a hash-based one-hot encoded vector column named <paramref name="outputColumnName"/>.
111+
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts the input column specified by <paramref name="inputColumnName"/>
112+
/// into a hash-based one-hot encoded vector column named <paramref name="outputColumnName"/>.
104113
/// </summary>
105114
/// <param name="catalog">The transform catalog.</param>
106115
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
107-
/// This column's data type will be a vector of floats for <paramref name="outputKind"/> Bag, Indicator, and Binary. For <paramref name="outputKind"/> Key, the data type will be a key in the case of a singleton input column or a vector of keys in the case of a vector input column.</param>
116+
/// This column's data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
117+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
118+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, this column's data type will be a key in the case of a scalar input column
119+
/// or a vector of keys in the case of a vector input column.</param>
108120
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
109-
/// This column's data type can be numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.</param>
121+
/// This column's data type can be scalar or vector of numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.</param>
110122
/// <param name="outputKind">The conversion mode.</param>
111123
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
112124
/// <param name="seed">Hashing seed.</param>
113125
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
114126
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
115-
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one.
127+
/// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing,
128+
/// as such, can map many initial values to one.
116129
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
117130
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
118131
/// <example>
@@ -133,12 +146,14 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
133146
new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) });
134147

135148
/// <summary>
136-
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts the input text column specified by <see cref="InputOutputColumnPair.InputColumnName"/> into a column of hash-based one-hot encoded vectors named <see cref="InputOutputColumnPair.OutputColumnName"/>
149+
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts one or more input columns specified by <paramref name="columns"/>
150+
/// into as many columns of hash-based one-hot encoded vectors.
137151
/// </summary>
138152
/// <param name="catalog">The transform catalog.</param>
139-
/// <param name="columns">The pairs of input and output columns. The data type of the input column can be numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.
140-
/// The data type of the output column will be a vector of floats for <paramref name="outputKind"/> Bag, Indicator, and Binary.
141-
/// For <paramref name="outputKind"/> Key, the data type of the output column will be a key in the case of a singleton input column or a vector of keys in the case of a vector input column.</param>
153+
/// <param name="columns">The pairs of input and output columns. The output columns' data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
154+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
155+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, the output columns' data type will be a key in the case of a scalar input column
156+
/// or a vector of keys in the case of a vector input column.</param>
142157
/// <param name="outputKind">The conversion mode.</param>
143158
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
144159
/// <param name="seed">Hashing seed.</param>

0 commit comments

Comments
 (0)