Skip to content

Commit 7aa5a45

Browse files
authored
Towards #3204 - Documentation for MLContext.Transforms.Categorical (#3388)
* Docs for Categorical catalog * Fixing EntryPoints test * PR comments * PR comments - output data types * categorical update * Final review comments
1 parent 7ca768a commit 7aa5a45

File tree

8 files changed

+179
-59
lines changed

8 files changed

+179
-59
lines changed

src/Microsoft.ML.Data/Transforms/ColumnCopying.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
8686
}
8787

8888
/// <summary>
89-
/// <see cref="ITransformer"/> resulting from fitting an <see cref="ColumnCopyingEstimator"/>.
89+
/// <see cref="ITransformer"/> resulting from fitting a <see cref="ColumnCopyingEstimator"/>.
9090
/// </summary>
9191
public sealed class ColumnCopyingTransformer : OneToOneTransformerBase
9292
{

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public static class ConversionsExtensionsCatalog
2424
/// </summary>
2525
/// <param name="catalog">The conversion transform's catalog.</param>
2626
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
27-
/// This column's data type will be a vector of <see cref="System.UInt32"/>, or a scalar <see cref="System.UInt32"/> based on whether the input column data types
27+
/// This column's data type will be a vector of keys, or a scalar key, based on whether the input column data types
2828
/// are vectors or scalars.</param>
2929
/// <param name="inputColumnName">Name of the column whose data will be hashed.
3030
/// If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.

src/Microsoft.ML.Data/Transforms/Hashing.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1113,7 +1113,7 @@ public override void Process()
11131113
/// | -- | -- |
11141114
/// | Does this estimator need to look at the data to train its parameters? | Yes, if the mapping of the hashes to the values is required. |
11151115
/// | Input column data type | Vector or scalar of numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [DateTime](xref:System.DateTime) and [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
1116-
/// | Output column data type | Vector or scalar [System.Int32](xref:System.Int32).|
1116+
/// | Output column data type | Vector or scalar [key](xref:Microsoft.ML.Data.KeyDataViewType)|
11171117
///
11181118
/// ]]></format>
11191119
/// </remarks>

src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ namespace Microsoft.ML.Transforms
2121
/// | | |
2222
/// | -- | -- |
2323
/// | Does this estimator need to look at the data to train its parameters? | Yes |
24-
/// | Input column data type | Scalar numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) or [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
25-
/// | Output column data type | [key](xref:Microsoft.ML.Data.KeyDataViewType)|
24+
/// | Input column data type | Scalar or vector of numeric, boolean, [text](xref:Microsoft.ML.Data.TextDataViewType), [System.DateTime](xref:System.DateTime) and [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
25+
/// | Output column data type | Scalar or vector of [key](xref:Microsoft.ML.Data.KeyDataViewType)|
2626
///
2727
/// The ValueToKeyMappingEstimator builds up term vocabularies(dictionaries) mapping the input values to the keys on the dictionary.
2828
/// If multiple columns are used, each column builds/uses exactly one vocabulary.

src/Microsoft.ML.Transforms/CategoricalCatalog.cs

+46-19
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,22 @@ namespace Microsoft.ML
1616
public static class CategoricalCatalog
1717
{
1818
/// <summary>
19-
/// Convert text columns into one-hot encoded vectors.
19+
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts the input column specified by <paramref name="inputColumnName"/>
20+
/// into a column of one-hot encoded vectors named <paramref name="outputColumnName"/>.
2021
/// </summary>
21-
/// <param name="catalog">The transform catalog</param>
22-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
23-
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
24-
/// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
22+
/// <param name="catalog">The transform catalog.</param>
23+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
24+
/// This column's data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
25+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
26+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, this column's data type will be a key in the case of a scalar input column
27+
/// or a vector of keys in the case of a vector input column.</param>
28+
/// <param name="inputColumnName">Name of column to convert to one-hot vectors. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/>
29+
/// will be used as source. This column's data type can be scalar or vector of numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.</param>
30+
/// <param name="outputKind">Output kind: Bag (multi-set vector), Indicator (indicator vector), Key (index), or Binary encoded indicator vector.</param>
2531
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
26-
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> choosen they will be in the order encountered.
27-
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
32+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/>
33+
/// is chosen, they will be in the order encountered. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>,
34+
/// items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
2835
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
2936
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
3037
/// <example>
@@ -44,14 +51,21 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate
4451
new[] { new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality) }, keyData);
4552

4653
/// <summary>
47-
/// Convert text columns into one-hot encoded vectors.
54+
/// Create a <see cref="OneHotEncodingEstimator"/>, which converts one or more input text columns specified in <paramref name="columns"/>
55+
/// into as many columns of one-hot encoded vectors.
4856
/// </summary>
49-
/// <param name="catalog">The transform catalog</param>
50-
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
57+
/// <remarks>If multiple columns are passed to the estimator, all of the columns will be processed in a single pass over the data.
58+
/// Therefore, it is more efficient to specify one estimator with many columns than it is to specify many estimators each with a single column.</remarks>
59+
/// <param name="catalog">The transform catalog.</param>
60+
/// <param name="columns">The pairs of input and output columns. The output columns' data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
61+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
62+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, the output columns' data type will be a key in the case of a scalar input column
63+
/// or a vector of keys in the case of a vector input column.</param>
5164
/// <param name="outputKind">Output kind: Bag (multi-set vector), Indicator (indicator vector), Key (index), or Binary encoded indicator vector.</param>
5265
/// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
53-
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> choosen they will be in the order encountered.
54-
/// If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
66+
/// <param name="keyOrdinality">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/>
67+
/// is chosen, they will be in the order encountered. If <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/>,
68+
/// items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
5569
/// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single column data view,
5670
/// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting.</param>
5771
/// <example>
@@ -97,17 +111,24 @@ internal static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Ca
97111
=> new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData);
98112

99113
/// <summary>
100-
/// Convert a text column into hash-based one-hot encoded vector.
114+
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts a text column specified by <paramref name="inputColumnName"/>
115+
/// into a hash-based one-hot encoded vector column named <paramref name="outputColumnName"/>.
101116
/// </summary>
102117
/// <param name="catalog">The transform catalog.</param>
103-
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
104-
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
118+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
119+
/// This column's data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
120+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
121+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, this column's data type will be a key in the case of a scalar input column
122+
/// or a vector of keys in the case of a vector input column.</param>
123+
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
124+
/// This column's data type can be scalar or vector of numeric, text, boolean, <see cref="System.DateTime"/> or <see cref="System.DateTimeOffset"/>.</param>
105125
/// <param name="outputKind">The conversion mode.</param>
106126
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
107127
/// <param name="seed">Hashing seed.</param>
108128
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
109129
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
110-
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one.
130+
/// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing,
131+
/// as such, can map many initial values to one.
111132
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
112133
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
113134
/// <example>
@@ -128,16 +149,22 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata
128149
new[] { new OneHotHashEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts) });
129150

130151
/// <summary>
131-
/// Convert text columns into hash-based one-hot encoded vector columns.
152+
/// Create a <see cref="OneHotHashEncodingEstimator"/>, which converts one or more input text columns specified by <paramref name="columns"/>
153+
/// into as many columns of hash-based one-hot encoded vectors.
132154
/// </summary>
155+
/// <remarks>If multiple columns are passed to the estimator, all of the columns will be processed in a single pass over the data.
156+
/// Therefore, it is more efficient to specify one estimator with many columns than it is to specify many estimators each with a single column.</remarks>
133157
/// <param name="catalog">The transform catalog.</param>
134-
/// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
158+
/// <param name="columns">The pairs of input and output columns. The output columns' data type will be a vector of <see cref="System.Single"/> if <paramref name="outputKind"/> is
159+
/// <see cref="OneHotEncodingEstimator.OutputKind.Bag"/>, <see cref="OneHotEncodingEstimator.OutputKind.Indicator"/>, or <see cref="OneHotEncodingEstimator.OutputKind.Binary"/>.
160+
/// If <paramref name="outputKind"/> is <see cref="OneHotEncodingEstimator.OutputKind.Key"/>, the output columns' data type will be a key in the case of a scalar input column
161+
/// or a vector of keys in the case of a vector input column.</param>
135162
/// <param name="outputKind">The conversion mode.</param>
136163
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
137164
/// <param name="seed">Hashing seed.</param>
138165
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
139166
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
140-
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one.
167+
/// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
141168
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
142169
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
143170
/// <example>

0 commit comments

Comments
 (0)