|
3 | 3 | // See the LICENSE file in the project root for more information.
|
4 | 4 |
|
5 | 5 | using System;
|
| 6 | +using System.Collections; |
6 | 7 | using System.Collections.Generic;
|
| 8 | +using System.Collections.Specialized; |
7 | 9 | using System.Globalization;
|
8 | 10 | using System.IO;
|
9 | 11 | using System.Linq;
|
@@ -49,23 +51,35 @@ internal static IDictionary<string, string> GenerateSampleData(string inputFile,
|
49 | 51 |
|
50 | 52 | internal static IDictionary<string, string> GenerateSampleData(IDataView dataView, ColumnInferenceResults columnInference)
|
51 | 53 | {
|
52 |
| - var featureColumns = dataView.Schema.AsEnumerable().Where(col => col.Name != columnInference.ColumnInformation.LabelColumnName && !columnInference.ColumnInformation.IgnoredColumnNames.Contains(col.Name)); |
| 54 | + var featureColumns = dataView.Schema.ToList().FindAll( |
| 55 | + col => col.Name != columnInference.ColumnInformation.LabelColumnName && |
| 56 | + !columnInference.ColumnInformation.IgnoredColumnNames.Contains(col.Name)); |
53 | 57 | var rowCursor = dataView.GetRowCursor(featureColumns);
|
54 | 58 |
|
55 |
| - var sampleData = featureColumns.Select(column => new { key = Utils.Normalize(column.Name), val = "null" }).ToDictionary(x => x.key, x => x.val); |
| 59 | + OrderedDictionary sampleData = new OrderedDictionary(); |
| 60 | + // Get normalized and unique column names. If there are duplicate column names, the |
| 61 | + // differentiator suffix '_col_x' will be added to each column name, where 'x' is |
| 62 | + // the load order for a given column. |
| 63 | + List<string> normalizedColumnNames= GenerateColumnNames(featureColumns.Select(column => column.Name).ToList()); |
| 64 | + foreach (string columnName in normalizedColumnNames) |
| 65 | + sampleData[columnName] = null; |
56 | 66 | if (rowCursor.MoveNext())
|
57 | 67 | {
|
58 | 68 | var getGetGetterMethod = typeof(Utils).GetMethod(nameof(Utils.GetValueFromColumn), BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);
|
59 | 69 |
|
60 |
| - foreach (var column in featureColumns) |
| 70 | + // Access each feature column name through its index in featureColumns |
| 71 | + // as there may exist duplicate column names. In this case, sampleData |
| 72 | + // column names may have the differentiator suffix of '_col_x' added, |
| 73 | + // which requires accessing each column name through its index. |
| 74 | + for(int i = 0; i < featureColumns.Count(); i++) |
61 | 75 | {
|
62 |
| - var getGeneraicGetGetterMethod = getGetGetterMethod.MakeGenericMethod(column.Type.RawType); |
63 |
| - string val = getGeneraicGetGetterMethod.Invoke(null, new object[] { rowCursor, column }) as string; |
64 |
| - sampleData[Utils.Normalize(column.Name)] = val; |
| 76 | + var getGenericGetGetterMethod = getGetGetterMethod.MakeGenericMethod(featureColumns[i].Type.RawType); |
| 77 | + string val = getGenericGetGetterMethod.Invoke(null, new object[] { rowCursor, featureColumns[i] }) as string; |
| 78 | + sampleData[i] = val; |
65 | 79 | }
|
66 | 80 | }
|
67 | 81 |
|
68 |
| - return sampleData; |
| 82 | + return sampleData.Cast<DictionaryEntry>().ToDictionary(k => (string)k.Key, v => (string)v.Value); |
69 | 83 | }
|
70 | 84 |
|
71 | 85 | internal static string GetValueFromColumn<T>(DataViewRowCursor rowCursor, DataViewSchema.Column column)
|
@@ -247,8 +261,7 @@ internal static int CreateSolutionFile(string solutionFile, string outputPath)
|
247 | 261 | internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnInferenceResults, IDictionary<string, CodeGeneratorSettings.ColumnMapping> columnMapping = default)
|
248 | 262 | {
|
249 | 263 | IList<string> result = new List<string>();
|
250 |
| - List<string> normalizedColumnNames = new List<string>(); |
251 |
| - bool duplicateColumnNamesExist = false; |
| 264 | + List<string> columnNames = new List<string>(); |
252 | 265 | foreach (var column in columnInferenceResults.TextLoaderOptions.Columns)
|
253 | 266 | {
|
254 | 267 | StringBuilder sb = new StringBuilder();
|
@@ -284,28 +297,47 @@ internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnI
|
284 | 297 | result.Add($"[ColumnName(\"{columnName}\"), LoadColumn({column.Source[0].Min})]");
|
285 | 298 | }
|
286 | 299 | sb.Append(" ");
|
287 |
| - string normalizedColumnName = Utils.Normalize(column.Name); |
288 |
| - // Put placeholder for normalized and unique version of column name |
289 |
| - if (!duplicateColumnNamesExist && normalizedColumnNames.Contains(normalizedColumnName)) |
290 |
| - duplicateColumnNamesExist = true; |
291 |
| - normalizedColumnNames.Add(normalizedColumnName); |
| 300 | + columnNames.Add(column.Name); |
292 | 301 | result.Add(sb.ToString());
|
293 | 302 | result.Add("\r\n");
|
294 | 303 | }
|
| 304 | + // Get normalized and unique column names. If there are duplicate column names, the |
| 305 | + // differentiator suffix '_col_x' will be added to each column name, where 'x' is |
| 306 | + // the load order for a given column. |
| 307 | + List<string> normalizedColumnNames = GenerateColumnNames(columnNames); |
295 | 308 | for (int i = 1; i < result.Count; i+=3)
|
296 | 309 | {
|
297 | 310 | // Get normalized column name for correctly typed class property name
|
298 |
| - // If duplicate column names exist, the only way to ensure all generated column names are unique is to add |
299 |
| - // a differentiator depending on the column load order from dataset. |
300 |
| - if (duplicateColumnNamesExist) |
301 |
| - result[i] += normalizedColumnNames[i/3] + $"_col_{i/3}"; |
302 |
| - else |
303 |
| - result[i] += normalizedColumnNames[i/3]; |
| 311 | + result[i] += normalizedColumnNames[i/3]; |
304 | 312 | result[i] += "{get; set;}";
|
305 | 313 | }
|
306 | 314 | return result;
|
307 | 315 | }
|
308 | 316 |
|
| 317 | + /// <summary> |
| 318 | + /// Takes a list of column names that may not be normalized to fit property name standards |
| 319 | + /// and may contain duplicates. Returns unique and normalized column names. |
| 320 | + /// </summary> |
| 321 | + /// <param name="columnNames">Column names to normalize.</param> |
| 322 | + /// <returns>A list of strings that contain normalized and unique column names.</returns> |
| 323 | + internal static List<string> GenerateColumnNames(List<string> columnNames) |
| 324 | + { |
| 325 | + for (int i = 0; i < columnNames.Count; i++) |
| 326 | + columnNames[i] = Utils.Normalize(columnNames[i]); |
| 327 | + // Check if there are any duplicates in columnNames by building a set from it |
| 328 | + // and comparing the size of the set with the size of the list. |
| 329 | + HashSet<String> columnNamesSet = new HashSet<String>(columnNames); |
| 330 | + // If there are duplicates, add the differentiator suffix '_col_x' |
| 331 | + // to each normalized column name, where 'x' is the load |
| 332 | + // order for a given column from dataset. |
| 333 | + if (columnNamesSet.Count != columnNames.Count) |
| 334 | + { |
| 335 | + for (int i = 0; i < columnNames.Count; i++) |
| 336 | + columnNames[i] += String.Concat("_col_", i); |
| 337 | + } |
| 338 | + return columnNames; |
| 339 | + } |
| 340 | + |
309 | 341 | internal static string GetSymbolOfDataKind(DataKind dataKind)
|
310 | 342 | {
|
311 | 343 | switch (dataKind)
|
|
0 commit comments