diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs new file mode 100644 index 0000000000..16dc0278b9 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ConcatTransform.cs @@ -0,0 +1,68 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. + using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Runtime.Api; + using System; + using System.Linq; + using System.Collections.Generic; + using Microsoft.ML.Transforms; + +namespace Microsoft.ML.Samples.Dynamic +{ + public partial class TransformSamples + { + class SampleInfertDataWithFeatures + { + public VBuffer Features { get; set; } + } + + public static void ConcatTransform() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // + // Age Case Education induced parity pooled.stratum row_num ... + // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... + // 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... + // 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ... + // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... + // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... + + // A pipeline for concatenating the age, parity and induced columns together in the Features column. + string outputColumnName = "Features"; + var pipeline = new ColumnConcatenatingEstimator(ml, outputColumnName, new[] { "Age", "Parity", "Induced"}); + + // The transformed data. + var transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the data of the newly created column as an IEnumerable of SampleInfertDataWithFeatures. + var featuresColumn = transformedData.AsEnumerable(ml, reuseRowObject: false); + + Console.WriteLine($"{outputColumnName} column obtained post-transformation."); + foreach (var featureRow in featuresColumn) + { + foreach (var value in featureRow.Features.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + // Features column obtained post-transformation. + // + // 26 6 1 + // 42 1 1 + // 39 6 2 + // 34 4 2 + // 35 3 1 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs new file mode 100644 index 0000000000..4755a8a994 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -0,0 +1,116 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. 
+ using Microsoft.ML.Data;
+ using Microsoft.ML.Runtime.Api;
+ using Microsoft.ML.Runtime.Data;
+ using Microsoft.ML.Transforms.Categorical;
+ using Microsoft.ML.Transforms.Text;
+ using System;
+ using System.Collections.Generic;
+ using System.Linq;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public partial class TransformSamples
+    {
+        public static void KeyToValue_Term()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var ml = new MLContext();
+
+            // Get a small dataset as an IEnumerable.
+            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
+            var trainData = ml.CreateStreamingDataView(data);
+
+            // Preview of one of the columns of the topics data.
+            // The Review column contains the text that the keys will be generated from.
+            //
+            // Review
+            // "animals birds cats dogs fish horse"
+            // "horse birds house fish duck cats"
+            // "car truck driver bus pickup"
+            // "car truck driver bus pickup horse"
+
+            // A pipeline to convert the terms of the 'Review' column into keys,
+            // making use of the default settings.
+            string defaultColumnName = "DefaultKeys";
+            // REVIEW create through the catalog extension
+            var default_pipeline = new WordTokenizingEstimator(ml, "Review")
+                .Append(new ValueToKeyMappingEstimator(ml, "Review", defaultColumnName));
+
+            // Another pipeline that customizes the advanced settings of the TermEstimator.
+            // We can change maxNumTerms to limit how many keys get generated out of the set of words,
+            // and control the order in which the keys are assigned by changing sort from the default Occurrence
+            // (the order in which terms are encountered) to Value (alphabetical).
+            string customizedColumnName = "CustomizedKeys";
+            var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
+                .Append(new ValueToKeyMappingEstimator(ml, "Review", customizedColumnName, maxNumTerms: 10, sort: TermTransform.SortOrder.Value));
+
+            // The transformed data.
+            var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData);
+            var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData);
+
+            // Small helper to print the text inside the columns, in the console.
+            Action<string, IEnumerable<VBuffer<uint>>> printHelper = (columnName, column) =>
+            {
+                Console.WriteLine($"{columnName} column obtained post-transformation.");
+                foreach (var row in column)
+                {
+                    foreach (var value in row.Values)
+                        Console.Write($"{value} ");
+                    Console.WriteLine("");
+                }
+
+                Console.WriteLine("===================================================");
+            };
+
+            // Preview of the DefaultKeys column obtained after processing the input.
+            var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName);
+            printHelper(defaultColumnName, defaultColumn);
+
+            // DefaultKeys column obtained post-transformation.
+            //
+            // 1 2 3 4 5 6
+            // 6 2 7 5 8 3
+            // 9 10 11 12 13 3
+            // 9 10 11 12 13 6
+
+            // Previewing the CustomizedKeys column obtained after processing the input.
+            var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName);
+            printHelper(customizedColumnName, customizedColumn);
+
+            // CustomizedKeys column obtained post-transformation.
+            //
+            // 1 2 4 5 7 8
+            // 8 2 9 7 6 4
+            // 3 10 0 0 0 4
+            // 3 10 0 0 0 8
+
+            // Retrieve the original values by appending the KeyToValueEstimator to the existing pipelines
+            // to convert the keys back to the strings.
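+            // (KeyToValueEstimator learns nothing new during Fit: it reads the key-value metadata that
+            // ValueToKeyMappingEstimator attached to the key column and uses it to map each key back to
+            // the term it was assigned from.)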
+ var pipeline = default_pipeline.Append(new KeyToValueEstimator(ml, defaultColumnName)); + transformedData_default = pipeline.Fit(trainData).Transform(trainData); + + // Preview of the DefaultColumnName column obtained. + var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName); + + foreach (var row in originalColumnBack) + { + foreach (var value in row.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + // DefaultKeys column obtained post-transformation. + // + // animals birds cats dogs fish horse + // horse birds house fish duck cats + // car truck driver bus pickup cats + // car truck driver bus pickup horse + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs new file mode 100644 index 0000000000..67a1bd859f --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/MinMaxNormalizer.cs @@ -0,0 +1,91 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Data; + using Microsoft.ML.Transforms.Normalizers; + using System; + using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic +{ + public partial class TransformSamples + { + public static void MinMaxNormalizer() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // + // Age Case Education Induced Parity PooledStratum RowNum ... + // 26 1 0-5yrs 1 6 3 1 ... + // 42 1 0-5yrs 1 1 1 2 ... + // 39 1 0-5yrs 2 6 4 3 ... + // 34 1 0-5yrs 2 4 2 4 ... + // 35 1 6-11yrs 1 3 32 5 ... + + // A pipeline for normalizing the Induced column. + var pipeline = ml.Transforms.Normalize("Induced"); + // The transformed (normalized according to Normalizer.NormalizerMode.MinMax) data. + var transformedData = pipeline.Fit(trainData).Transform(trainData); + // Getting the data of the newly created column, so we can preview it. + var normalizedColumn = transformedData.GetColumn(ml, "Induced"); + + // A small printing utility. + Action> printHelper = (colName, column) => + { + Console.WriteLine($"{colName} column obtained post-transformation."); + foreach (var row in column) + Console.WriteLine($"{row} "); + }; + + printHelper("Induced", normalizedColumn); + + // Induced column obtained post-transformation. + // + // 0.5 + // 0.5 + // 1 + // 1 + // 0.5 + + // Composing a different pipeline if we wanted to normalize more than one column at a time. + // Using log scale as the normalization mode. + var multiColPipeline = ml.Transforms.Normalize(NormalizingEstimator.NormalizerMode.LogMeanVariance, new[] { ("Induced", "LogInduced"), ("Spontaneous", "LogSpontaneous") }); + // The transformed data. + var multiColtransformedData = multiColPipeline.Fit(trainData).Transform(trainData); + + // Getting the newly created columns. 
+ var normalizedInduced = multiColtransformedData.GetColumn(ml, "LogInduced"); + var normalizedSpont = multiColtransformedData.GetColumn(ml, "LogSpontaneous"); + + printHelper("LogInduced", normalizedInduced); + + // LogInduced column obtained post-transformation. + // + // 0.2071445 + // 0.2071445 + // 0.889631 + // 0.889631 + // 0.2071445 + + printHelper("LogSpontaneous", normalizedSpont); + + // LogSpontaneous column obtained post-transformation. + // + // 0.8413026 + // 0 + // 0 + // 0 + // 0.1586974 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs new file mode 100644 index 0000000000..a77bb703e0 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs @@ -0,0 +1,88 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. + using Microsoft.ML.Runtime.Data; + using Microsoft.ML.Runtime.Api; + using Microsoft.ML.Data; + using Microsoft.ML.Transforms.Text; + using System; + using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic +{ + public partial class TransformSamples + { + public static void TextTransform() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(); + + // Get a small dataset as an IEnumerable and convert to IDataView. + IEnumerable data = SamplesUtils.DatasetUtils.GetSentimentData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of the data. + // + // Sentiment SentimentText + // true Best game I've ever played. + // false ==RUDE== Dude, 2. + // true Until the next game, this is the best Xbox game! + + // A pipeline for featurization of the "SentimentText" column, and placing the output in a new column named "DefaultTextFeatures" + // The pipeline uses the default settings to featurize. + string defaultColumnName = "DefaultTextFeatures"; + var default_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", defaultColumnName); + + // Another pipeline, that customizes the advanced settings of the FeaturizeText transformer. + string customizedColumnName = "CustomizedTextFeatures"; + var customized_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, s => + { + s.KeepPunctuations = false; + s.KeepNumbers = false; + s.OutputTokens = true; + s.TextLanguage = TextFeaturizingEstimator.Language.English; // supports English, French, German, Dutch, Italian, Spanish, Japanese + }); + + // The transformed data for both pipelines. + var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); + var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData); + + // Small helper to print the text inside the columns, in the console. 
+ Action>> printHelper = (columnName, column) => + { + Console.WriteLine($"{columnName} column obtained post-transformation."); + foreach (var featureRow in column) + { + foreach (var value in featureRow.Values) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + Console.WriteLine("==================================================="); + }; + + // Preview of the DefaultTextFeatures column obtained after processing the input. + var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); + printHelper(defaultColumnName, defaultColumn); + + // DefaultTextFeatures column obtained post-transformation. + // + // 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.2357023 0.2357023 0.2357023 0.2357023 0.4714046 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.2357023 0.5773503 0.5773503 0.5773503 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.1924501 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.246183 0.246183 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.1230915 0 0 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.3692745 0.246183 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.246183 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.1230915 0.2886751 0 0 0 0 0 0 0 0.2886751 0.5773503 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 0.2886751 + + // Preview of the CustomizedTextFeatures column obtained after processing the input. + var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); + printHelper(customizedColumnName, customizedColumn); + + // CustomizedTextFeatures column obtained post-transformation. + // + // 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0.25 0.25 0.25 0.25 0.5 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.7071068 0.7071068 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 + // 0 0.125 0.125 0.125 0.125 0.25 0.25 0.25 0.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.125 0.125 0.125 0.125 0.125 0.125 0.375 0.25 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.25 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.2672612 0.5345225 0 0 0 0 0 0.2672612 0.5345225 0.2672612 0.2672612 0.2672612 0.2672612 } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 263060abeb..4c2f9385b9 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -13,6 +13,7 @@ + false diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index b8eed4095d..c2c9ef37a6 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. 
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
+using Microsoft.ML.Samples.Dynamic;
 
 namespace Microsoft.ML.Samples
@@ -9,7 +10,7 @@ internal static class Program
     {
         static void Main(string[] args)
         {
-            TrainerSamples.SDCA_BinaryClassification();
+            TransformSamples.KeyToValue_Term();
         }
     }
 }
diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs
new file mode 100644
index 0000000000..639fec5990
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Static/FastTree.cs
@@ -0,0 +1,72 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+ // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site.
+ using Microsoft.ML.Runtime.Data;
+ using Microsoft.ML.Trainers.FastTree;
+ using Microsoft.ML.StaticPipe;
+ using System;
+ using System.Linq;
+
+// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT.
+// If you change the existing content, check that the files referencing it in the XML documentation are still correct, as they reference it
+// line by line.
+namespace Microsoft.ML.Samples.Static
+{
+    public partial class TrainersSamples
+    {
+        public static void FastTreeRegression()
+        {
+            // Downloading a regression dataset from github.com/dotnet/machinelearning;
+            // this will create a housing.txt file in the filesystem where this code runs.
+            // You can open the file to see the data.
+            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
+
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
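+            // Note: the static samples removed in this change created their environment with a fixed seed
+            // (LocalEnvironment(seed: 0)). MLContext() below is not seeded, so components that rely on
+            // randomness may vary slightly between runs; pass a seed (e.g. new MLContext(seed: 0)) to make
+            // the sample deterministic.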
+            var mlContext = new MLContext();
+
+            // Creating a data reader, based on the format of the data
+            var reader = TextLoader.CreateReader(mlContext, c => (
+                    label: c.LoadFloat(0),
+                    features: c.LoadFloat(1, 6)
+                ),
+                separator: '\t', hasHeader: true);
+
+            // Read the data; the cross-validation below will split it into folds for training and testing.
+            var data = reader.Read(dataFile);
+
+            // The predictor that gets produced out of training
+            FastTreeRegressionPredictor pred = null;
+
+            // Create the estimator
+            var learningPipeline = reader.MakeNewEstimator()
+                .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree(
+                    r.label,
+                    r.features,
+                    numTrees: 100, // try: (int) 20-2000
+                    numLeaves: 20, // try: (int) 2-128
+                    minDatapointsInLeafs: 10, // try: (int) 1-100
+                    learningRate: 0.2, // try: (float) 0.025-0.4
+                    onFit: p => pred = p)
+                )
+            );
+
+            var cvResults = mlContext.Regression.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5);
+            var averagedMetrics = (
+                L1: cvResults.Select(r => r.metrics.L1).Average(),
+                L2: cvResults.Select(r => r.metrics.L2).Average(),
+                LossFn: cvResults.Select(r => r.metrics.LossFn).Average(),
+                Rms: cvResults.Select(r => r.metrics.Rms).Average(),
+                RSquared: cvResults.Select(r => r.metrics.RSquared).Average()
+            );
+            Console.WriteLine($"L1 - {averagedMetrics.L1}"); // 3.091095
+            Console.WriteLine($"L2 - {averagedMetrics.L2}"); // 20.351073
+            Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); // 20.351074
+            Console.WriteLine($"RMS - {averagedMetrics.Rms}"); // 4.478358
+            Console.WriteLine($"RSquared - {averagedMetrics.RSquared}"); // 0.754977
+        }
+
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs
new file mode 100644
index 0000000000..c9548b03c5
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBM.cs
@@ -0,0 +1,76 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+ // the alignment of the usings with the methods is intentional so they can display on the same level in the docs site.
+ using Microsoft.ML.Runtime.Data;
+ using Microsoft.ML.Runtime.LightGBM;
+ using Microsoft.ML.StaticPipe;
+ using System;
+
+// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT.
+// If you change the existing content, check that the files referencing it in the XML documentation are still correct, as they reference it
+// line by line.
+namespace Microsoft.ML.Samples.Static
+{
+    public partial class TrainersSamples
+    {
+        public static void LightGbmRegression()
+        {
+            // Downloading a regression dataset from github.com/dotnet/machinelearning;
+            // this will create a housing.txt file in the filesystem where this code runs.
+            // You can open the file to see the data.
+            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();
+
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Creating a data reader, based on the format of the data
+            var reader = TextLoader.CreateReader(mlContext, c => (
+                    label: c.LoadFloat(0),
+                    features: c.LoadFloat(1, 6)
+                ),
+                separator: '\t', hasHeader: true);
+
+            // Read the data, and leave 10% out, so we can use them for testing
+            var data = reader.Read(new MultiFileSource(dataFile));
+            var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);
+
+            // The predictor that gets produced out of training
+            LightGbmRegressionPredictor pred = null;
+
+            // Create the estimator
+            var learningPipeline = reader.MakeNewEstimator()
+                .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm(
+                    r.label,
+                    r.features,
+                    numLeaves: 4,
+                    minDataPerLeaf: 6,
+                    learningRate: 0.001,
+                    onFit: p => pred = p)
+                )
+            );
+
+            // Fit this pipeline to the training data
+            var model = learningPipeline.Fit(trainData);
+
+            // Check the weights that the model learned
+            VBuffer<float> weights = default;
+            pred.GetFeatureWeights(ref weights);
+
+            Console.WriteLine($"weight 0 - {weights.Values[0]}");
+            Console.WriteLine($"weight 1 - {weights.Values[1]}");
+
+            // Evaluate how the model is doing on the test data
+            var dataWithPredictions = model.Transform(testData);
+            var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score);
+
+            Console.WriteLine($"L1 - {metrics.L1}"); // 4.9669731
+            Console.WriteLine($"L2 - {metrics.L2}"); // 51.37296
+            Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 51.37296
+            Console.WriteLine($"RMS - {metrics.Rms}"); // 7.167493
+            Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.079478
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs
new file mode 100644
index 0000000000..f4ac4e9732
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Static/SDCA.cs
@@ -0,0 +1,74 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site.
+ using Microsoft.ML.Runtime.Data;
+ using Microsoft.ML.Runtime.Learners;
+ using Microsoft.ML.StaticPipe;
+ using System;
+
+// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT.
+// If you change the existing content, check that the files referencing it in the XML documentation are still correct, as they reference it
+// line by line.
+namespace Microsoft.ML.Samples.Static
+{
+    public static class Trainers
+    {
+        public static void SdcaRegression()
+        {
+            // Downloading a regression dataset from github.com/dotnet/machinelearning;
+            // this will create a housing.txt file in the filesystem where this code runs.
+            // You can open the file to see the data.
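+            // The downloaded file is tab-separated with a header row: column 0 holds the label and
+            // columns 1-6 hold the numeric features, which is the layout the reader created below expects.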
+ string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); + + // Creating the ML.Net IHostEnvironment object, needed for the pipeline + var mlContext = new MLContext(); + + // Creating a data reader, based on the format of the data + var reader = TextLoader.CreateReader(mlContext, c => ( + label: c.LoadFloat(0), + features: c.LoadFloat(1, 6) + ), + separator: '\t', hasHeader: true); + + // Read the data, and leave 10% out, so we can use them for testing + var data = reader.Read(dataFile); + var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1); + + // The predictor that gets produced out of training + LinearRegressionPredictor pred = null; + + // Create the estimator + var learningPipeline = reader.MakeNewEstimator() + .Append(r => (r.label, score: mlContext.Regression.Trainers.Sdca( + r.label, + r.features, + l1Threshold: 0f, + maxIterations: 100, + onFit: p => pred = p) + ) + ); + + // Fit this pipeline to the training data + var model = learningPipeline.Fit(trainData); + + // Check the weights that the model learned + VBuffer weights = default; + pred.GetFeatureWeights(ref weights); + + Console.WriteLine($"weight 0 - {weights.Values[0]}"); + Console.WriteLine($"weight 1 - {weights.Values[1]}"); + + // Evaluate how the model is doing on the test data + var dataWithPredictions = model.Transform(testData); + var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score); + + Console.WriteLine($"L1 - {metrics.L1}"); // 3.7226085 + Console.WriteLine($"L2 - {metrics.L2}"); // 24.250636 + Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 24.25063 + Console.WriteLine($"RMS - {metrics.Rms}"); // 4.924493 + Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.565467 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Static/Trainers.cs b/docs/samples/Microsoft.ML.Samples/Static/Trainers.cs deleted file mode 100644 index fa89901b78..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/Trainers.cs +++ /dev/null @@ -1,195 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. - using Microsoft.ML.Runtime.Data; - using Microsoft.ML.Runtime.Learners; - using Microsoft.ML.Runtime.LightGBM; - using Microsoft.ML.Trainers.FastTree; - using Microsoft.ML.StaticPipe; - using System; - using System.Linq; - -// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT. -// If you change the existinc content, check that the files referencing it in the XML documentation are still correct, as they reference -// line by line. -namespace Microsoft.ML.Samples.Static -{ - public static class Trainers - { - - public static void SdcaRegression() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem this code will run - // you can open the file to see the data. - string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. 
- var regressionContext = new RegressionContext(env); - - // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Read the data, and leave 10% out, so we can use them for testing - var data = reader.Read(dataFile); - var (trainData, testData) = regressionContext.TrainTestSplit(data, testFraction: 0.1); - - // The predictor that gets produced out of training - LinearRegressionPredictor pred = null; - - // Create the estimator - var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.Sdca( - r.label, - r.features, - l1Threshold: 0f, - maxIterations: 100, - onFit: p => pred = p) - ) - ); - - // Fit this pipeline to the training data - var model = learningPipeline.Fit(trainData); - - // Check the weights that the model learned - VBuffer weights = default; - pred.GetFeatureWeights(ref weights); - - Console.WriteLine($"weight 0 - {weights.Values[0]}"); - Console.WriteLine($"weight 1 - {weights.Values[1]}"); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - var metrics = regressionContext.Evaluate(dataWithPredictions, r => r.label, r => r.score); - - Console.WriteLine($"L1 - {metrics.L1}"); // 3.7226085 - Console.WriteLine($"L2 - {metrics.L2}"); // 24.250636 - Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 24.25063 - Console.WriteLine($"RMS - {metrics.Rms}"); // 4.924493 - Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.565467 - } - - public static void FastTreeRegression() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem this code will run - // you can open the file to see the data. - string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. 
- var regressionContext = new RegressionContext(env); - - // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Read the data, and leave 10% out, so we can use them for testing - var data = reader.Read(new MultiFileSource(dataFile)); - - // The predictor that gets produced out of training - FastTreeRegressionPredictor pred = null; - - // Create the estimator - var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.FastTree( - r.label, - r.features, - numTrees: 100, // try: (int) 20-2000 - numLeaves: 20, // try: (int) 2-128 - minDatapointsInLeafs: 10, // try: (int) 1-100 - learningRate: 0.2, // try: (float) 0.025-0.4 - onFit: p => pred = p) - ) - ); - - var cvResults = regressionContext.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5); - var averagedMetrics = ( - L1: cvResults.Select(r => r.metrics.L1).Average(), - L2: cvResults.Select(r => r.metrics.L2).Average(), - LossFn: cvResults.Select(r => r.metrics.LossFn).Average(), - Rms: cvResults.Select(r => r.metrics.Rms).Average(), - RSquared: cvResults.Select(r => r.metrics.RSquared).Average() - ); - Console.WriteLine($"L1 - {averagedMetrics.L1}"); - Console.WriteLine($"L2 - {averagedMetrics.L2}"); - Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); - Console.WriteLine($"RMS - {averagedMetrics.Rms}"); - Console.WriteLine($"RSquared - {averagedMetrics.RSquared}"); - } - - public static void LightGbmRegression() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem this code will run - // you can open the file to see the data. - string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var env = new LocalEnvironment(seed: 0); - - // Creating the ML context, based on the task performed. 
- var regressionContext = new RegressionContext(env); - - // Creating a data reader, based on the format of the data - var reader = TextLoader.CreateReader(env, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Read the data, and leave 10% out, so we can use them for testing - var data = reader.Read(new MultiFileSource(dataFile)); - var (trainData, testData) = regressionContext.TrainTestSplit(data, testFraction: 0.1); - - // The predictor that gets produced out of training - LightGbmRegressionPredictor pred = null; - - // Create the estimator - var learningPipeline = reader.MakeNewEstimator() - .Append(r => (r.label, score: regressionContext.Trainers.LightGbm( - r.label, - r.features, - numLeaves: 4, - minDataPerLeaf: 6, - learningRate: 0.001, - onFit: p => pred = p) - ) - ); - - // Fit this pipeline to the training data - var model = learningPipeline.Fit(trainData); - - // Check the weights that the model learned - VBuffer weights = default; - pred.GetFeatureWeights(ref weights); - - Console.WriteLine($"weight 0 - {weights.Values[0]}"); - Console.WriteLine($"weight 1 - {weights.Values[1]}"); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - var metrics = regressionContext.Evaluate(dataWithPredictions, r => r.label, r => r.score); - - Console.WriteLine($"L1 - {metrics.L1}"); - Console.WriteLine($"L2 - {metrics.L2}"); - Console.WriteLine($"LossFunction - {metrics.LossFn}"); - Console.WriteLine($"RMS - {metrics.Rms}"); - Console.WriteLine($"RSquared - {metrics.RSquared}"); - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/Transformers.cs b/docs/samples/Microsoft.ML.Samples/Static/Transformers.cs deleted file mode 100644 index 95ad9abe00..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/Transformers.cs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -// the alignment of the usings with the methods is intentional so they can display on the same level in the docs site. - using Microsoft.ML.Runtime.Api; - using Microsoft.ML.Runtime.Data; - using Microsoft.ML.StaticPipe; - using Microsoft.ML.Transforms; - using System; - using System.Collections.Generic; - -// NOTE: WHEN ADDING TO THE FILE, ALWAYS APPEND TO THE END OF IT. -// If you change the existinc content, check that the files referencing it in the XML documentation are still correct, as they reference -// line by line. -namespace Microsoft.ML.Samples.Static -{ - public static class Transformers - { - - /// - /// The example for the statically typed concat estimator. - /// - public static void ConcatEstimator() - { - // Create a new environment for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var env = new LocalEnvironment(); - - IEnumerable data = SamplesUtils.DatasetUtils.GetInputData(); - - // A preview of InputData: - // feature_0; feature_1; feature_2; feature_3; target - // -2.75; 0.77; -0.61; 0.14; 140.66 - // -0.61; -0.37; -0.12; 0.55; 148.12 - // -0.85; -0.91; 1.81; 0.02; 402.20 - - // Convert to an DataView. - var trainingData = env.CreateStreamingDataView(data); - - // Convert the IDataView to statically-typed data view, so its schema can be used on the - // pipelines that will get built in top of it. 
- var staticData = trainingData.AssertStatic(env, c => ( - Feature0: c.R4.Scalar, - Feature1: c.R4.Scalar, - Feature2: c.R4.Scalar, - Feature3: c.R4.Scalar, - Target: c.R4.Scalar)); - - // Start creating our processing pipeline. - // Let just concatenate all the float columns together into one using ConcatWith. - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => ( - r.Target, - Features: r.Feature0.ConcatWith(r.Feature1, r.Feature2, r.Feature3))); - - // Transform the data through the above pipeline. - var transformedData = staticLearningPipeline.Fit(staticData).Transform(staticData); - - // The transformedData DataView is now of the type (Target:Scalar, Features:Vector). - - // Features target - // -2.75 0.77 -0.61 0.14; 140.66 - // -0.61 -0.37 -0.12 0.55; 148.12 - // -0.85 -0.91 1.81 0.02; 402.20 - - // Let's print out the new data. - var features = transformedData.GetColumn(r => r.Features); - - Console.WriteLine("Features column obtained post-transformation."); - foreach (var featureRow in features) - { - Console.WriteLine($"{featureRow[0]} {featureRow[1]} {featureRow[2]} {featureRow[3]}"); - } - } - } -} diff --git a/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs index 9aa0a57896..7cf61df705 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs @@ -152,13 +152,6 @@ public static NormVector ConcatWith(this NormVector me, params NormVect /// The first input column. /// Subsequent input columns. /// The result of concatenating all input columns together. - /// - /// - /// - /// - /// public static Vector ConcatWith(this Scalar me, params ScalarOrVector[] others) => new Impl(Join(me, others)); diff --git a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs index 694f41d12b..2ca4f54da8 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizerCatalog.cs @@ -20,6 +20,20 @@ public static class NormalizerCatalog /// The column name /// The column name /// The used to map the old values in the new scale. + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// public static NormalizingEstimator Normalize(this TransformsCatalog catalog, string inputName, string outputName = null, NormalizingEstimator.NormalizerMode mode = NormalizingEstimator.NormalizerMode.MinMax) => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), inputName, outputName, mode); @@ -29,6 +43,20 @@ public static NormalizingEstimator Normalize(this TransformsCatalog catalog, str /// The transform catalog /// The used to map the old values to the new ones. /// The pairs of input and output columns. 
+ /// + /// + /// + /// + /// + /// + /// + /// + /// + /// public static NormalizingEstimator Normalize(this TransformsCatalog catalog, NormalizingEstimator.NormalizerMode mode, params (string input, string output)[] columns) => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), mode, columns); diff --git a/src/Microsoft.ML.FastTree/TreeTrainersStatic.cs b/src/Microsoft.ML.FastTree/TreeTrainersStatic.cs index f3008957f4..f15f94d7f9 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersStatic.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersStatic.cs @@ -38,7 +38,7 @@ public static class TreeRegressionExtensions /// /// /// /// public static Scalar FastTree(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs b/src/Microsoft.ML.LightGBM/LightGbmStatic.cs index 445a1ad6ad..8eda73d38c 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmStatic.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmStatic.cs @@ -38,7 +38,7 @@ public static class LightGbmTrainers /// /// /// /// public static Scalar LightGbm(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index c7f60b16ac..02c5945ea0 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -32,6 +32,10 @@ private static string Download(string baseGitPath, string dataFile) return dataFile; } + /// + /// A simple set of features that help generate the Target column, according to a function. + /// Used for the transformers/estimators working on numeric data. + /// public class SampleInput { public float Feature0 { get; set; } @@ -41,6 +45,9 @@ public class SampleInput public float Target { get; set; } } + /// + /// Returns a sample of a numeric dataset. + /// public static IEnumerable GetInputData() { var data = new List(); @@ -50,5 +57,88 @@ public static IEnumerable GetInputData() return data; } + + /// + /// A dataset that contains a tweet and the sentiment assigned to that tweet: 0 - negative and 1 - positive sentiment. + /// + public class SampleSentimentData + { + public bool Sentiment { get; set; } + public string SentimentText { get; set; } + } + + /// + /// Returns a sample of the sentiment dataset. + /// + public static IEnumerable GetSentimentData() + { + var data = new List(); + data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Best game I've ever played." }); + data.Add(new SampleSentimentData { Sentiment = false, SentimentText = "==RUDE== Dude, 2" }); + data.Add(new SampleSentimentData { Sentiment = true, SentimentText = "Until the next game, this is the best Xbox game!" }); + + return data; + } + + /// + /// A dataset that contains one column with two set of keys assigned to a body of text: Review and ReviewReverse. + /// The dataset will be used to classify how accurately the keys are assigned to the text. + /// + public class SampleTopicsData + { + public string Review { get; set; } + public string ReviewReverse { get; set; } + public bool Label { get; set; } + } + + /// + /// Returns a sample of the topics dataset. 
+ /// + public static IEnumerable GetTopicsData() + { + var data = new List(); + data.Add(new SampleTopicsData { Review = "animals birds cats dogs fish horse", ReviewReverse = "radiation galaxy universe duck", Label = true }); + data.Add(new SampleTopicsData { Review = "horse birds house fish duck cats", ReviewReverse = "space galaxy universe radiation", Label = false }); + data.Add(new SampleTopicsData { Review = "car truck driver bus pickup", ReviewReverse = "bus pickup", Label = true}); + data.Add(new SampleTopicsData { Review = "car truck driver bus pickup horse", ReviewReverse = "car truck", Label = false }); + + return data; + } + + /// + /// Represents the column of the infertility dataset. + /// + public class SampleInfertData + { + public int RowNum { get; set; } + public string Education { get; set; } + public float Age { get; set; } + public float Parity { get; set; } + public float Induced { get; set; } + public float Case { get; set; } + + public float Spontaneous { get; set; } + public float Stratum { get; set; } + public float PooledStratum { get; set; } + } + + /// + /// Returns a few rows of the infertility dataset. + /// + public static IEnumerable GetInfertData() + { + var data = new List(); + data.Add(new SampleInfertData { + RowNum = 0, Education = "0-5yrs", Age = 26, Parity = 6, Induced = 1, Case = 1, Spontaneous = 2, Stratum = 1, PooledStratum = 3 }); + data.Add(new SampleInfertData { + RowNum = 1, Education = "0-5yrs", Age = 42, Parity = 1, Induced = 1, Case = 1, Spontaneous = 0, Stratum = 2, PooledStratum = 1 }); + data.Add(new SampleInfertData { + RowNum = 2, Education = "0-5yrs", Age = 39, Parity = 6, Induced = 2, Case = 1, Spontaneous = 0, Stratum = 3, PooledStratum = 4 }); + data.Add(new SampleInfertData { + RowNum = 3, Education = "0-5yrs", Age = 34, Parity = 4, Induced = 2, Case = 1, Spontaneous = 0, Stratum = 4, PooledStratum = 2 }); + data.Add(new SampleInfertData { + RowNum = 4, Education = "6-11yrs", Age = 35, Parity = 3, Induced = 1, Case = 1, Spontaneous = 1, Stratum = 5, PooledStratum = 32 }); + return data; + } } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs index 6e3ebe5fa7..7d3e1025b9 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaCatalog.cs @@ -64,7 +64,7 @@ public static class SdcaBinaryClassificationExtensions /// /// /// /// /// diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs index b803c74d36..2313bcd180 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaStatic.cs @@ -41,7 +41,7 @@ public static class SdcaExtensions /// /// /// /// public static Scalar Sdca(this RegressionContext.RegressionTrainers ctx, diff --git a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs index 1a1c2af4b1..c025de5fe6 100644 --- a/src/Microsoft.ML.Transforms/TextTransformCatalog.cs +++ b/src/Microsoft.ML.Transforms/TextTransformCatalog.cs @@ -19,6 +19,20 @@ public static class TextTransformCatalog /// The input column /// The output column /// Advanced transform settings + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog, string inputColumn, string outputColumn = null, Action advancedSettings = 
null) diff --git a/test/data/README.md b/test/data/README.md index 61cded9255..6a21ece35f 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -73,6 +73,21 @@ Redistributing the dataset "[housing.txt](housing.txt)" with attribution: More information: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names +### Air Quality + +This dataset is from the R documentation: [New York Air Quality Measurements]https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/airquality.html +The data were obtained from the New York State Department of Conservation (ozone data) and the National Weather Service (meteorological data). +References: Chambers, J. M., Cleveland, W. S., Kleiner, B. and Tukey, P. A. (1983) Graphical Methods for Data Analysis. Belmont, CA: Wadsworth. + +The dataset is distributed under [GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) + +### Infertility + +This dataset is from the R documentation: [Infertility after Spontaneous and Induced Abortion]https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/infert.html +Original source: Trichopoulos et al (1976) Br. J. of Obst. and Gynaec. 83, 645–650. + +The dataset is distributed under [GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) + # Images ### Located in `images` folder @@ -81,4 +96,4 @@ More information: https://archive.ics.uci.edu/ml/machine-learning-databases/hous > > "[Hot dog with mustard](https://visualsonline.cancer.gov/details.cfm?imageid=2669)" by Renee Comet is in the public domain - this image was released by the [National Cancer Institute](https://visualsonline.cancer.gov/details.cfm?imageid=2669) > -> "[Bright red tomato and cross section02](https://upload.wikimedia.org/wikipedia/commons/8/88/Bright_red_tomato_and_cross_section02.jpg)" by [fir0002](https://en.wikipedia.org/wiki/User:Fir0002) is licensed under the [CC BY-NC](https://creativecommons.org/licenses/by/2.0/) +> "[Bright red tomato and cross section02](https://upload.wikimedia.org/wikipedia/commons/8/88/Bright_red_tomato_and_cross_section02.jpg)" by [fir0002](https://en.wikipedia.org/wiki/User:Fir0002) is licensed under the [CC BY-NC](https://creativecommons.org/licenses/by/2.0/) \ No newline at end of file diff --git a/test/data/gplv2/COPYING.txt b/test/data/gplv2/COPYING.txt new file mode 100644 index 0000000000..c4aa571b70 --- /dev/null +++ b/test/data/gplv2/COPYING.txt @@ -0,0 +1,345 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
\ No newline at end of file diff --git a/test/data/gplv2/airquality.csv b/test/data/gplv2/airquality.csv new file mode 100644 index 0000000000..5448cc6a05 --- /dev/null +++ b/test/data/gplv2/airquality.csv @@ -0,0 +1,154 @@ +"","Ozone","Solar_R","Wind","Temp","Month","Day" +"1",41,190,7.4,67,5,1 +"2",36,118,8,72,5,2 +"3",12,149,12.6,74,5,3 +"4",18,313,11.5,62,5,4 +"5",NA,NA,14.3,56,5,5 +"6",28,NA,14.9,66,5,6 +"7",23,299,8.6,65,5,7 +"8",19,99,13.8,59,5,8 +"9",8,19,20.1,61,5,9 +"10",NA,194,8.6,69,5,10 +"11",7,NA,6.9,74,5,11 +"12",16,256,9.7,69,5,12 +"13",11,290,9.2,66,5,13 +"14",14,274,10.9,68,5,14 +"15",18,65,13.2,58,5,15 +"16",14,334,11.5,64,5,16 +"17",34,307,12,66,5,17 +"18",6,78,18.4,57,5,18 +"19",30,322,11.5,68,5,19 +"20",11,44,9.7,62,5,20 +"21",1,8,9.7,59,5,21 +"22",11,320,16.6,73,5,22 +"23",4,25,9.7,61,5,23 +"24",32,92,12,61,5,24 +"25",NA,66,16.6,57,5,25 +"26",NA,266,14.9,58,5,26 +"27",NA,NA,8,57,5,27 +"28",23,13,12,67,5,28 +"29",45,252,14.9,81,5,29 +"30",115,223,5.7,79,5,30 +"31",37,279,7.4,76,5,31 +"32",NA,286,8.6,78,6,1 +"33",NA,287,9.7,74,6,2 +"34",NA,242,16.1,67,6,3 +"35",NA,186,9.2,84,6,4 +"36",NA,220,8.6,85,6,5 +"37",NA,264,14.3,79,6,6 +"38",29,127,9.7,82,6,7 +"39",NA,273,6.9,87,6,8 +"40",71,291,13.8,90,6,9 +"41",39,323,11.5,87,6,10 +"42",NA,259,10.9,93,6,11 +"43",NA,250,9.2,92,6,12 +"44",23,148,8,82,6,13 +"45",NA,332,13.8,80,6,14 +"46",NA,322,11.5,79,6,15 +"47",21,191,14.9,77,6,16 +"48",37,284,20.7,72,6,17 +"49",20,37,9.2,65,6,18 +"50",12,120,11.5,73,6,19 +"51",13,137,10.3,76,6,20 +"52",NA,150,6.3,77,6,21 +"53",NA,59,1.7,76,6,22 +"54",NA,91,4.6,76,6,23 +"55",NA,250,6.3,76,6,24 +"56",NA,135,8,75,6,25 +"57",NA,127,8,78,6,26 +"58",NA,47,10.3,73,6,27 +"59",NA,98,11.5,80,6,28 +"60",NA,31,14.9,77,6,29 +"61",NA,138,8,83,6,30 +"62",135,269,4.1,84,7,1 +"63",49,248,9.2,85,7,2 +"64",32,236,9.2,81,7,3 +"65",NA,101,10.9,84,7,4 +"66",64,175,4.6,83,7,5 +"67",40,314,10.9,83,7,6 +"68",77,276,5.1,88,7,7 +"69",97,267,6.3,92,7,8 +"70",97,272,5.7,92,7,9 +"71",85,175,7.4,89,7,10 +"72",NA,139,8.6,82,7,11 +"73",10,264,14.3,73,7,12 +"74",27,175,14.9,81,7,13 +"75",NA,291,14.9,91,7,14 +"76",7,48,14.3,80,7,15 +"77",48,260,6.9,81,7,16 +"78",35,274,10.3,82,7,17 +"79",61,285,6.3,84,7,18 +"80",79,187,5.1,87,7,19 +"81",63,220,11.5,85,7,20 +"82",16,7,6.9,74,7,21 +"83",NA,258,9.7,81,7,22 +"84",NA,295,11.5,82,7,23 +"85",80,294,8.6,86,7,24 +"86",108,223,8,85,7,25 +"87",20,81,8.6,82,7,26 +"88",52,82,12,86,7,27 +"89",82,213,7.4,88,7,28 +"90",50,275,7.4,86,7,29 +"91",64,253,7.4,83,7,30 +"92",59,254,9.2,81,7,31 +"93",39,83,6.9,81,8,1 +"94",9,24,13.8,81,8,2 +"95",16,77,7.4,82,8,3 +"96",78,NA,6.9,86,8,4 +"97",35,NA,7.4,85,8,5 +"98",66,NA,4.6,87,8,6 +"99",122,255,4,89,8,7 +"100",89,229,10.3,90,8,8 +"101",110,207,8,90,8,9 +"102",NA,222,8.6,92,8,10 +"103",NA,137,11.5,86,8,11 +"104",44,192,11.5,86,8,12 +"105",28,273,11.5,82,8,13 +"106",65,157,9.7,80,8,14 +"107",NA,64,11.5,79,8,15 +"108",22,71,10.3,77,8,16 +"109",59,51,6.3,79,8,17 +"110",23,115,7.4,76,8,18 +"111",31,244,10.9,78,8,19 +"112",44,190,10.3,78,8,20 +"113",21,259,15.5,77,8,21 +"114",9,36,14.3,72,8,22 +"115",NA,255,12.6,75,8,23 +"116",45,212,9.7,79,8,24 +"117",168,238,3.4,81,8,25 +"118",73,215,8,86,8,26 +"119",NA,153,5.7,88,8,27 +"120",76,203,9.7,97,8,28 +"121",118,225,2.3,94,8,29 +"122",84,237,6.3,96,8,30 +"123",85,188,6.3,94,8,31 +"124",96,167,6.9,91,9,1 +"125",78,197,5.1,92,9,2 +"126",73,183,2.8,93,9,3 +"127",91,189,4.6,93,9,4 +"128",47,95,7.4,87,9,5 +"129",32,92,15.5,84,9,6 +"130",20,252,10.9,80,9,7 +"131",23,220,10.3,78,9,8 +"132",21,230,10.9,75,9,9 
+"133",24,259,9.7,73,9,10 +"134",44,236,14.9,81,9,11 +"135",21,259,15.5,76,9,12 +"136",28,238,6.3,77,9,13 +"137",9,24,10.9,71,9,14 +"138",13,112,11.5,71,9,15 +"139",46,237,6.9,78,9,16 +"140",18,224,13.8,67,9,17 +"141",13,27,10.3,76,9,18 +"142",24,238,10.3,68,9,19 +"143",16,201,8,82,9,20 +"144",13,238,12.6,64,9,21 +"145",23,14,9.2,71,9,22 +"146",36,139,10.3,81,9,23 +"147",7,49,10.3,69,9,24 +"148",14,20,16.6,63,9,25 +"149",30,193,6.9,70,9,26 +"150",NA,145,13.2,77,9,27 +"151",14,191,14.3,75,9,28 +"152",18,131,8,76,9,29 +"153",20,223,11.5,68,9,30 \ No newline at end of file diff --git a/test/data/gplv2/infert.csv b/test/data/gplv2/infert.csv new file mode 100644 index 0000000000..4f4fc2ec51 --- /dev/null +++ b/test/data/gplv2/infert.csv @@ -0,0 +1,249 @@ +"row_num","education","age","parity","induced","case","spontaneous","stratum","pooled.stratum" +"1","0-5yrs",26,6,1,1,2,1,3 +"2","0-5yrs",42,1,1,1,0,2,1 +"3","0-5yrs",39,6,2,1,0,3,4 +"4","0-5yrs",34,4,2,1,0,4,2 +"5","6-11yrs",35,3,1,1,1,5,32 +"6","6-11yrs",36,4,2,1,1,6,36 +"7","6-11yrs",23,1,0,1,0,7,6 +"8","6-11yrs",32,2,0,1,0,8,22 +"9","6-11yrs",21,1,0,1,1,9,5 +"10","6-11yrs",28,2,0,1,0,10,19 +"11","6-11yrs",29,2,1,1,0,11,20 +"12","6-11yrs",37,4,2,1,1,12,37 +"13","6-11yrs",31,1,1,1,0,13,9 +"14","6-11yrs",29,3,2,1,0,14,29 +"15","6-11yrs",31,2,1,1,1,15,21 +"16","6-11yrs",27,2,2,1,0,16,18 +"17","6-11yrs",30,5,2,1,1,17,38 +"18","6-11yrs",26,1,0,1,1,18,7 +"19","6-11yrs",25,3,2,1,1,19,28 +"20","6-11yrs",44,1,0,1,1,20,17 +"21","6-11yrs",40,1,0,1,1,21,14 +"22","6-11yrs",35,2,2,1,0,22,24 +"23","6-11yrs",28,2,0,1,2,23,19 +"24","6-11yrs",36,1,0,1,1,24,12 +"25","6-11yrs",27,2,1,1,1,25,18 +"26","6-11yrs",40,2,0,1,2,26,27 +"27","6-11yrs",38,2,0,1,2,27,26 +"28","6-11yrs",34,3,0,1,2,28,31 +"29","6-11yrs",28,4,1,1,2,29,34 +"30","6-11yrs",30,4,2,1,0,30,35 +"31","6-11yrs",32,1,0,1,1,31,10 +"32","6-11yrs",34,2,1,1,0,32,23 +"33","6-11yrs",42,1,1,1,0,33,16 +"34","6-11yrs",32,2,0,1,2,34,22 +"35","6-11yrs",39,1,1,1,0,35,13 +"36","6-11yrs",35,2,0,1,2,36,24 +"37","6-11yrs",36,1,0,1,1,37,12 +"38","6-11yrs",34,3,1,1,2,38,31 +"39","6-11yrs",30,3,0,1,0,39,30 +"40","6-11yrs",28,1,0,1,1,40,8 +"41","6-11yrs",39,3,0,1,2,41,33 +"42","6-11yrs",35,1,0,1,0,42,11 +"43","6-11yrs",41,1,0,1,0,43,15 +"44","6-11yrs",37,2,1,1,1,44,25 +"45","12+ yrs",30,1,0,1,0,45,44 +"46","12+ yrs",37,1,1,1,0,46,48 +"47","12+ yrs",28,2,0,1,2,47,51 +"48","12+ yrs",27,4,2,1,0,48,61 +"49","12+ yrs",26,2,2,1,0,49,49 +"50","12+ yrs",38,3,0,1,2,50,60 +"51","12+ yrs",24,3,1,1,2,51,56 +"52","12+ yrs",36,5,1,1,2,52,62 +"53","12+ yrs",27,3,1,1,1,53,57 +"54","12+ yrs",28,1,0,1,1,54,42 +"55","12+ yrs",29,2,0,1,2,55,52 +"56","12+ yrs",36,2,0,1,2,56,55 +"57","12+ yrs",28,2,1,1,0,57,51 +"58","12+ yrs",28,2,0,1,2,58,51 +"59","12+ yrs",28,1,0,1,1,59,42 +"60","12+ yrs",27,2,0,1,2,60,50 +"61","12+ yrs",35,2,0,1,2,61,54 +"62","12+ yrs",25,1,0,1,1,62,41 +"63","12+ yrs",34,1,0,1,1,63,47 +"64","12+ yrs",31,2,0,1,2,64,53 +"65","12+ yrs",26,2,1,1,0,65,49 +"66","12+ yrs",32,1,0,1,1,66,46 +"67","12+ yrs",21,1,0,1,1,67,39 +"68","12+ yrs",28,3,1,1,2,68,58 +"69","12+ yrs",37,3,0,1,2,69,59 +"70","12+ yrs",25,1,1,1,0,70,41 +"71","12+ yrs",32,1,1,1,0,71,46 +"72","12+ yrs",25,1,0,1,1,72,41 +"73","12+ yrs",31,1,0,1,1,73,45 +"74","12+ yrs",38,6,0,1,2,74,63 +"75","12+ yrs",26,2,0,1,2,75,49 +"76","12+ yrs",31,1,0,1,1,76,45 +"77","12+ yrs",31,2,0,1,1,77,53 +"78","12+ yrs",25,1,1,1,0,78,41 +"79","12+ yrs",31,1,0,1,1,79,45 +"80","12+ yrs",34,1,0,1,1,80,47 +"81","12+ yrs",35,2,2,1,0,81,54 +"82","12+ yrs",29,1,0,1,1,82,43 +"83","12+ 
yrs",23,1,0,1,1,83,40 +"84","0-5yrs",26,6,2,0,0,1,3 +"85","0-5yrs",42,1,0,0,0,2,1 +"86","0-5yrs",39,6,2,0,0,3,4 +"87","0-5yrs",34,4,0,0,1,4,2 +"88","6-11yrs",35,3,2,0,0,5,32 +"89","6-11yrs",36,4,1,0,1,6,36 +"90","6-11yrs",23,1,0,0,0,7,6 +"91","6-11yrs",32,2,2,0,0,8,22 +"92","6-11yrs",21,1,0,0,1,9,5 +"93","6-11yrs",28,2,0,0,1,10,19 +"94","6-11yrs",29,2,0,0,0,11,20 +"95","6-11yrs",37,4,1,0,1,12,37 +"96","6-11yrs",31,1,0,0,0,13,9 +"97","6-11yrs",29,3,0,0,1,14,29 +"98","6-11yrs",31,2,1,0,0,15,21 +"99","6-11yrs",27,2,1,0,0,16,18 +"100","6-11yrs",30,5,0,0,2,17,38 +"101","6-11yrs",26,1,0,0,0,18,7 +"102","6-11yrs",25,3,0,0,1,19,28 +"103","6-11yrs",44,1,0,0,0,20,17 +"104","6-11yrs",40,1,0,0,0,21,14 +"105","6-11yrs",35,2,0,0,0,22,24 +"106","6-11yrs",28,2,0,0,0,23,19 +"107","6-11yrs",36,1,0,0,0,24,12 +"108","6-11yrs",27,2,0,0,1,25,18 +"109","6-11yrs",40,2,0,0,0,26,27 +"110","6-11yrs",38,2,0,0,0,27,26 +"111","6-11yrs",34,3,0,0,0,28,31 +"112","6-11yrs",28,4,0,0,2,29,34 +"113","6-11yrs",30,4,1,0,1,30,35 +"114","6-11yrs",32,1,0,0,0,31,10 +"115","6-11yrs",34,2,1,0,0,32,23 +"116","6-11yrs",42,1,1,0,0,33,16 +"117","6-11yrs",32,2,0,0,0,34,22 +"118","6-11yrs",39,1,0,0,0,35,13 +"119","6-11yrs",35,2,0,0,0,36,24 +"120","6-11yrs",36,1,0,0,0,37,12 +"121","6-11yrs",34,3,2,0,0,38,31 +"122","6-11yrs",30,3,0,0,2,39,30 +"123","6-11yrs",28,1,1,0,0,40,8 +"124","6-11yrs",39,3,1,0,0,41,33 +"125","6-11yrs",35,1,0,0,0,42,11 +"126","6-11yrs",41,1,0,0,0,43,15 +"127","6-11yrs",37,2,0,0,0,44,25 +"128","12+ yrs",30,1,1,0,0,45,44 +"129","12+ yrs",37,1,0,0,0,46,48 +"130","12+ yrs",28,2,1,0,0,47,51 +"131","12+ yrs",27,4,2,0,1,48,61 +"132","12+ yrs",26,2,1,0,0,49,49 +"133","12+ yrs",38,3,1,0,0,50,60 +"134","12+ yrs",24,3,2,0,1,51,56 +"135","12+ yrs",36,5,1,0,1,52,62 +"136","12+ yrs",27,3,1,0,1,53,57 +"137","12+ yrs",28,1,1,0,0,54,42 +"138","12+ yrs",29,2,1,0,0,55,52 +"139","12+ yrs",36,2,1,0,0,56,55 +"140","12+ yrs",28,2,1,0,1,57,51 +"141","12+ yrs",28,2,2,0,0,58,51 +"142","12+ yrs",28,1,1,0,0,59,42 +"143","12+ yrs",27,2,1,0,0,60,50 +"144","12+ yrs",35,2,2,0,0,61,54 +"145","12+ yrs",25,1,1,0,0,62,41 +"146","12+ yrs",34,1,0,0,0,63,47 +"147","12+ yrs",31,2,0,0,0,64,53 +"148","12+ yrs",26,2,0,0,1,65,49 +"149","12+ yrs",32,1,0,0,0,66,46 +"150","12+ yrs",21,1,0,0,1,67,39 +"151","12+ yrs",28,3,2,0,0,68,58 +"152","12+ yrs",37,3,1,0,1,69,59 +"153","12+ yrs",25,1,0,0,0,70,41 +"154","12+ yrs",32,1,1,0,0,71,46 +"155","12+ yrs",25,1,0,0,0,72,41 +"156","12+ yrs",31,1,0,0,1,73,45 +"157","12+ yrs",26,2,0,0,2,75,49 +"158","12+ yrs",31,1,0,0,0,76,45 +"159","12+ yrs",31,2,2,0,0,77,53 +"160","12+ yrs",25,1,0,0,0,78,41 +"161","12+ yrs",31,1,0,0,0,79,45 +"162","12+ yrs",34,1,0,0,0,80,47 +"163","12+ yrs",35,2,0,0,0,81,54 +"164","12+ yrs",29,1,0,0,1,82,43 +"165","12+ yrs",23,1,0,0,1,83,40 +"166","0-5yrs",26,6,2,0,0,1,3 +"167","0-5yrs",42,1,0,0,0,2,1 +"168","0-5yrs",39,6,2,0,0,3,4 +"169","0-5yrs",34,4,0,0,2,4,2 +"170","6-11yrs",35,3,0,0,0,5,32 +"171","6-11yrs",36,4,0,0,2,6,36 +"172","6-11yrs",23,1,0,0,0,7,6 +"173","6-11yrs",32,2,0,0,1,8,22 +"174","6-11yrs",21,1,1,0,0,9,5 +"175","6-11yrs",28,2,0,0,1,10,19 +"176","6-11yrs",29,2,0,0,1,11,20 +"177","6-11yrs",37,4,0,0,1,12,37 +"178","6-11yrs",31,1,0,0,0,13,9 +"179","6-11yrs",29,3,0,0,2,14,29 +"180","6-11yrs",31,2,1,0,0,15,21 +"181","6-11yrs",27,2,0,0,0,16,18 +"182","6-11yrs",30,5,1,0,2,17,38 +"183","6-11yrs",26,1,1,0,0,18,7 +"184","6-11yrs",25,3,1,0,1,19,28 +"185","6-11yrs",44,1,1,0,0,20,17 +"186","6-11yrs",40,1,0,0,0,21,14 +"187","6-11yrs",35,2,0,0,0,22,24 +"188","6-11yrs",28,2,2,0,0,23,19 
+"189","6-11yrs",36,1,0,0,1,24,12 +"190","6-11yrs",27,2,0,0,2,25,18 +"191","6-11yrs",40,2,0,0,0,26,27 +"192","6-11yrs",38,2,0,0,0,27,26 +"193","6-11yrs",34,3,0,0,0,28,31 +"194","6-11yrs",28,4,2,0,1,29,34 +"195","6-11yrs",30,4,1,0,1,30,35 +"196","6-11yrs",32,1,0,0,0,31,10 +"197","6-11yrs",34,2,0,0,0,32,23 +"198","6-11yrs",42,1,0,0,0,33,16 +"199","6-11yrs",32,2,2,0,0,34,22 +"200","6-11yrs",39,1,0,0,0,35,13 +"201","6-11yrs",35,2,0,0,0,36,24 +"202","6-11yrs",36,1,0,0,0,37,12 +"203","6-11yrs",34,3,2,0,0,38,31 +"204","6-11yrs",30,3,0,0,1,39,30 +"205","6-11yrs",28,1,0,0,0,40,8 +"206","6-11yrs",39,3,0,0,0,41,33 +"207","6-11yrs",35,1,0,0,0,42,11 +"208","6-11yrs",41,1,0,0,0,43,15 +"209","6-11yrs",37,2,0,0,0,44,25 +"210","12+ yrs",30,1,0,0,0,45,44 +"211","12+ yrs",37,1,0,0,1,46,48 +"212","12+ yrs",28,2,1,0,0,47,51 +"213","12+ yrs",27,4,2,0,0,48,61 +"214","12+ yrs",26,2,1,0,0,49,49 +"215","12+ yrs",38,3,1,0,0,50,60 +"216","12+ yrs",24,3,2,0,0,51,56 +"217","12+ yrs",36,5,2,0,1,52,62 +"218","12+ yrs",27,3,2,0,0,53,57 +"219","12+ yrs",28,1,0,0,1,54,42 +"220","12+ yrs",29,2,1,0,1,55,52 +"221","12+ yrs",36,2,0,0,1,56,55 +"222","12+ yrs",28,2,2,0,0,57,51 +"223","12+ yrs",28,2,1,0,0,58,51 +"224","12+ yrs",28,1,0,0,0,59,42 +"225","12+ yrs",27,2,1,0,0,60,50 +"226","12+ yrs",35,2,1,0,0,61,54 +"227","12+ yrs",25,1,1,0,0,62,41 +"228","12+ yrs",34,1,0,0,0,63,47 +"229","12+ yrs",31,2,1,0,0,64,53 +"230","12+ yrs",26,2,0,0,2,65,49 +"231","12+ yrs",32,1,1,0,0,66,46 +"232","12+ yrs",21,1,0,0,0,67,39 +"233","12+ yrs",28,3,2,0,0,68,58 +"234","12+ yrs",37,3,0,0,2,69,59 +"235","12+ yrs",25,1,1,0,0,70,41 +"236","12+ yrs",32,1,0,0,0,71,46 +"237","12+ yrs",25,1,1,0,0,72,41 +"238","12+ yrs",31,1,0,0,0,73,45 +"239","12+ yrs",38,6,0,0,2,74,63 +"240","12+ yrs",26,2,1,0,1,75,49 +"241","12+ yrs",31,1,1,0,0,76,45 +"242","12+ yrs",31,2,0,0,1,77,53 +"243","12+ yrs",25,1,0,0,1,78,41 +"244","12+ yrs",31,1,0,0,1,79,45 +"245","12+ yrs",34,1,0,0,0,80,47 +"246","12+ yrs",35,2,2,0,0,81,54 +"247","12+ yrs",29,1,0,0,1,82,43 +"248","12+ yrs",23,1,0,0,1,83,40 \ No newline at end of file diff --git a/test/data/topics.csv b/test/data/topics.csv new file mode 100644 index 0000000000..014612d2e8 --- /dev/null +++ b/test/data/topics.csv @@ -0,0 +1,9 @@ +review,review_reverse,label +"animals birds cats dogs fish horse","radiation galaxy universe duck",1 +"horse birds house fish duck cats","space galaxy universe radiation",0 +"car truck driver bus pickup","bus pickup",1 +"car truck driver bus pickup horse","car truck",0 +"car truck","car truck driver bus pickup horse",1 +"bus pickup","car truck driver bus pickup",1 +"space galaxy universe radiation","horse birds house fish duck cats",1 +"radiation galaxy universe duck","animals birds cats dogs fish horse",1 \ No newline at end of file