diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c997776c1..655057fac 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,9 +1,7 @@ -#owners and reviewers -cuml_bench/* @Alexsandruss -daal4py_bench/* @Alexsandruss @samir-nasibli -datasets/* @Alexsandruss -modelbuilders_bench/* @Alexsandruss -report_generator/* @Alexsandruss -sklearn_bench/* @Alexsandruss @samir-nasibli -xgboost_bench/* @Alexsandruss -*.md @Alexsandruss @maria-Petrova +# owners and reviewers +configs @Alexsandruss +configs/spmd* @Alexsandruss @ethanglaser +sklbench @Alexsandruss +*.md @Alexsandruss @samir-nasibli +requirements*.txt @Alexsandruss @ethanglaser +conda-env-*.yml @Alexsandruss @ethanglaser diff --git a/.gitignore b/.gitignore index a9bb476a8..939001390 100755 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ -# Logs -*.log - # Release and work directories __pycache__* __work* @@ -8,11 +5,14 @@ __work* # Visual Studio related files, e.g., ".vscode" .vs* -# Datasets -data +# Dataset files +data_cache *.csv *.npy +*.npz -# Results -results*.json -*.xlsx +# Results at repo root +vtune_results +/*.json +/*.xlsx +/*.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..7f2684be5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +repos: + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + language_version: python3.10 + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + language_version: python3.10 diff --git a/README.md b/README.md index 91f3de943..471dff2b4 100755 --- a/README.md +++ b/README.md @@ -1,147 +1,105 @@ - -# Machine Learning Benchmarks +# Machine Learning Benchmarks [![Build Status](https://dev.azure.com/daal/scikit-learn_bench/_apis/build/status/IntelPython.scikit-learn_bench?branchName=main)](https://dev.azure.com/daal/scikit-learn_bench/_build/latest?definitionId=8&branchName=main) -**Machine Learning Benchmarks** contains implementations of machine learning algorithms -across data analytics frameworks. Scikit-learn_bench can be extended to add new frameworks -and algorithms. It currently supports the [scikit-learn](https://scikit-learn.org/), -[DAAL4PY](https://intelpython.github.io/daal4py/), [cuML](https://github.com/rapidsai/cuml), -and [XGBoost](https://github.com/dmlc/xgboost) frameworks for commonly used -[machine learning algorithms](#supported-algorithms). - -## Follow us on Medium - -We publish blogs on Medium, so [follow us](https://medium.com/intel-analytics-software/tagged/machine-learning) to learn tips and tricks for more efficient data analysis. Here are our latest blogs: +**Scikit-learn_bench** is a benchmark tool for libraries and frameworks implementing Scikit-learn-like APIs and other workloads. 
-- [Save Time and Money with Intel Extension for Scikit-learn](https://medium.com/intel-analytics-software/save-time-and-money-with-intel-extension-for-scikit-learn-33627425ae4) -- [Superior Machine Learning Performance on the Latest Intel Xeon Scalable Processors](https://medium.com/intel-analytics-software/superior-machine-learning-performance-on-the-latest-intel-xeon-scalable-processor-efdec279f5a3) -- [Leverage Intel Optimizations in Scikit-Learn](https://medium.com/intel-analytics-software/leverage-intel-optimizations-in-scikit-learn-f562cb9d5544) -- [Optimizing CatBoost Performance](https://medium.com/intel-analytics-software/optimizing-catboost-performance-4f73f0593071) -- [Intel Gives Scikit-Learn the Performance Boost Data Scientists Need](https://medium.com/intel-analytics-software/intel-gives-scikit-learn-the-performance-boost-data-scientists-need-42eb47c80b18) -- [From Hours to Minutes: 600x Faster SVM](https://medium.com/intel-analytics-software/from-hours-to-minutes-600x-faster-svm-647f904c31ae) -- [Improve the Performance of XGBoost and LightGBM Inference](https://medium.com/intel-analytics-software/improving-the-performance-of-xgboost-and-lightgbm-inference-3b542c03447e) -- [Accelerate Kaggle Challenges Using Intel AI Analytics Toolkit](https://medium.com/intel-analytics-software/accelerate-kaggle-challenges-using-intel-ai-analytics-toolkit-beb148f66d5a) -- [Accelerate Your scikit-learn Applications](https://medium.com/intel-analytics-software/improving-the-performance-of-xgboost-and-lightgbm-inference-3b542c03447e) -- [Optimizing XGBoost Training Performance](https://medium.com/intel-analytics-software/new-optimizations-for-cpu-in-xgboost-1-1-81144ea21115) -- [Accelerate Linear Models for Machine Learning](https://medium.com/intel-analytics-software/accelerating-linear-models-for-machine-learning-5a75ff50a0fe) -- [Accelerate K-Means Clustering](https://medium.com/intel-analytics-software/accelerate-k-means-clustering-6385088788a1) -- [Fast Gradient Boosting Tree Inference](https://medium.com/intel-analytics-software/fast-gradient-boosting-tree-inference-for-intel-xeon-processors-35756f174f55) +Benefits: +- Full control of benchmarks suite through CLI +- Flexible and powerful benchmark config structure +- Available with advanced profiling tools, such as Intel(R) VTune* Profiler +- Automated benchmarks report generation -## Table of content +### πŸ“œ Table of Contents -- [How to create conda environment for benchmarking](#how-to-create-conda-environment-for-benchmarking) -- [Running Python benchmarks with runner script](#running-python-benchmarks-with-runner-script) -- [Benchmark supported algorithms](#benchmark-supported-algorithms) - - [Scikit-learn benchmakrs](#scikit-learn-benchmakrs) -- [Algorithm parameters](#algorithm-parameters) +- [Machine Learning Benchmarks](#machine-learning-benchmarks) + - [πŸ”§ Create a Python Environment](#-create-a-python-environment) + - [πŸš€ How To Use Scikit-learn\_bench](#-how-to-use-scikit-learn_bench) + - [Benchmarks Runner](#benchmarks-runner) + - [Report Generator](#report-generator) + - [Scikit-learn\_bench High-Level Workflow](#scikit-learn_bench-high-level-workflow) + - [πŸ“š Benchmark Types](#-benchmark-types) + - [πŸ“‘ Documentation](#-documentation) -## How to create conda environment for benchmarking +## πŸ”§ Create a Python Environment -Create a suitable conda environment for each framework to test. Each item in the list below links to instructions to create an appropriate conda environment for the framework. 
+How to create a usable Python environment with the following required frameworks: -- [**scikit-learn**](sklearn_bench#how-to-create-conda-environment-for-benchmarking) +- **sklearn, sklearnex, and gradient boosting frameworks**: ```bash -pip install -r sklearn_bench/requirements.txt -# or -conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm +# with pip +pip install -r envs/requirements-sklearn.txt +# or with conda +conda env create -n sklearn -f envs/conda-env-sklearn.yml ``` -- [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) +- **RAPIDS**: ```bash -conda install -c conda-forge scikit-learn daal4py pandas tqdm +conda env create -n rapids --solver=libmamba -f envs/conda-env-rapids.yml ``` -- [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) +## πŸš€ How To Use Scikit-learn_bench -```bash -conda install -c rapidsai -c conda-forge cuml pandas cudf tqdm -``` +### Benchmarks Runner -- [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) +How to run benchmarks using the `sklbench` module and a specific configuration: ```bash -pip install -r xgboost_bench/requirements.txt -# or -conda install -c conda-forge xgboost scikit-learn pandas tqdm +python -m sklbench --config configs/sklearn_example.json ``` -## Running Python benchmarks with runner script - -Run `python runner.py --configs configs/config_example.json [--output-file result.json --verbose INFO --report]` to launch benchmarks. - -Options: - -- ``--configs``: specify the path to a configuration file or a folder that contains configuration files. -- ``--no-intel-optimized``: use Scikit-learn without [Intel(R) Extension for Scikit-learn*](#intelr-extension-for-scikit-learn-support). Now available for [scikit-learn benchmarks](https://github.com/IntelPython/scikit-learn_bench/tree/main/sklearn_bench). By default, the runner uses Intel(R) Extension for Scikit-learn. -- ``--output-file``: specify the name of the output file for the benchmark result. The default name is `result.json` -- ``--report``: create an Excel report based on benchmark results. The `openpyxl` library is required. -- ``--dummy-run``: run configuration parser and dataset generation without benchmarks running. -- ``--verbose``: *WARNING*, *INFO*, *DEBUG*. Print out additional information when the benchmarks are running. The default is *INFO*. - -| Level | Description | -|-----------|---------------| -| *DEBUG* | etailed information, typically of interest only when diagnosing problems. Usually at this level the logging output is so low level that it’s not useful to users who are not familiar with the software’s internals. | -| *INFO* | Confirmation that things are working as expected. | -| *WARNING* | An indication that something unexpected happened, or indicative of some problem in the near future (e.g. β€˜disk space low’). The software is still working as expected. | - -Benchmarks currently support the following frameworks: +The default output is a file with JSON-formatted results of benchmarking cases. To generate a better human-readable report, use the following command: -- **scikit-learn** -- **daal4py** -- **cuml** -- **xgboost** +```bash +python -m sklbench --config configs/sklearn_example.json --report +``` -The configuration of benchmarks allows you to select the frameworks to run, select datasets for measurements and configure the parameters of the algorithms. +By default, output and report file paths are `result.json` and `report.xlsx`. 
To specify custom file paths, run: - You can configure benchmarks by editing a config file. Check [config.json schema](https://github.com/IntelPython/scikit-learn_bench/blob/main/configs/README.md) for more details. +```bash +python -m sklbench --config configs/sklearn_example.json --report --result-file result_example.json --report-file report_example.xlsx +``` -## Benchmark supported algorithms +For a description of all benchmarks runner arguments, refer to [documentation](sklbench/runner/README.md#arguments). -| algorithm | benchmark name | sklearn (CPU) | sklearn (GPU) | daal4py | cuml | xgboost | -|---|---|---|---|---|---|---| -|**[DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)**|dbscan|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| -|**[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)**|df_clfs|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| -|**[RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)**|df_regr|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| -|**[pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html)**|distances|:white_check_mark:|:x:|:white_check_mark:|:x:|:x:| -|**[KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)**|kmeans|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| -|**[KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)**|knn_clsf|:white_check_mark:|:x:|:x:|:white_check_mark:|:x:| -|**[LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)**|linear|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| -|**[LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)**|log_reg|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| -|**[PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)**|pca|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| -|**[Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)**|ridge|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| -|**[SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)**|svm|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| -|**[TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)**|tsne|:white_check_mark:|:x:|:x:|:white_check_mark:|:x:| -|**[train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)**|train_test_split|:white_check_mark:|:x:|:x:|:white_check_mark:|:x:| -|**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:| -|**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:| +### Report Generator -### Scikit-learn benchmakrs +To combine raw result files gathered from different environments, call the report generator: -When you run scikit-learn benchmarks on CPU, 
[Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. +```bash +python -m sklbench.report --result-files result_1.json result_2.json --report-file report_example.xlsx +``` -For the algorithms with both CPU and GPU support, you may use the same [configuration file](https://github.com/IntelPython/scikit-learn_bench/blob/main/configs/skl_xpu_config.json) to run the scikit-learn benchmarks on CPU and GPU. +For a description of all report generator arguments, refer to [documentation](sklbench/report/README.md#arguments). -## Algorithm parameters +### Scikit-learn_bench High-Level Workflow -You can launch benchmarks for each algorithm separately. -To do this, go to the directory with the benchmark: +```mermaid +flowchart TB + A[User] -- High-level arguments --> B[Benchmarks runner] + B -- Generated benchmarking cases --> C["Benchmarks collection"] + C -- Raw JSON-formatted results --> D[Report generator] + D -- Human-readable report --> A -```bash -cd + classDef userStyle fill:#44b,color:white,stroke-width:2px,stroke:white; + class A userStyle ``` -Run the following command: +## πŸ“š Benchmark Types -```bash -python --dataset-name -``` +**Scikit-learn_bench** supports the following types of benchmarks: -The list of supported parameters for each algorithm you can find here: + - **Scikit-learn estimator** - Measures performance and quality metrics of the [sklearn-like estimator](https://scikit-learn.org/stable/glossary.html#term-estimator). + - **Function** - Measures performance metrics of specified function. -- [**scikit-learn**](sklearn_bench#algorithms-parameters) -- [**daal4py**](daal4py_bench#algorithms-parameters) -- [**cuml**](cuml_bench#algorithms-parameters) -- [**xgboost**](xgboost_bench#algorithms-parameters) +## πŸ“‘ Documentation +[Scikit-learn_bench](README.md): +- [Configs](configs/README.md) +- [Benchmarks Runner](sklbench/runner/README.md) +- [Report Generator](sklbench/report/README.md) +- [Benchmarks](sklbench/benchmarks/README.md) +- [Data Processing](sklbench/datasets/README.md) +- [Emulators](sklbench/emulators/README.md) +- [Developer Guide](docs/README.md) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8181d91d9..b32320dc2 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,83 +1,82 @@ -variables: - - name: python.version - value: "3.9" +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== jobs: - - job: Linux_Sklearn + - job: Lint pool: - vmImage: "ubuntu-20.04" + vmImage: 'ubuntu-22.04' steps: - - task: UsePythonVersion@0 - displayName: "Use Python $(python.version)" - inputs: - versionSpec: "$(python.version)" - - script: | - pip install -r requirements-common.txt - pip install -r sklearn_bench/requirements.txt - python runner.py --configs configs/testing/sklearn.json - displayName: Run bench - - job: Linux_XGBoost + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.10' + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install isort black + isort --check . && black --check . + displayName: 'Linting' + - job: Linux + dependsOn: Lint + strategy: + matrix: + Python3.9_Pip: + PYTHON_VERSION: "3.9" + PKG_MANAGER: "pip" + Python3.10_Pip: + PYTHON_VERSION: "3.10" + PKG_MANAGER: "pip" + Python3.11_Pip: + PYTHON_VERSION: "3.11" + PKG_MANAGER: "pip" + Python3.9_Conda: + PYTHON_VERSION: "3.9" + PKG_MANAGER: "conda" + Python3.10_Conda: + PYTHON_VERSION: "3.10" + PKG_MANAGER: "conda" + Python3.11_Conda: + PYTHON_VERSION: "3.11" + PKG_MANAGER: "conda" pool: - vmImage: "ubuntu-20.04" + vmImage: "ubuntu-latest" steps: - - task: UsePythonVersion@0 - displayName: "Use Python $(python.version)" - inputs: - versionSpec: "$(python.version)" - - script: | - pip install -r requirements-common.txt - pip install -r xgboost_bench/requirements.txt - python runner.py --configs configs/testing/xgboost.json --no-intel-optimized - displayName: Run bench - - job: Linux_daal4py + - template: test-configuration-linux.yml + - job: Windows + dependsOn: Lint + strategy: + matrix: + Python3.9_Pip: + PYTHON_VERSION: "3.9" + PKG_MANAGER: "pip" + Python3.10_Pip: + PYTHON_VERSION: "3.10" + PKG_MANAGER: "pip" + Python3.11_Pip: + PYTHON_VERSION: "3.11" + PKG_MANAGER: "pip" + Python3.9_Conda: + PYTHON_VERSION: "3.9" + PKG_MANAGER: "conda" + Python3.10_Conda: + PYTHON_VERSION: "3.10" + PKG_MANAGER: "conda" + Python3.11_Conda: + PYTHON_VERSION: "3.11" + PKG_MANAGER: "conda" pool: - vmImage: "ubuntu-20.04" + vmImage: "windows-latest" steps: - - task: UsePythonVersion@0 - displayName: "Use Python $(python.version)" - inputs: - versionSpec: "$(python.version)" - - script: | - pip install -r requirements-common.txt - pip install -r daal4py_bench/requirements.txt - python runner.py --configs configs/testing/daal4py.json --no-intel-optimized - displayName: Run bench - - job: Linux_XGBoost_and_daal4py - pool: - vmImage: "ubuntu-20.04" - steps: - - script: | - conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.9 pandas xgboost scikit-learn daal4py tqdm requests - displayName: Create Anaconda environment - - script: | - . 
/usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench - python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized - displayName: Run bench - - job: Pep8 - pool: - vmImage: "ubuntu-20.04" - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "$(python.version)" - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install flake8 requests - flake8 --max-line-length=100 --count - displayName: "PEP 8 check" - - job: Mypy - pool: - vmImage: "ubuntu-20.04" - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "$(python.version)" - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install mypy data-science-types requests types-requests - mypy . --ignore-missing-imports - displayName: "mypy check" + - template: test-configuration-win.yml diff --git a/bench.py b/bench.py deleted file mode 100644 index 67ae322c7..000000000 --- a/bench.py +++ /dev/null @@ -1,567 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import json -import logging -import sys -import timeit -import re - -import numpy as np -import sklearn - - -def get_dtype(data): - ''' - Get type of input data as numpy.dtype - ''' - if hasattr(data, 'dtype'): - return data.dtype - if hasattr(data, 'dtypes'): - return str(data.dtypes[0]) - if hasattr(data, 'values'): - return data.values.dtype - raise ValueError(f'Impossible to get data type of {type(data)}') - - -def sklearn_disable_finiteness_check(): - try: - sklearn.set_config(assume_finite=True) - except AttributeError: - try: - sklearn._ASSUME_FINITE = True - except AttributeError: - sklearn.utils.validation._assert_all_finite = lambda X: None - - -def _parse_size(string, dim=2): - try: - tup = tuple(int(n) for n in string.replace('x', ',').split(',')) - except Exception as e: - msg = ( - f'Invalid size "{string}": sizes must be integers separated by ' - f'"x" or ",".' - ) - raise argparse.ArgumentTypeError(msg) from e - - if len(tup) != dim: - msg = f'Expected size parameter of {dim} dimensions but got {len(tup)}' - raise argparse.ArgumentTypeError(msg) - - return tup - - -def is_float(string): - return bool(re.match(r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$", - string)) - - -def float_or_int(string): - return int(string) if string.isdigit() else float(string) - - -def float_or_int_or_str(string): - return int(string) if string.isdigit() else float(string) if is_float(string) else string - - -def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): - ''' - Get an optimal cache size for sklearn.svm.SVC. 
- - Parameters - ---------- - n_rows : int - Number of rows in the dataset - dtype : dtype-like, optional (default np.double) - dtype to use for computing cache size - max_cache : int, optional (default 64) - Maximum cache size, in gigabytes - ''' - - byte_size = np.empty(0, dtype=dtype).itemsize - optimal_cache_size_bytes = byte_size * (n_rows ** 2) - one_gb = 2 ** 30 - max_cache_bytes = max_cache * one_gb - return max_cache_bytes \ - if optimal_cache_size_bytes > max_cache_bytes else optimal_cache_size_bytes - - -def parse_args(parser, size=None, loop_types=(), - n_jobs_supported=True, prefix='sklearn'): - ''' - Add common arguments useful for most benchmarks and parse. - - Parameters - ---------- - parser : argparse.ArgumentParser - Parser to which the arguments should be added. - size : tuple of int, optional - Enable '--size' argument with this default size. - If None (default), no '--size' argument will be added. - loop_types : iterable of str, optional - Add arguments like '--fit-inner-loops' and '--fit-outer-loops', - useful for tweaking runtime of the benchmark. - n_jobs_supported : bool - If set to True, generate a n_jobs member in the argparse Namespace - corresponding to the optimal n_jobs parameter for scikit-learn. - Otherwise, n_jobs will be set to None. - prefix : str, optional, default 'sklearn' - The default prefix to report - - Returns - ------- - parser : argparse.ArgumentParser - Parser to which the arguments were added. - This is the same parser that was passed to this function. - ''' - - parser.add_argument('-n', '--num-threads', '--core-number', default=-1, - dest='threads', type=int, - help='Number of threads to use') - parser.add_argument('-a', '--arch', default='?', - help='Machine architecture, for bookkeeping') - parser.add_argument('-b', '--batch', '--batchID', default='?', - help='Batch ID, for bookkeeping') - parser.add_argument('-p', '--prefix', default=prefix, - help='Prefix string, for bookkeeping') - parser.add_argument('-v', '--verbose', default=False, action='store_true', - help='Output extra debug messages') - parser.add_argument('--data-format', type=str, default='numpy', - choices=('numpy', 'pandas', 'cudf'), - help='Data format: numpy (default), pandas, cudf') - parser.add_argument('--data-order', type=str, default='C', - choices=('C', 'F'), - help='Data order: C (row-major, default) or' - 'F (column-major)') - parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64, - choices=(np.float32, np.float64), - help='Data type: float64 (default) or float32') - parser.add_argument('--check-finiteness', default=False, - action='store_true', - help='Check finiteness in sklearn input check' - '(disabled by default)') - parser.add_argument('--output-format', type=str, default='json', - choices=('json'), help='Output format: json') - parser.add_argument('--time-method', type=str, default='box_filter', - choices=('box_filter'), - help='Method used for time mesurements') - parser.add_argument('--box-filter-measurements', type=int, default=100, - help='Maximum number of measurements in box filter') - parser.add_argument('--inner-loops', default=100, type=int, - help='Maximum inner loop iterations ' - '(we take the mean over inner iterations)') - parser.add_argument('--outer-loops', default=100, type=int, - help='Maximum outer loop iterations ' - '(we take the min over outer iterations)') - parser.add_argument('--time-limit', default=10., type=float, - help='Target time to spend to benchmark') - parser.add_argument('--goal-outer-loops', default=10, - 
type=int, dest='goal', - help='Number of outer loops to aim ' - 'while automatically picking number of ' - 'inner loops. If zero, do not automatically ' - 'decide number of inner loops.') - parser.add_argument('--seed', type=int, default=12345, - help='Seed to pass as random_state') - parser.add_argument('--dataset-name', type=str, default=None, - help='Dataset name') - parser.add_argument('--no-intel-optimized', default=False, - action='store_true', - help='Use no intel optimized version. ' - 'Now avalible for scikit-learn benchmarks') - parser.add_argument('--device', default='none', type=str, - choices=('host', 'cpu', 'gpu', 'none'), - help='Execution context device') - - for data in ['X', 'y']: - for stage in ['train', 'test']: - parser.add_argument(f'--file-{data}-{stage}', - type=argparse.FileType('r'), - help=f'Input file with {data}_{stage},' - 'in NPY format') - - if size is not None: - parser.add_argument('-s', '--size', default=size, type=_parse_size, - dest='shape', - help='Problem size, delimited by "x" or ","') - - params = parser.parse_args() - - if not params.no_intel_optimized: - try: - from sklearnex import patch_sklearn - patch_sklearn() - except ImportError: - logging.info('Failed to import sklearnex.patch_sklearn.' - 'Use stock version scikit-learn', file=sys.stderr) - params.device = 'none' - else: - if params.device != 'none': - logging.info( - 'Device context is not supported for stock scikit-learn.' - 'Please use --no-intel-optimized=False with' - f'--device={params.device} parameter. Fallback to --device=none.', - file=sys.stderr) - params.device = 'none' - - # disable finiteness check (default) - if not params.check_finiteness: - sklearn_disable_finiteness_check() - - # Ask DAAL what it thinks about this number of threads - num_threads = prepare_daal_threads(num_threads=params.threads) - if params.verbose: - logging.info(f'@ DAAL gave us {num_threads} threads') - - n_jobs = None - if n_jobs_supported: - n_jobs = num_threads = params.threads - - # Set threading and DAAL related params here - setattr(params, 'threads', num_threads) - setattr(params, 'n_jobs', n_jobs) - - # Set size string parameter for easy printing - if size is not None: - setattr(params, 'size', size_str(params.shape)) - - # Very verbose output - if params.verbose: - logging.info(f'@ params = {params.__dict__}') - - return params - - -def size_str(shape): - return 'x'.join(str(d) for d in shape) - - -def prepare_daal_threads(num_threads=-1): - try: - import daal4py - if num_threads > 0: - daal4py.daalinit(nthreads=num_threads) - num_threads = daal4py.num_threads() - except ImportError: - logging.warning('@ Package "daal4py" was not found. 
Number of threads ' - 'is being ignored') - num_threads = 1 - return num_threads - - -def measure_function_time(func, *args, params, **kwargs): - return time_box_filter(func, *args, - n_meas=params.box_filter_measurements, - time_limit=params.time_limit, **kwargs) - - -def time_box_filter(func, *args, n_meas, time_limit, **kwargs): - times = [] - while len(times) < n_meas: - t0 = timeit.default_timer() - val = func(*args, **kwargs) - t1 = timeit.default_timer() - times.append(t1 - t0) - if sum(times) > time_limit: - break - - def box_filter(timing, left=0.25, right=0.75): - timing.sort() - size = len(timing) - if size == 1: - return timing[0] - Q1, Q2 = timing[int(size * left)], timing[int(size * right)] - IQ = Q2 - Q1 - lower = Q1 - 1.5 * IQ - upper = Q2 + 1.5 * IQ - result = np.array([item for item in timing if lower < item < upper]) - return np.mean(result) - - return box_filter(times), val - - -def logverbose(msg, verbose): - ''' - Print msg as a verbose logging message only if verbose is True - ''' - if verbose: - print('@', msg) - - -def convert_to_numpy(data): - ''' - Convert input data to numpy array - ''' - if 'cudf' in str(type(data)): - data = data.to_pandas().values - elif 'pandas' in str(type(data)): - data = data.values - elif isinstance(data, np.ndarray): - pass - elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)): - data = np.array(data) - else: - raise TypeError( - f'Unknown data format "{type(data)}" for convertion to np.ndarray') - return data - - -def columnwise_score(y, yp, score_func): - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - if y.ndim + yp.ndim > 2: - if 1 in (y.shape + yp.shape)[1:]: - if y.ndim > 1: - y = y[:, 0] - if yp.ndim > 1: - yp = yp[:, 0] - else: - return [score_func(y[i], yp[i]) for i in range(y.shape[1])] - return score_func(y, yp) - - -def accuracy_score(y_true, y_pred): - return columnwise_score(y_true, y_pred, lambda y1, y2: np.mean(y1 == y2)) - - -def log_loss(y_true, y_pred): - from sklearn.metrics import log_loss as sklearn_log_loss - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) - return sklearn_log_loss(y_true, y_pred) - - -def roc_auc_score(y_true, y_pred, multi_class='ovr'): - from sklearn.metrics import roc_auc_score as sklearn_roc_auc - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) - if y_pred.shape[1] == 2: # binary case - y_pred = y_pred[:, 1] - return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class) - - -def rmse_score(y_true, y_pred): - return columnwise_score( - y_true, y_pred, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) - - -def r2_score(y_true, y_pred): - from sklearn.metrics import r2_score as sklearn_r2_score - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) - return sklearn_r2_score(y_true, y_pred) - - -def davies_bouldin_score(X, labels): - from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs - X = convert_to_numpy(X) - labels = convert_to_numpy(labels) - try: - res = sklearn_dbs(X, labels) - except ValueError as ex: - res = ex - return res - - -def convert_data(data, dtype, data_order, data_format): - ''' - Convert input data (numpy array) to needed format, type and order - ''' - # Firstly, change order and type of data - if data_order == 'F': - data = np.asfortranarray(data, dtype) - elif data_order == 'C': - data = np.ascontiguousarray(data, dtype) - - # Secondly, change format of data - if data_format == 'numpy': - return data - if data_format == 'pandas': - import pandas as pd - - if 
data.ndim == 1: - return pd.Series(data) - return pd.DataFrame(data) - if data_format == 'cudf': - import cudf - import pandas as pd - - return cudf.DataFrame.from_pandas(pd.DataFrame(data)) - - -def read_csv(filename, params): - from string import ascii_lowercase, ascii_uppercase - - # find out header existance - header_letters = set( - ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', '')) - with open(filename, 'r') as file: - first_line = file.readline() - while 'nan' in first_line: - first_line = first_line.replace('nan', '') - header = 0 if len(header_letters & set(first_line)) != 0 else None - # try to read csv with pandas and fall back to numpy reader if failed - try: - import pandas as pd - data = pd.read_csv(filename, header=header, dtype=params.dtype).values - except ImportError: - data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype, - skip_header=0 if header is None else 1) - - if data.ndim == 2: - if data.shape[1] == 1: - data = data.reshape((data.shape[0],)) - - return data - - -def load_data(params, generated_data=[], add_dtype=False, label_2d=False, - int_label=False): - full_data = { - file: None for file in ['X_train', 'X_test', 'y_train', 'y_test'] - } - param_vars = vars(params) - int_dtype = np.int32 if '32' in str(params.dtype) else np.int64 - for element in full_data: - file_arg = f'file_{element}' - # load and convert data from npy/csv file if path is specified - new_dtype = int_dtype if 'y' in element and int_label else params.dtype - if param_vars[file_arg] is not None: - if param_vars[file_arg].name.endswith('.npy'): - data = np.load(param_vars[file_arg].name, allow_pickle=True) - else: - data = read_csv(param_vars[file_arg].name, params) - full_data[element] = convert_data( - data, - new_dtype, - params.data_order, params.data_format - ) - if full_data[element] is None: - # generate and convert data if it's marked and path isn't specified - if element in generated_data: - full_data[element] = convert_data( - np.random.rand(*params.shape), - new_dtype, - params.data_order, params.data_format) - # generate and convert data if it's marked and path isn't specified - if full_data[element] is None and element in generated_data: - full_data[element] = convert_data( - np.random.rand(*params.shape), - int_dtype if 'y' in element and int_label else params.dtype, - params.data_order, params.data_format) - # convert existing labels from 1- to 2-dimensional - # if it's forced and possible - if full_data[element] is not None and 'y' in element \ - and label_2d and hasattr(full_data[element], 'reshape'): - full_data[element] = full_data[element].reshape( - (full_data[element].shape[0], 1)) - # add dtype property to data if it's needed and doesn't exist - if full_data[element] is not None and add_dtype and \ - not hasattr(full_data[element], 'dtype'): - if hasattr(full_data[element], 'values'): - full_data[element].dtype = full_data[element].values.dtype - elif hasattr(full_data[element], 'dtypes'): - full_data[element].dtype = full_data[element].dtypes[0].type - - params.dtype = get_dtype(full_data['X_train']) - # add size to parameters which is need for some cases - if not hasattr(params, 'size'): - params.size = size_str(full_data['X_train'].shape) - - # clone train data to test if test data is None - for data in ['X', 'y']: - if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None: - full_data[f'{data}_test'] = full_data[f'{data}_train'] - return tuple(full_data.values()) - - -def gen_basic_dict(library, algorithm, stage, 
params, data, alg_instance=None, - alg_params=None): - result = { - 'library': library, - 'algorithm': algorithm, - 'stage': stage, - 'device': params.device, - 'input_data': { - 'data_format': params.data_format, - 'data_order': params.data_order, - 'data_type': str(params.dtype), - 'dataset_name': params.dataset_name, - 'rows': data.shape[0], - 'columns': data.shape[1] - } - } - result['algorithm_parameters'] = {} - if alg_instance is not None: - if 'Booster' in str(type(alg_instance)): - alg_instance_params = dict(alg_instance.attributes()) - else: - alg_instance_params = dict(alg_instance.get_params()) - if ('min_samples_split' in alg_instance_params - and 'handle' in alg_instance_params): - alg_instance_params['dtype'] = str( - alg_instance_params['dtype']) - result['algorithm_parameters'].update(alg_instance_params) - if alg_params is not None: - result['algorithm_parameters'].update(alg_params) - return result - - -def print_output(library, algorithm, stages, params, functions, - times, metric_type, metrics, data, alg_instance=None, - alg_params=None): - if params.output_format == 'json': - output = [] - for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, - data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if isinstance(metric_type, str): - result.update({f'{metric_type}': np.float64(metrics[i]) - if isinstance(metrics[i], np.float32) - else metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': np.float64(metrics[ind][i]) - if isinstance(metrics[ind][i], np.float32) - else metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle', None) - output.append(result) - print(json.dumps(output, indent=4)) - - -def run_with_context(params, function): - if params.device != 'none': - from daal4py.oneapi import sycl_context - with sycl_context(params.device): - function() - else: - function() diff --git a/configs/README.md b/configs/README.md index 02dee119b..79b80183f 100644 --- a/configs/README.md +++ b/configs/README.md @@ -1,69 +1,175 @@ -# Config JSON Schema - -Configure benchmarks by editing the `config.json` file. -You can configure some algorithm parameters, datasets, a list of frameworks to use, and the usage of some environment variables. -Refer to the tables below for descriptions of all fields in the configuration file. 
- -- [Root Config Object](#root-config-object) -- [Common Object](#common-object) -- [Case Object](#case-object) -- [Dataset Object](#dataset-object) -- [Training Object](#training-object) -- [Testing Object](#testing-object) - -## Root Config Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -|common| [Common Object](#common-object)| **REQUIRED** common benchmarks setting: frameworks and input data settings | -|cases| List[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data | - -## Common Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -|data-format| Union[str, List[str]] | **REQUIRED** Input data format: *numpy*, *pandas*, or *cudf*. | -|data-order| Union[str, List[str]] | **REQUIRED** Input data order: *C* (row-major, default) or *F* (column-major). | -|dtype| Union[str, List[str]] | **REQUIRED** Input data type: *float64* (default) or *float32*. | -|check-finitness| List[] | Check finiteness during scikit-learn input check (disabled by default). | -|device| array[string] | For scikit-learn only. The list of devices to run the benchmarks on.
It can be *None* (default, run on CPU without sycl context) or one of the types of sycl devices: *cpu*, *gpu*, *host*.
Refer to [SYCL specification](https://www.khronos.org/files/sycl/sycl-2020-reference-guide.pdf) for details.| - -## Case Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -|lib| Union[str, List[str]] | **REQUIRED** A test framework or a list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml*, *xgboost*]. | -|algorithm| string | **REQUIRED** Benchmark file name. | -|dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** Input data specifications. | -|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | Other algorithm-specific parameters | - -**Important:** You can move any parameter from **"cases"** to **"common"** if this parameter is common to all cases - -## Dataset Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -|source| string | **REQUIRED** Data source: *synthetic*, *csv*, or *npy*. | -|type| string | **REQUIRED for synthetic data**. The type of task for which the dataset is generated: *classification*, *blobs*, or *regression*. | -|n_classes| int | For *synthetic* data and for *classification* type only. The number of classes (or labels) of the classification problem | -|n_clusters| int | For *synthetic* data and for *blobs* type only. The number of centers to generate | -|n_features| int | **REQUIRED for *synthetic* data**. The number of features to generate. | -|name| string | Name of the dataset. | -|training| [Training Object](#training-object) | **REQUIRED** An object with the paths to the training datasets. | -|testing| [Testing Object](#testing-object) | An object with the paths to the testing datasets. If not provided, the training datasets are used. | - -## Training Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -| n_samples | int | **REQUIRED** The total number of the training samples | -| x | str | **REQUIRED** The path to the training samples | -| y | str | **REQUIRED** The path to the training labels | - -## Testing Object - -| Field Name | Type | Description | -| ----- | ---- |------------ | -| n_samples | int | **REQUIRED** The total number of the testing samples | -| x | str | **REQUIRED** The path to the testing samples | -| y | str | **REQUIRED** The path to the testing labels | +# Configs + +Benchmarking cases in `scikit-learn_bench` are defined by configuration files and stored in the `configs` directory of the repository. + +The configuration file (config) defines: + - Measurement and profiling parameters + - Library and algorithm to use + - Algorithm-specific parameters + - Data to use as input of the algorithm + +Configs are split into subdirectories and files by benchmark scope and algorithm. + +# Benchmarking Configs Specification + +## Config Structure + +Benchmark config files are written in JSON format and have a few reserved keys: + - `INCLUDE` - Other configuration files whose parameter sets to include + - `PARAMETERS_SETS` - Benchmark parameters within each set + - `TEMPLATES` - List different setups with parameters sets template-specific parameters + - `SETS` - List parameters sets to include in the template + +Configs heavily utilize lists of scalar values and dictionaries to avoid duplication of cases. + +Formatting specification: +```json +{ + "INCLUDE": [ + "another_config_file_path_0" + ... + ] + "PARAMETERS_SETS": { + "parameters_set_name_0": Dict or List[Dict] of any JSON-serializable with any level of nesting, + ... 
+ }, + "TEMPLATES": { + "template_name_0": { + "SETS": ["parameters_set_name_0", ...], + Dict of any JSON-serializable with any level of nesting overwriting parameter sets + }, + ... + } +} +``` + +Example +```json +{ + "PARAMETERS_SETS": { + "estimator parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_params": { + "fit_intercept": false + } + } + }, + "regression data": { + "data": [ + { "source": "fetch_openml", "id": 1430 }, + { "dataset": "california_housing" } + ] + } + }, + "TEMPLATES": { + "linear regression": { + "SETS": ["estimator parameters", "regression data"], + "algorithm": { + "library": ["sklearn", "sklearnex", "cuml"] + } + } + } +} +``` + +## Common Parameters + +Configs have the three highest parameter keys: + - `bench` - Specifies a workflow of the benchmark, such as parameters of measurement or profiling + - `algorithm` - Specifies measured entity parameters + - `data` - Specifies data parameters to use + +| Parameter keys | Default value | Choices | Description | +|:---------------|:--------------|:--------|:------------| +|

**Benchmark workflow parameters**

|||| +| `bench`:`taskset` | None | | Value for `-c` argument of `taskset` utility used over benchmark subcommand. | +| `bench`:`vtune_profiling` | None | | Analysis type for `collect` argument of Intel(R) VTune* Profiler tool. Linux* OS only. | +| `bench`:`vtune_results_directory` | `vtune_results` | | Directory path to store Intel(R) VTune* Profiler results. | +| `bench`:`n_runs` | `10` | | Number of runs for measured entity. | +| `bench`:`time_limit` | `3600` | | Time limit in seconds before the benchmark early stop. | +| `bench`:`distributor` | None | None, `mpi` | Library used to handle distributed algorithm. | +| `bench`:`mpi_params` | Empty dict | | Parameters for `mpirun` command of MPI library. | +|

**Data parameters**

|||| +| `data`:`cache_directory` | `data_cache` | | Directory path to store cached datasets for fast loading. | +| `data`:`raw_cache_directory` | `data`:`cache_directory` + "raw" | | Directory path to store downloaded raw datasets. | +| `data`:`dataset` | None | | Name of dataset to use from implemented dataset loaders. | +| `data`:`source` | None | `fetch_openml`, `make_regression`, `make_classification`, `make_blobs` | Data source to use for loading or synthetic generation. | +| `data`:`id` | None | | OpenML data id for `fetch_openml` source. | +| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. | +| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. | +| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. | +| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. | +| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. | +| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. | +| `data`:`order` | `F` | `C`, `F` | Data order to use in benchmark: contiguous(C) or Fortran. | +| `data`:`dtype` | `float64` | | Data type to use in benchmark. | +| `data`:`distributed_split` | None | None, `rank_based` | Split type used to distribute data between machines in distributed algorithm. `None` type means usage of all data without split on all machines. `rank_based` type splits the data equally between machines with split sequence based on rank id from MPI. | +|

**Algorithm parameters**

|||| +| `algorithm`:`library` | None | | Python module containing measured entity (class or function). | +| `algorithm`:`device` | `default` | `default`, `cpu`, `gpu` | Device selected for computation. | + +## Benchmark-Specific Parameters + +### `Scikit-learn Estimator` + +| Parameter keys | Default value | Choices | Description | +|:---------------|:--------------|:--------|:------------| +| `algorithm`:`estimator` | None | | Name of measured estimator. | +| `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. | +| `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). | +| `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. | +| `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. | +| `bench`:`ensure_sklearnex_patching` | True | | If True, warns about sklearnex patching failures. | + +### `Function` + +| Parameter keys | Default value | Choices | Description | +|:---------------|:--------------|:--------|:------------| +| `algorithm`:`function` | None | | Name of measured function. | +| `algorithm`:`args_order` | `x_train\|y_train` | Any in format `{subset_0}\|..\|{subset_n}` | Arguments order for measured function. | +| `algorithm`:`kwargs` | Empty `dict` | | Named arguments for measured function. | + +## Special Value + +You can define some parameters as specific from other parameters or properties with `[SPECIAL_VALUE]` prefix in string value: +```json +... "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } ... +... "generation_kwargs": { "n_informative": "[SPECIAL_VALUE]0.5" } ... +``` + +List of available special values: + +| Parameter keys | Benchmark type[s] | Special value | Description | +|:---------------|:------------------|:--------------|:------------| +| `data`:`dataset` | all | `all_named` | Sets datasets to use as list of all named datasets available in loaders. | +| `data`:`generation_kwargs`:`n_informative` | all | *float* value in [0, 1] range | Sets datasets to use as list of all named datasets available in loaders. | +| `bench`:`taskset` | all | Specification of numa nodes in `numa:{numa_node_0}[\|{numa_node_1}...]` format | Sets CPUs affinity using `taskset` utility. | +| `algorithm`:`estimator_params`:`n_jobs` | sklearn_estimator | `physical_cpus`, `logical_cpus`, or ratio of previous ones in format `{type}_cpus:{ratio}` where `ratio` is float | Sets `n_jobs` parameter to a number of physical/logical CPUs or ratio of them for an estimator. | +| `algorithm`:`estimator_params`:`scale_pos_weight` | sklearn_estimator | `auto` | Sets `scale_pos_weight` parameter to `sum(negative instances) / sum(positive instances)` value for estimator. | +| `algorithm`:`estimator_params`:`n_clusters` | sklearn_estimator | `auto` | Sets `n_clusters` parameter to number of clusters or classes from dataset description for estimator. | +| `algorithm`:`estimator_params`:`eps` | sklearn_estimator | `distances_quantile:{quantile}` format where quantile is *float* value in [0, 1] range | Computes `eps` parameter as quantile value of distances in `x_train` matrix for estimator. | + +## Range of Values + +You can define some parameters as a range of values with the `[RANGE]` prefix in string value: +```json +... "generation_kwargs": {"n_features": "[RANGE]pow:2:5:6"} ... 
+``` + +Supported ranges: + + - `add:start{int}:end{int}:step{int}` - Arithmetic progression (Sequence: start + step * i <= end) + - `mul:current{int}:end{int}:step{int}` - Geometric progression (Sequence: current * step <= end) + - `pow:base{int}:start{int}:end{int}[:step{int}=1]` - Powers of base number + +## Removal of Values + +You can remove specific parameter from subset of cases when stacking parameters sets using `[REMOVE]` parameter value: + +```json +... "estimator_params": { "n_jobs": "[REMOVE]" } ... +``` + +--- +[Documentation tree](../README.md#-documentation) diff --git a/configs/blogs/skl_2021_3.json b/configs/blogs/skl_2021_3.json deleted file mode 100644 index 0035ae288..000000000 --- a/configs/blogs/skl_2021_3.json +++ /dev/null @@ -1,491 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 1000, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, - "n_features": 50, - "training": { - "n_samples": 10000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 5, - "maxiter": 50, - "init": "k-means++", - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - "n_features": 50, - "training": { - "n_samples": 3000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 20, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 2000, - "training": { - "n_samples": 10000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 1000, - "training": { - "n_samples": 30000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 4000, - "training": { - "n_samples": 6000 - } - } - ], - "svd-solver": "full", - "n-components": 10 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - } - ], - "alpha": 5 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - } - ] - }, - { - "algorithm": "log_reg", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - 
"training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "maxiter": 100, - "tol": 0 - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "C": 500.0, - "kernel": "rbf" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "C": 50.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "nu": 0.25, - "kernel": "poly" - }, - { - "algorithm": "svr", - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly" - }, - { - "algorithm": "nusvr", - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, - "n_features": 50, - "training": { - "n_samples": 500000 - } - } - ] - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - 
"training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": "brute" - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": "kd_tree" - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - } - } - ], - "include-y": "", - "train-size": 0.75, - "test-size": 0.25 - } - ] -} diff --git a/configs/blogs/skl_conda_config.json b/configs/blogs/skl_conda_config.json deleted file mode 100755 index 3f413a617..000000000 --- a/configs/blogs/skl_conda_config.json +++ /dev/null @@ -1,427 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 1000, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, - "n_features": 50, - "training": { - "n_samples": 10000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 5, - "maxiter": 50, - "init": "k-means++", - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - "n_features": 50, - "training": { - "n_samples": 3000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 20, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 2000, - "training": { - "n_samples": 10000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 1000, - "training": { - "n_samples": 30000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 4000, - "training": { - "n_samples": 6000 - } - } - ], - "svd-solver": "full", - "n-components": 10 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - } - ], - "alpha": 
5 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - } - ] - }, - { - "algorithm": "log_reg", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "maxiter": 100, - "tol": 0 - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "C": 500.0, - "kernel": "rbf" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "C": 50.0, - "kernel": "rbf" - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, - "n_features": 50, - "training": { - "n_samples": 500000 - } - } - ] - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": "brute" - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - 
"testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": "kd_tree" - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - } - } - ], - "include-y": "", - "train-size": 0.75, - "test-size": 0.25 - } - ] -} diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json new file mode 100644 index 000000000..d7b13188e --- /dev/null +++ b/configs/common/sklearn.json @@ -0,0 +1,56 @@ +{ + "PARAMETERS_SETS": { + "sklearn-ex[cpu] implementations": { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": "cpu" } + ] + }, + "sklearn-ex[cpu,gpu] implementations": { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": ["cpu", "gpu"] } + ] + }, + "sklearn-ex[preview] implementations": { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": "cpu" }, + { "library": "sklearnex.preview", "device": ["cpu", "gpu"] } + ] + }, + "sklearnex spmd implementation": { + "algorithm": { + "library": "sklearnex.spmd", + "device": "gpu", + "estimator_params": { "n_jobs": "[REMOVE]" } + }, + "data": { + "format": "dpctl", + "order": "C", + "distributed_split": "rank_based" + }, + "bench": { + "distributor": "mpi" + } + }, + "spmd default parameters": { + "algorithm": { + "estimator_methods": { + "training": "fit", + "inference": "predict|transform" + } + }, + "data": { + "dtype": "float32" + }, + "bench": { + "mpi_params": { "n": [1, 2] } + } + }, + "cuml implementation": { + "algorithm": { "library": "cuml" }, + "data": { "format": "cudf" } + } + } +} diff --git a/configs/common/xgboost.json b/configs/common/xgboost.json new file mode 100644 index 000000000..1eced184c --- /dev/null +++ b/configs/common/xgboost.json @@ -0,0 +1,45 @@ +{ + "PARAMETERS_SETS": { + "xgboost implementations": [ + { + "algorithm": { + "device": "cpu", + "estimator_params": { "tree_method": "hist" }, + "enable_modelbuilders": false + } + }, + { + "algorithm": { + "device": "gpu", + "estimator_params": { "tree_method": "hist" } + }, + "data": { "format": "cudf" } + } + ], + "xgboost binary classification": { + "algorithm": { + "library": "xgboost", + "estimator": "XGBClassifier", + "estimator_params": { + "objective": "binary:logistic", + "scale_pos_weight": "[SPECIAL_VALUE]auto" + } + } + }, + "xgboost multiclassification": { + "algorithm": { + "library": "xgboost", + "estimator": "XGBClassifier", + "estimator_params": { "objective": "multi:softprob" } + } + }, + "xgboost regression": { + "algorithm": { + "library": "xgboost", + "estimator": "XGBRegressor", + "estimator_methods": {"inference": "predict"}, + "estimator_params": { "objective": "reg:squarederror" } + } + } + } +} diff --git a/configs/config_example.json b/configs/config_example.json deleted file mode 100644 index fa615cf29..000000000 --- a/configs/config_example.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "common": { - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "lib": "daal4py", - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 50, - "training": { - "n_samples": 10000 - } - } - ] - }, - { - "lib": "sklearn", - 
"algorithm": "svm", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 2000 - } - } - ], - "max-cache-size": 4, - "kernel": "rbf" - }, - { - "lib": "xgboost", - "algorithm": "gbt", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 10000 - } - } - ], - "n-estimators": 1000, - "objective": "reg:squarederror", - "tree-method": "hist", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1 - } - ] -} diff --git a/configs/cuml_config.json b/configs/cuml_config.json deleted file mode 100755 index a96e744f2..000000000 --- a/configs/cuml_config.json +++ /dev/null @@ -1,623 +0,0 @@ -{ - "common": { - "lib": "cuml", - "data-format": "cudf", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "time-method": "box_filter", - "n-clusters": 1000, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, - "n_features": 50, - "training": { - "n_samples": 10000000 - } - } - ], - "time-method": "box_filter", - "n-clusters": 5, - "maxiter": 50, - "init": "k-means++", - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - "n_features": 50, - "training": { - "n_samples": 3000000 - } - } - ], - "time-method": "box_filter", - "n-clusters": 20, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 2000, - "training": { - "n_samples": 10000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 1000, - "training": { - "n_samples": 30000 - } - } - ], - "svd-solver": "full", - "n-components": 10 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - }, - { - "source": "npy", - "name": "airline-ohe", - "training": - { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": - { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, - { - "algorithm": "df_regr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - }, - { - "source": "npy", - "name": "airline_regression", - "training": - { - "x": "data/airline_regression_x_train.npy", - "y": "data/airline_regression_y_train.npy" - }, - "testing": - { - "x": "data/airline_regression_x_test.npy", - "y": "data/airline_regression_y_test.npy" - } - } - ] - }, - { - 
"algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "alpha": 5 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ] - }, - { - "algorithm": "log_reg", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "maxiter": 100, - "tol": 0 - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "C": 500.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "rbf" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "rbf" - }, - { - "algorithm": "svr", - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly" - }, - { - 
"algorithm": "svr", - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "C": 10.0, - "kernel": "rbf" - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, - "n_features": 50, - "training": { - "n_samples": 500000 - } - } - ] - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": "brute" - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 5000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10000, - "training": { - "n_samples": 10000 - } - } - ], - "train-size": 0.75, - "test-size": 0.25 - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "npy", - "name": "census", - "training": - { - "x": "data/census_x_train.npy", - "y": "data/census_y_train.npy" - } - } - ], - "train-size": 0.9, - "test-size": 0.1 - }, - { - "algorithm": "lasso", - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": - { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "alpha": 1.0, - "tol": 1e-4 - }, - { - "algorithm": "elasticnet", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - }, - { - "algorithm": "tsne", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/daal4py/daal4py_gbt_config.json b/configs/daal4py/daal4py_gbt_config.json deleted file mode 100644 index 422208b32..000000000 --- a/configs/daal4py/daal4py_gbt_config.json +++ /dev/null @@ -1,280 +0,0 @@ -{ - "common": { - 
"lib": "daal4py", - "data-format": "pandas", - "data-order": "F", - "fptype": "float", - "algorithm": "gbt" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "max_tree_depth": 6, - "n_estimators": 1000, - "objective": "reg:squarederror" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "reg_lambda": 1, - "max_tree_depth": 8, - "n_estimators": 1000, - "objective": "binary:logistic" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "max_bins": 256, - "reg_lambda": 1, - "max_tree_depth": 8, - "n_estimators": [100, 300, 1000, 3000, 10000, 30000], - "objective": "binary:logistic" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "max_tree_depth": 6, - "n_estimators": 1000, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max_bins": 256, - "reg_lambda": 2, - "max_tree_depth": 8, - "n_estimators": 200, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n_estimators": 100, - "objective": "reg:squarederror", - "max_tree_depth": 8, - "reg_lambda": 1 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": "data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "n_estimators": 60, - "objective": "multi:softprob", - "max_tree_depth": 7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - "n_estimators": 10000, - "objective": "binary:logistic", - "max_tree_depth": 1 - }, - { - "objective": "binary:logistic", - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ] - }, - { - "objective": "multi:softmax", - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { 
- "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "dataset": [ - { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ] - }, - { - "objective": "reg:squarederror", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/experiments/daal4py_svd.json b/configs/experiments/daal4py_svd.json new file mode 100644 index 000000000..fb9f37f60 --- /dev/null +++ b/configs/experiments/daal4py_svd.json @@ -0,0 +1,40 @@ +{ + "PARAMETERS_SETS": { + "svd": { + "algorithm": { + "library": "daal4py", + "estimator": "svd", + "estimator_methods": { "training": "compute" } + }, + "data": { + "dataset": [ + "skin_segmentation", + "road_network", + "codrnanorm", + "twodplanes", + "medical_charges_nominal", + "susy", + "ijcnn", + "hepmass", + "higgs", + "creditcard", + "fraud", + "klaverjas", + "covtype", + "year_prediction_msd", + "sensit", + "yolanda", + "connect", + "mnist" + ], + "split_kwargs": { "ignore": true } + }, + "bench": { "n_runs": 50 } + } + }, + "TEMPLATES": { + "svd": { + "SETS": ["svd"] + } + } +} diff --git a/configs/experiments/nearest_neighbors.json b/configs/experiments/nearest_neighbors.json new file mode 100644 index 000000000..23d76300f --- /dev/null +++ b/configs/experiments/nearest_neighbors.json @@ -0,0 +1,83 @@ +{ + "PARAMETERS_SETS": { + "nearest neighbors implementations": [ + { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu", + "estimator_params": { "algorithm": "brute" } + } + }, + { + "algorithm": { + "library": "sklbench.emulators.raft", + "device": "gpu", + "estimator_params": { + "algorithm": ["brute", "ivf_flat", "ivf_pq", "cagra"], + "n_lists": 1024, + "n_probes": 256, + "m_subvectors": 0.2 + } + }, + "data": { + "format": "cupy", + "order": "C" + } + }, + { + "algorithm": { + "library": "sklbench.emulators.faiss", + "device": ["cpu", "gpu"], + "estimator_params": { + "algorithm": ["brute", "ivf_flat", "ivf_pq"], + "n_lists": 1024, + "n_probes": 256, + "m_subvectors": 0.2 + } + } + }, + { + "algorithm": { + "library": "sklbench.emulators.svs", + "device": "cpu", + "estimator_params": { + "algorithm": "vamana", + "graph_max_degree": 128, + "window_size": 256 + } + } + } + ], + "nearest neighbors common parameters": { + "algorithm": { + "estimator": "NearestNeighbors", + "estimator_params": { + "metric": "euclidean", + "n_neighbors": [8, 32, 128] + }, + "estimator_methods": { + "training": "fit", + "inference": "kneighbors" + }, + "batch_size": { "inference": [1, 10, 100, null] } + }, + "data": { 
"dtype": "float32" } + }, + "nearest neighbors data": [ + { + "data": { + "dataset": ["mnist", "fashion_mnist", "sift", "gist"] + } + } + ] + }, + "TEMPLATES": { + "nearest neighbors": { + "SETS": [ + "nearest neighbors implementations", + "nearest neighbors common parameters", + "nearest neighbors data" + ] + } + } +} diff --git a/configs/modelbuilders/catboost_mb_config.json b/configs/modelbuilders/catboost_mb_config.json deleted file mode 100644 index c676ac687..000000000 --- a/configs/modelbuilders/catboost_mb_config.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "common": { - "lib": "modelbuilders", - "data-format": "pandas", - "data-order": "F", - "grow-policy": "Depthwise", - "dtype": "float32", - "algorithm": "catboost_mb", - "count-pool": "", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "RMSE" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "max-bin": 256, - "scale-pos-weight": 2, - "subsample": 1, - "n-estimators": 1000, - "objective": "Logloss" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "max-bin": 256, - "scale-pos-weight": 2, - "subsample": 1, - "n-estimators": [100, 300, 1000, 3000], - "objective": "Logloss" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "max-leaves": 0, - "n-estimators": 1000, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "n-estimators": 200, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "RMSE", - "scale-pos-weight": 2, - "subsample": 1 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": "data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "learning-rate": 0.3, - "n-estimators": 60, - "objective": "multi:softprob", - "max-depth": 7, - "max-leaves": 0, - "subsample": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - 
"learning-rate": 0.3, - "n-estimators": 10000, - "objective": "Logloss", - "max-depth": 1, - "max-leaves": 0, - "subsample": 0.5, - "eta": 0.1 - }, - { - "objective": "Logloss", - "scale-pos-weight": 2.1067817411664587, - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ] - }, - { - "objective": "Logloss", - "scale-pos-weight": 173.63348001466812, - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ], - "objective": "multi:softprob", - "n-estimators": 100 - }, - { - "objective": "Logloss", - "scale-pos-weight": 2.0017715678375363, - "dataset": [ - { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ] - }, - { - "objective": "Logloss", - "scale-pos-weight": 578.2868020304569, - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ] - }, - { - "objective": "Logloss", - "scale-pos-weight": 1.8872389605086624, - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ] - }, - { - "objective": "RMSE", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/modelbuilders/lgbm_mb_cpu_config.json b/configs/modelbuilders/lgbm_mb_cpu_config.json deleted file mode 100755 index 649a66bf7..000000000 --- a/configs/modelbuilders/lgbm_mb_cpu_config.json +++ /dev/null @@ -1,354 +0,0 @@ -{ - "common": { - "lib": "modelbuilders", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "lgbm_mb" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "max-leaves": 256, - "n-estimators": [1000], - "objective": "regression" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - 
"max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": [100, 300, 1000, 3000], - "objective": "binary" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "multiclass" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 200, - "objective": "multiclass" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "regression", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": "data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "n-estimators": 60, - "objective": "multiclass", - "max-depth": 7, - "subsample": 0.7, - "max-leaves": 256, - "colsample-bytree": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - "n-estimators": 10000, - "objective": "binary", - "max-depth": 1, - "max-leaves": 256, - "subsample": 0.5, - "eta": 0.1, - "colsample-bytree": 0.05 - }, - { - "objective": "binary", - "scale-pos-weight": 2.1067817411664587, - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "binary", - "scale-pos-weight": 173.63348001466812, - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "multiclass", - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { 
- "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "binary", - "scale-pos-weight": 2.0017715678375363, - "dataset": [ - { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "binary", - "scale-pos-weight": 578.2868020304569, - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "binary", - "scale-pos-weight": 1.8872389605086624, - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - }, - { - "objective": "regression", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "n-estimators": 100 - } - ] -} diff --git a/configs/modelbuilders/xgb_mb_cpu_config.json b/configs/modelbuilders/xgb_mb_cpu_config.json deleted file mode 100755 index 19f3ea864..000000000 --- a/configs/modelbuilders/xgb_mb_cpu_config.json +++ /dev/null @@ -1,349 +0,0 @@ -{ - "common": { - "lib": "modelbuilders", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "xgb_mb", - "tree-method": "hist", - "count-dmatrix": "" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "reg:squarederror" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - 
"subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic", - "enable-experimental-json-serialization": "False", - "inplace-predict": "" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob", - "single-precision-histogram": "" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "reg:squarederror", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": "data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "n-estimators": 60, - "objective": "multi:softprob", - "max-depth": 7, - "subsample": 0.7, - "colsample-bytree": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - "n-estimators": 10000, - "objective": "binary:logistic", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1, - "colsample-bytree": 0.05, - "single-precision-histogram": "" - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 2.1067817411664587, - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 173.63348001466812, - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ] - }, - { - "objective": "multi:softmax", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 2.0017715678375363, - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ 
- { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 578.2868020304569, - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 1.8872389605086624, - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/modelbuilders/xgb_mb_cpu_config_shap.json b/configs/modelbuilders/xgb_mb_cpu_config_shap.json deleted file mode 100644 index c91c7fd77..000000000 --- a/configs/modelbuilders/xgb_mb_cpu_config_shap.json +++ /dev/null @@ -1,309 +0,0 @@ -{ - "common": { - "lib": "modelbuilders", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "xgb_mb", - "tree-method": "hist", - "count-dmatrix": "", - "num-threads": -1, - "n-estimators": 50 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "reg:squarederror" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "reg:squarederror", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 - }, - { - "objective": "reg:squarederror", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 6, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 1024, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 10, - "n_informative": 10, - "training": { - "n_samples": 2000 - }, - "testing": { - "n_samples": 8000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 10, - 
"learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 4096, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 10, - "n_informative": 10, - "training": { - "n_samples": 3000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "n_informative": 20, - "training": { - "n_samples": 2000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 10, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 1024, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "n_informative": 20, - "training": { - "n_samples": 4000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 14, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 4096, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "n_informative": 20, - "training": { - "n_samples": 10000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 50, - "n_informative": 50, - "training": { - "n_samples": 2000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 10, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 1024, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 50, - "n_informative": 50, - "training": { - "n_samples": 2000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 14, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 4096, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 50, - "n_informative": 50, - "training": { - "n_samples": 4000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "n_informative": 100, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 10, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 1024, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "n_informative": 100, - "training": { - "n_samples": 2000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - }, - { - "objective": "reg:squarederror", - "max-depth": 14, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 4096, - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "n_informative": 100, - "training": { - "n_samples": 3000 - }, - "testing": { - "n_samples": 80000 - } - } - ] - } - ] -} diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json new file mode 100644 index 000000000..71dcdc9b6 --- /dev/null +++ b/configs/regular/dbscan.json @@ -0,0 +1,76 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + 
"estimator_params": { + "eps": "[SPECIAL_VALUE]distances_quantile:0.01", + "min_samples": 5, + "metric": "euclidean" + } + }, + "data": { + "dtype": ["float32", "float64"] + } + }, + "sklearn dbscan parameters": { + "algorithm": { + "estimator_params": { + "algorithm": "brute", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "cuml dbscan parameters": { + "algorithm": { + "estimator_params": { "calc_core_sample_indices": false, "verbose": 2 } + } + }, + "dbscan datasets": [ + { + "data": { "dataset": ["cifar", "mnist"], "split_kwargs": { "train_size": 10000 } } + }, + { + "data": { "dataset": ["sensit", "hepmass"], "split_kwargs": { "train_size": 20000 } } + }, + { + "data": { + "dataset": "road_network", + "preprocessing_kwargs": { "normalize": true }, + "split_kwargs": { "train_size": [20000, 50000] } + } + }, + { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 20, + "n_samples": 50000, + "n_features": [4, 16, 64, 256], + "cluster_std": 1.5 + }, + "split_kwargs": { "ignore": true } + } + } + ] + }, + "TEMPLATES": { + "sklearn dbscan": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common dbscan parameters", + "sklearn dbscan parameters", + "dbscan datasets" + ] + }, + "cuml dbscan": { + "SETS": [ + "cuml implementation", + "common dbscan parameters", + "cuml dbscan parameters", + "dbscan datasets" + ] + } + } +} diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json new file mode 100644 index 000000000..56e37e778 --- /dev/null +++ b/configs/regular/ensemble.json @@ -0,0 +1,124 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common ensemble params": { + "algorithm": { + "estimator_params": { + "n_estimators": 200, + "max_depth": 16, + "max_samples": 1.0, + "min_samples_split": 5, + "min_samples_leaf": 2, + "min_impurity_decrease": 0.0, + "bootstrap": true, + "random_state": 42 + } + } + }, + "sklearn ensemble classifier params": { + "algorithm": { + "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], + "estimator_params": { + "criterion": "gini", + "max_features": "sqrt", + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "sklearn ensemble regressor params": { + "algorithm": { + "estimator": ["RandomForestRegressor", "ExtraTreesRegressor"], + "estimator_params": { + "criterion": "squared_error", + "max_features": 1.0, + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "cuml ensemble classifier params": { + "algorithm": { + "estimator": "RandomForestClassifier", + "estimator_params": { + "n_streams": 4, + "split_criterion": "gini", + "max_features": "sqrt", + "max_leaves": -1, + "n_bins": 256 + } + } + }, + "cuml ensemble regressor params": { + "algorithm": { + "estimator": "RandomForestRegressor", + "estimator_params": { + "n_streams": 4, + "split_criterion": "mse", + "max_features": 1.0, + "max_leaves": -1, + "n_bins": 256 + } + } + }, + "ensemble classification data": { + "data": [ + { "dataset": "skin_segmentation", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, + { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + { "dataset": "mnist", "split_kwargs": { "train_size": 20000, "test_size": null } }, + { "dataset": "gisette", "split_kwargs": { "train_size": 5000, "test_size": 2000 } }, + { "dataset": "svhn", "split_kwargs": { "train_size": 10000, "test_size": 10000 } } + ] + }, + "ensemble 
regression data": { + "data": [ + { + "dataset": "road_network", + "split_kwargs": { + "train_size": 200000, "test_size": null, + "shuffle": true, "random_state": 42 + } + }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, + { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 50000, "test_size": null } }, + { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + { "dataset": "gisette", "split_kwargs": { "train_size": 5000, "test_size": 2000 } } + ] + } + }, + "TEMPLATES": { + "sklearn ensemble classification": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common ensemble params", + "sklearn ensemble classifier params", + "ensemble classification data" + ] + }, + "sklearn ensemble regression": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common ensemble params", + "sklearn ensemble regressor params", + "ensemble regression data" + ] + }, + "cuml ensemble classification": { + "SETS": [ + "cuml implementation", + "common ensemble params", + "cuml ensemble classifier params", + "ensemble classification data" + ] + }, + "cuml ensemble regression": { + "SETS": [ + "cuml implementation", + "common ensemble params", + "cuml ensemble regressor params", + "ensemble regression data" + ] + } + } +} diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json new file mode 100644 index 000000000..7283590bd --- /dev/null +++ b/configs/regular/kmeans.json @@ -0,0 +1,88 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "n_clusters": "[SPECIAL_VALUE]auto", + "n_init": 10, + "max_iter": 30, + "tol": 1e-3, + "random_state": 42 + }, + "estimator_methods": { "inference": "predict" } + }, + "data": { + "dtype": ["float32", "float64"], + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn kmeans parameters": { + "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } } + }, + "cuml kmeans parameters": { + "algorithm": { + "estimator_params": { "init": "scalable-k-means++" } + } + }, + "kmeans datasets": [ + { + "data": [ + { + "dataset": ["covtype", "sensit"], + "split_kwargs": { "ignore": true } + }, + { + "dataset": ["mnist", "gisette"], + "split_kwargs": { "ignore": true }, + "preprocessing_kwargs": { "normalize": false } + } + ] + }, + { + "data": { + "dataset": "higgs", + "split_kwargs": { + "train_size": 100000, + "test_size": 2000000, + "shuffle": true, + "random_state": 42 + } + }, + "algorithm": [ + { + "estimator_params": { + "n_clusters": 100, + "max_iter": 10 + } + }, + { + "estimator_params": { + "n_clusters": 10, + "max_iter": 100 + } + } + ] + } + ] + }, + "TEMPLATES": { + "sklearn kmeans": { + "SETS": [ + "sklearn-ex[preview] implementations", + "common kmeans parameters", + "sklearn kmeans parameters", + "kmeans datasets" + ] + }, + "cuml kmeans": { + "SETS": [ + "cuml implementation", + "common kmeans parameters", + "cuml kmeans parameters", + "kmeans datasets" + ] + } + } +} diff --git a/configs/regular/knn.json b/configs/regular/knn.json new file mode 100644 index 000000000..e1cd8a75a --- /dev/null +++ b/configs/regular/knn.json @@ -0,0 +1,122 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common knn parameters": { + "algorithm": { + "estimator_params": { + "n_neighbors": [10, 100], + "weights": "uniform" + } + }, + "data": { + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn 
knn parameters": { + "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } + }, + "brute knn algorithm - classification data": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, + "data": [ + { "dataset": "susy", "split_kwargs": { "train_size": 100000, "test_size": 10000 } }, + { "dataset": "connect" }, + { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } } + ] + }, + "kd_tree knn algorithm - classification data": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } + }, + "data": { + "source": "make_classification", + "generation_kwargs": { + "n_classes": 5, + "n_samples": [50000, 250000], + "n_features": [8, 16], + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "train_size": 0.8, "test_size": 0.2 } + } + }, + "brute knn algorithm - regression data": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + }, + "data": [ + { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 40000, "test_size": 10000 } }, + { "dataset": ["fried", "twodplanes"] } + ] + }, + "kd_tree knn algorithm - regression data": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } + }, + "data": [ + { "dataset": "fried" }, + { + "source": "make_regression", + "generation_kwargs": { + "n_samples": [50000, 250000], + "n_features": [8, 16], + "noise": 0.75 + }, + "split_kwargs": { "train_size": 0.8, "test_size": 0.2 } + } + ] + } + }, + "TEMPLATES": { + "sklearn brute knn clsf": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "brute knn algorithm - classification data" + ] + }, + "sklearn kd_tree knn clsf": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "kd_tree knn algorithm - classification data" + ] + }, + "sklearn brute knn regr": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "brute knn algorithm - regression data" + ] + }, + "sklearn kd_tree knn regr": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "kd_tree knn algorithm - regression data" + ] + }, + "cuml brute knn clsf": { + "SETS": [ + "cuml implementation", + "common knn parameters", + "brute knn algorithm - classification data" + ] + }, + "cuml brute knn regr": { + "SETS": [ + "cuml implementation", + "common knn parameters", + "brute knn algorithm - regression data" + ] + } + } +} diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json new file mode 100644 index 000000000..eb1b79ba9 --- /dev/null +++ b/configs/regular/linear_model.json @@ -0,0 +1,149 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "regression datasets": [ + { + "data": { + "source": "make_regression", + "split_kwargs": { "train_size": 0.2, "test_size": 0.8 }, + "generation_kwargs": { + "n_samples": 500000, + "n_features": [400, 2000], + "n_informative": 5, + "noise": 2.0 + } + } + }, + { + "data": { + "dataset": "year_prediction_msd", + "preprocessing_kwargs": { "normalize": true }, + "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } + } + }, + { + "data": { + "dataset": 
["hepmass", "susy"], + "split_kwargs": { "train_size": 1000000, "test_size": null } + } + } + ], + "common linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_params": { "fit_intercept": true, "copy_X": true } + } + }, + "common ridge parameters": { + "algorithm": { + "estimator": "Ridge", + "estimator_params": { + "fit_intercept": true, + "alpha": 2.0, + "tol": 1e-4 + } + } + }, + "common lasso parameters": { + "algorithm": { + "estimator": "Lasso", + "estimator_params": { + "fit_intercept": true, + "max_iter": 1000, + "selection": "cyclic", + "alpha": 1e-3, + "tol": 1e-4 + } + } + }, + "common elasticnet parameters": { + "algorithm": { + "estimator": "ElasticNet", + "estimator_params": { + "fit_intercept": true, + "max_iter": 1000, + "selection": "cyclic", + "alpha": 1e-3, + "l1_ratio": 0.9, + "tol": 1e-4 + } + } + }, + "sklearn linear parameters": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + }, + "sklearn ridge parameters": { + "estimator_params": { "solver": "auto" } + }, + "cuml L2 parameters": { + "estimator_params": { "solver": "eig" } + }, + "cuml L1 parameters": { + "estimator_params": { "solver": "cd" } + } + }, + "TEMPLATES": { + "sklearn linear": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common linear parameters", + "sklearn linear parameters", + "regression datasets" + ] + }, + "sklearn ridge": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common ridge parameters", + "sklearn ridge parameters", + "regression datasets" + ] + }, + "sklearn lasso": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common lasso parameters", + "regression datasets" + ] + }, + "sklearn elasticnet": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common elasticnet parameters", + "regression datasets" + ] + }, + "cuml linear": { + "SETS": [ + "cuml implementation", + "common linear parameters", + "cuml L2 parameters", + "regression datasets" + ] + }, + "cuml ridge": { + "SETS": [ + "cuml implementation", + "common ridge parameters", + "cuml L2 parameters", + "regression datasets" + ] + }, + "cuml lasso": { + "SETS": [ + "cuml implementation", + "common lasso parameters", + "cuml L1 parameters", + "regression datasets" + ] + }, + "cuml elasticnet": { + "SETS": [ + "cuml implementation", + "common elasticnet parameters", + "cuml L1 parameters", + "regression datasets" + ] + } + } +} diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json new file mode 100644 index 000000000..a94a7fcf7 --- /dev/null +++ b/configs/regular/logreg.json @@ -0,0 +1,72 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common logreg parameters": { + "algorithm": { + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { + "penalty": "l2", + "tol": 1e-4, + "C": 1.0, + "l1_ratio": null, + "max_iter": 200 + } + } + }, + "sklearn logreg parameters": { + "algorithm": { + "estimator_params": { + "solver": "lbfgs", + "n_jobs": "[SPECIAL_VALUE]physical_cpus", + "random_state": 42 + } + } + }, + "cuml logreg parameters": { + "algorithm": { "estimator_params": { "solver": "qn" } } + }, + "logreg datasets": [ + { + "data": { + "source": "make_classification", + "generation_kwargs": { + "n_samples": 200000, + "n_features": [50, 500], + "n_classes": [2, 5], + "n_informative": "[SPECIAL_VALUE]0.5", + "class_sep": 0.75 + }, + "split_kwargs": { + "train_size": 0.5, + "test_size": 0.5 + } + } + }, + { "data": { "dataset": "mnist", "split_kwargs": { 
"train_size": 20000, "test_size": 50000 } } }, + { "data": { "dataset": "susy", "split_kwargs": { "train_size": 0.2, "test_size": 0.8 } } }, + { "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } } }, + { "data": { "dataset": "klaverjas" } }, + { "data": { "dataset": "gisette" } }, + { "data": { "dataset": "skin_segmentation" } } + ] + }, + "TEMPLATES": { + "sklearn logreg": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common logreg parameters", + "sklearn logreg parameters", + "logreg datasets" + ] + }, + "cuml logreg": { + "SETS": [ + "cuml implementation", + "common logreg parameters", + "cuml logreg parameters", + "logreg datasets" + ] + } + } +} diff --git a/configs/regular/pca.json b/configs/regular/pca.json new file mode 100644 index 000000000..bab34c0f4 --- /dev/null +++ b/configs/regular/pca.json @@ -0,0 +1,62 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_params": { + "n_components": 3, + "copy": true, + "whiten": false, + "svd_solver": "full", + "tol": 0.0, + "iterated_power": 15, + "random_state": 42 + } + } + }, + "pca datasets": [ + { + "data": { + "dataset": ["cifar", "mnist", "yolanda"], + "split_kwargs": { "ignore": true } + } + }, + { + "data": { + "source": "make_blobs", + "generation_kwargs": { "n_samples": 20000, "n_features": 2000, "centers": 2 }, + "split_kwargs": { "train_size": 0.1, "test_size": 0.9 } + } + }, + { + "data": { + "dataset": "epsilon", + "split_kwargs": { "train_size": 50000, "test_size": null } + } + }, + { + "data": { + "dataset": "higgs", + "split_kwargs": { "train_size": 1000000, "test_size": null } + } + } + ] + }, + "TEMPLATES": { + "sklearn pca": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "pca parameters", + "pca datasets" + ] + }, + "cuml pca": { + "SETS": [ + "cuml implementation", + "pca parameters", + "pca datasets" + ] + } + } +} diff --git a/configs/regular/svm.json b/configs/regular/svm.json new file mode 100644 index 000000000..177b3d568 --- /dev/null +++ b/configs/regular/svm.json @@ -0,0 +1,247 @@ +{ + "PARAMETERS_SETS": { + "binary svc implementations": [ + { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": ["cpu", "gpu"] } + ] + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"}, + "estimator_params": { "verbose": false, "multiclass_strategy": "ovr" } + }, + "data": { "format": "cudf" } + } + ], + "multi svc implementations": [ + { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu", + "estimator_params": { "decision_function_shape": "ovr" } + } + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"}, + "estimator_params": { "multiclass_strategy": "ovr" } + }, + "data": { "format": "cudf" } + } + ], + "svr implementations": [ + { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu" + } + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"} + }, + "data": { "format": "cudf" } + } + ], + "nusvm implementations": { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": "cpu" } + ] + }, + "common svm parameters": { + "algorithm": { + "estimator_params": { + "kernel": "rbf", + "degree": 3, + "gamma": "scale", + "tol": 1e-3, + "cache_size": 16384, + "max_iter": 10000 + } + }, + "data": { "preprocessing_kwargs": { "normalize": 
true } } + }, + "svm clsf parameters": { + "algorithm": { "estimator_params": { "random_state": 42 } } + }, + "svc parameters": { + "algorithm": { "estimator": "SVC", "estimator_params": { "C": 1.0 } } + }, + "svr parameters": { + "algorithm": { "estimator": "SVR", "estimator_params": { "C": 1.0 } } + }, + "nusvc parameters": { + "algorithm": { "estimator": "NuSVC", "estimator_params": { "nu": 0.5 } } + }, + "nusvr parameters": { + "algorithm": { "estimator": "NuSVR", "estimator_params": { "nu": 0.5, "C": 1.0 } } + }, + "svc binary data": [ + { + "data": { "dataset": "a9a", "split_kwargs": { "train_size": 5000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "linear" } } + }, + { + "data": { "dataset": "skin_segmentation", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 10.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } + }, + { + "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, + "algorithm": { + "estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] } + } + } + ], + "svc multiclass data": [ + { + "data": { "dataset": "connect", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 10.0, "kernel": ["poly", "rbf"] } } + }, + { + "data": { + "dataset": "mnist", + "split_kwargs": { "train_size": 20000, "test_size": null }, + "preprocessing_kwargs": { "normalize": false } + }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } + } + ], + "svr data": [ + { + "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "fried", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + "algorithm": { "estimator_params": { "C": 2.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "twodplanes", "split_kwargs": { "train_size": 25000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } + }, + { + "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } + }, + { + "data": { + "source": "make_regression", + "generation_kwargs": { + "n_samples": 20000, + "n_features": 1000, + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "train_size": 0.5 } + }, + "algorithm": { "estimator_params": { "C": 0.1, "kernel": "linear" } } + } + ], + "nusvc data": [ + { + "data": { "dataset": "a9a", "split_kwargs": { "train_size": 5000, "test_size": null } }, + "algorithm": { "estimator_params": { "nu": 0.1, "kernel": ["poly", "rbf"] } } + }, + { + "data": { "dataset": "codrnanorm", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "nu": 0.5, "kernel": "poly" } } + }, + { + "data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } } + }, + { + "data": { "dataset": "gisette", "preprocessing_kwargs": { 
"normalize": false } }, + "algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } } + } + ], + "nusvr data": [ + { + "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "twodplanes", "split_kwargs": { "train_size": 25000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } + }, + { + "data": { "dataset": "fried" }, + "algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, + "algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } } + }, + { + "data": { + "source": "make_regression", + "generation_kwargs": { + "n_samples": 20000, + "n_features": 1000, + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "train_size": 0.5 } + }, + "algorithm": { "estimator_params": { "nu": 0.5, "C": 0.1, "kernel": "linear" } } + } + ] + }, + "TEMPLATES": { + "svc binary": { + "SETS": [ + "binary svc implementations", + "common svm parameters", + "svm clsf parameters", + "svc parameters", + "svc binary data" + ] + }, + "svc multiclass": { + "SETS": [ + "multi svc implementations", + "common svm parameters", + "svm clsf parameters", + "svc parameters", + "svc multiclass data" + ] + }, + "svr": { + "SETS": [ + "svr implementations", + "common svm parameters", + "svr parameters", + "svr data" + ] + }, + "nusvc": { + "SETS": [ + "nusvm implementations", + "common svm parameters", + "svm clsf parameters", + "nusvc parameters", + "nusvc data" + ] + }, + "nusvr": { + "SETS": [ + "nusvm implementations", + "common svm parameters", + "nusvr parameters", + "nusvr data" + ] + } + } +} diff --git a/configs/regular/train_test_split.json b/configs/regular/train_test_split.json new file mode 100644 index 000000000..134d9e4eb --- /dev/null +++ b/configs/regular/train_test_split.json @@ -0,0 +1,52 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "train_test_split parameters": { + "algorithm": { + "function": "train_test_split", + "args_order": "x_train|y_train", + "kwargs": { + "test_size": 0.25, + "random_state": 42, + "shuffle": true + } + } + }, + "train_test_split datasets": [ + { + "data": { + "dataset": "hepmass", + "split_kwargs": { + "train_size": [100000, 1000000, 10000000], + "test_size": null + } + } + }, + { + "data": { + "dataset": ["a9a", "mnist", "cifar", "gisette"], + "split_kwargs": [ + { "train_size": 0.4 }, + { "ignore": true } + ] + } + } + ] + }, + "TEMPLATES": { + "sklearn train_test_split": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "train_test_split parameters", + "train_test_split datasets" + ] + }, + "cuml train_test_split": { + "SETS": [ + "cuml implementation", + "train_test_split parameters", + "train_test_split datasets" + ] + } + } +} diff --git a/configs/regular/tsne.json b/configs/regular/tsne.json new file mode 100644 index 000000000..135ebc16a --- /dev/null +++ b/configs/regular/tsne.json @@ -0,0 +1,72 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "sklearn parameters": { + "algorithm": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + } + }, + "cuml parameters": { + "algorithm": { + "estimator_params": { "learning_rate_method": "none", "n_neighbors": 91 } + } + }, + "common tsne parameters": { + "algorithm": { + "estimator": "TSNE", + 
"estimator_params": { + "n_components": 2, + "perplexity": 30.0, + "early_exaggeration": 12.0, + "learning_rate": 200.0, + "n_iter": 1000, + "n_iter_without_progress": 300, + "min_grad_norm": 1e-7, + "metric": "euclidean", + "init": "random", + "random_state": 42, + "method": "barnes_hut", + "angle": 0.5 + } + } + }, + "tsne datasets": [ + { + "data": { + "dataset": "california_housing", + "split_kwargs": { "train_size": 10000 } + } + }, + { + "data": { + "dataset": "hepmass", + "split_kwargs": { "train_size": [1000, 2000, 5000] } + } + }, + { + "data": { + "dataset": ["a9a", "mnist", "gisette"], + "split_kwargs": { "train_size": 5000 } + } + } + ] + }, + "TEMPLATES": { + "sklearn tsne": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common tsne parameters", + "sklearn parameters", + "tsne datasets" + ] + }, + "cuml tsne": { + "SETS": [ + "cuml implementation", + "common tsne parameters", + "cuml parameters", + "tsne datasets" + ] + } + } +} diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json new file mode 100644 index 000000000..170d41efd --- /dev/null +++ b/configs/regular/xgboost_binary.json @@ -0,0 +1,110 @@ +{ + "INCLUDE": ["../common/xgboost.json"], + "PARAMETERS_SETS": { + "binary classification data": [ + { + "data": { + "dataset": "airline_depdelay", + "preprocessing_kwargs": { + "category_encoding": ["onehot", "ordinal"], + "subsample": 600000 + }, + "split_kwargs": { + "train_size": 100000, + "test_size": 500000 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "max_depth": 8, + "reg_alpha": 1.0, + "reg_lambda": 1.0, + "n_estimators": 500 + } + } + }, + { + "data": { + "dataset": "hepmass", + "split_kwargs": { + "train_size": 200000, + "test_size": 1000000 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "reg_alpha": 1.0, + "reg_lambda": 1.0, + "max_leaves": 256, + "n_estimators": 500 + } + } + }, + { + "data": { + "dataset": "bosch", + "split_kwargs": { + "train_size": 200000, + "test_size": null + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.01, + "reg_alpha": 1.0, + "max_leaves": 256, + "colsample_bytree": 0.5, + "colsample_bynode": 0.5, + "n_estimators": 200 + } + } + }, + { + "data": { + "dataset": "epsilon", + "split_kwargs": { + "train_size": 10000, + "test_size": 100000 + } + }, + "algorithm": { + "estimator_params": { + "max_depth": 8, + "colsample_bytree": 0.1, + "colsample_bynode": 0.1, + "n_estimators": 200 + } + } + }, + { + "data": { + "dataset": "gisette", + "split_kwargs": { + "train_size": 2000, + "test_size": 5000 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.15, + "max_leaves": 256, + "colsample_bytree": 0.1, + "colsample_bynode": 0.1, + "n_estimators": 100 + } + } + } + ] + }, + "TEMPLATES": { + "binary classification": { + "SETS": [ + "xgboost binary classification", + "xgboost implementations", + "binary classification data" + ] + } + } +} diff --git a/configs/regular/xgboost_multi.json b/configs/regular/xgboost_multi.json new file mode 100644 index 000000000..4552e05d0 --- /dev/null +++ b/configs/regular/xgboost_multi.json @@ -0,0 +1,73 @@ +{ + "INCLUDE": ["../common/xgboost.json"], + "PARAMETERS_SETS": { + "multiclassification data": [ + { + "data": { + "dataset": "letters", + "split_kwargs": { + "train_size": 0.5, + "test_size": 0.5 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.05, + "reg_lambda": 1, + "max_depth": 6, + "n_estimators": 200 + } + } + }, + { + "data": { + "dataset": 
"mnist", + "split_kwargs": { + "train_size": 20000, + "test_size": 50000 + } + }, + "algorithm": { + "estimator_params": { + "max_leaves": 256, + "colsample_bytree": 0.2, + "colsample_bynode": 0.5, + "n_estimators": 100 + } + } + }, + { + "data": { + "dataset": "covtype" + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "reg_lambda": 1, + "max_depth": 8, + "n_estimators": 200 + } + } + }, + { + "data": { + "dataset": "connect" + }, + "algorithm": { + "estimator_params": { + "n_estimators": 500 + } + } + } + ] + }, + "TEMPLATES": { + "multi classification": { + "SETS": [ + "xgboost multiclassification", + "xgboost implementations", + "multiclassification data" + ] + } + } +} diff --git a/configs/regular/xgboost_regression.json b/configs/regular/xgboost_regression.json new file mode 100644 index 000000000..adffeebdb --- /dev/null +++ b/configs/regular/xgboost_regression.json @@ -0,0 +1,104 @@ +{ + "INCLUDE": ["../common/xgboost.json"], + "PARAMETERS_SETS": { + "regression data": [ + { + "data": { + "dataset": "twodplanes", + "split_kwargs": { + "train_size": 0.33, + "test_size": null + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.01, + "reg_alpha": 1.0, + "reg_lambda": 0.1, + "n_estimators": 500 + } + } + }, + { + "data": { + "dataset": "medical_charges_nominal", + "split_kwargs": { + "train_size": 0.1, + "test_size": 0.9 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "max_depth": 5, + "reg_alpha": 1.0, + "reg_lambda": 1.0, + "n_estimators": 1000 + } + } + }, + { + "data": { + "dataset": "year_prediction_msd", + "split_kwargs": { + "train_size": 0.25, + "test_size": 0.75 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "reg_alpha": 0.5, + "reg_lambda": 0.5, + "n_estimators": 200 + } + } + }, + { + "data": { + "dataset": "hepmass", + "split_kwargs": { + "train_size": 200000, + "test_size": 1000000 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.1, + "reg_alpha": 1.0, + "reg_lambda": 1.0, + "max_leaves": 256, + "n_estimators": 500 + } + } + }, + { + "data": { + "dataset": "gisette", + "split_kwargs": { + "train_size": 2000, + "test_size": 5000 + } + }, + "algorithm": { + "estimator_params": { + "learning_rate": 0.15, + "max_leaves": 256, + "colsample_bytree": 0.1, + "colsample_bynode": 0.1, + "n_estimators": 100 + } + } + } + ] + }, + "TEMPLATES": { + "regression": { + "SETS": [ + "xgboost regression", + "xgboost implementations", + "regression data" + ] + } + } +} diff --git a/configs/skl_config.json b/configs/skl_config.json deleted file mode 100644 index f3f1fa93f..000000000 --- a/configs/skl_config.json +++ /dev/null @@ -1,789 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 1000, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, - "n_features": 50, - "training": { - "n_samples": 10000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 5, - "maxiter": 50, - "init": "k-means++", - "tol": 0.0 - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - 
"n_features": 50, - "training": { - "n_samples": 3000000 - } - } - ], - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 20, - "maxiter": 50, - "tol": 0.0 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 2000, - "training": { - "n_samples": 10000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 1000, - "training": { - "n_samples": 30000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 4000, - "training": { - "n_samples": 6000 - } - } - ], - "svd-solver": "full", - "n-components": 10 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - }, - { - "source": "npy", - "name": "airline-ohe", - "training": - { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": - { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, - { - "algorithm": "df_regr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - }, - { - "source": "npy", - "name": "airline_regression", - "training": - { - "x": "data/airline_regression_x_train.npy", - "y": "data/airline_regression_y_train.npy" - }, - "testing": - { - "x": "data/airline_regression_x_test.npy", - "y": "data/airline_regression_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "alpha": 5 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ] - }, - { - "algorithm": "log_reg", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "maxiter": 100, - "tol": 0 - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": 
"data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "C": 500.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "rbf" - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "C": 100.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "nu": 0.25, - "kernel": "sigmoid" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ], - "nu": 0.7, - "kernel": "rbf" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "nu": 0.5, - "kernel": "rbf" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ], - "nu": 0.15, - "kernel": "poly" - }, - { - "algorithm": "svr", - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly" - }, - { - "algorithm": "svr", - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": 
"data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "C": 10.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvr", - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvr", - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ], - "nu": 0.5, - "C": 10.0, - "kernel": "poly", - "degree": 2 - }, - { - "algorithm": "nusvr", - "dataset": [ - { - "source": "npy", - "name": "yolanda", - "training": - { - "x": "data/yolanda_x_train.npy", - "y": "data/yolanda_y_train.npy" - }, - "testing": - { - "x": "data/yolanda_x_test.npy", - "y": "data/yolanda_y_test.npy" - } - } - ], - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, - "n_features": 50, - "training": { - "n_samples": 500000 - } - } - ] - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "method": ["brute", "kd_tree"] - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 5000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10000, - "training": { - "n_samples": 10000 - } - } - ], - "include-y": "", - "train-size": 0.75, - "test-size": 0.25 - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "npy", - "name": "census", - "training": - { - "x": "data/census_x_train.npy", - "y": "data/census_y_train.npy" - } - } - ], - "data-format": "numpy", - "data-order": "C", - "include-y": "", - "train-size": 0.9, - "test-size": 0.1 - }, - { - "algorithm": "lasso", - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": - { - "x": 
"data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "alpha": 1.0, - "tol": 1e-4 - }, - { - "algorithm": "elasticnet", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - }, - { - "algorithm": "tsne", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/skl_public_config.json b/configs/skl_public_config.json deleted file mode 100644 index 4aa6feaad..000000000 --- a/configs/skl_public_config.json +++ /dev/null @@ -1,1007 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "method": ["brute", "kd_tree"], - "n-neighbors": [5, 100] - }, - { - "algorithm": "knn_clsf", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "method": ["brute", "kd_tree"], - "n-neighbors": 7 - }, - { - "algorithm": "knn_clsf", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "method": ["brute", "kd_tree"], - "n-neighbors": 5 - }, - { - "algorithm": "knn_regr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "method": ["brute", "kd_tree"], - "n-neighbors": 5 - }, - { - "algorithm": "knn_regr", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "method": ["brute", "kd_tree"], - "n-neighbors": 7 - }, - { - "algorithm": "knn_regr", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 7 - }, - { - "algorithm": "knn_regr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": 
"data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 5 - }, - { - "algorithm": "pca", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - } - } - ] - }, - { - "algorithm": "pca", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "epsilon_30K", - "training": - { - "x": "data/epsilon_30K_x_train.npy", - "y": "data/epsilon_30K_y_train.npy" - } - } - ] - }, - { - "algorithm": "pca", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - } - } - ] - }, - { - "algorithm": "dbscan", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "hepmass_10K_cluster", - "training": - { - "x": "data/hepmass_10K_cluster.npy" - } - } - ], - "eps": 5, - "min-samples": 3 - }, - { - "algorithm": "dbscan", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "mnist_10K_cluster", - "training": - { - "x": "data/mnist_10K_cluster.npy" - } - } - ], - "eps": 1.7e3, - "min-samples": 3 - }, - { - "algorithm": "dbscan", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "road_network_20K_cluster", - "training": - { - "x": "data/road_network_20K_cluster.npy" - } - } - ], - "eps": 1.0e3, - "min-samples": 220 - }, - { - "algorithm": "log_reg", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "solver": "saga", - "maxiter": "20", - "tol": 1e-3 - }, - { - "algorithm": "log_reg", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "C": 0.2, - "maxiter": "500", - "tol": 1e-2 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "max-features": "sqrt", - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "num-trees": 10, - "max-depth": 5 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "max-features": "sqrt", - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "num-trees": 100, - "max-depth": 8 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "max-features": "sqrt", - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "num-trees": 20, - "max-depth": 16 - }, - { - "algorithm": "df_clsf", - "dtype": "float32", - "max-features": "sqrt", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": 
"data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "num-trees": 100, - "max-depth": 10 - }, - { - "algorithm": "df_clsf", - "dtype": ["float32", "float64"], - "max-features": "sqrt", - "dataset": [ - { - "source": "npy", - "name": "hepmass_150K", - "training": - { - "x": "data/hepmass_150K_x_train.npy", - "y": "data/hepmass_150K_y_train.npy" - }, - "testing": - { - "x": "data/hepmass_150K_x_test.npy", - "y": "data/hepmass_150K_y_test.npy" - } - } - ], - "num-trees": 50, - "max-depth": 15 - }, - { - "algorithm": "df_regr", - "dtype": ["float32", "float64"], - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "num-trees": 10, - "max-depth": 5 - }, - { - "algorithm": "df_regr", - "dtype": "float32", - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "num-trees": 20, - "max-depth": 8 - }, - { - "algorithm": "df_regr", - "dtype": "float32", - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs_10500K", - "training": - { - "x": "data/higgs_10500K_x_train.npy", - "y": "data/higgs_10500K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_10500K_x_test.npy", - "y": "data/higgs_10500K_y_test.npy" - } - } - ], - "num-trees": 100, - "max-depth": 8 - }, - { - "algorithm": "df_regr", - "dtype": "float32", - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs_10500K", - "training": - { - "x": "data/higgs_10500K_x_train.npy", - "y": "data/higgs_10500K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_10500K_x_test.npy", - "y": "data/higgs_10500K_y_test.npy" - } - } - ], - "num-trees": 20, - "max-depth": 16 - }, - { - "algorithm": "svm", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "C": 500.0, - "kernel": "linear" - }, - { - "algorithm": "svm", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "epsilon_16K", - "training": - { - "x": "data/epsilon_16K_x_train.npy", - "y": "data/epsilon_16K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_16K_x_test.npy", - "y": "data/epsilon_16K_y_test.npy" - } - } - ], - "C": 9.0e2, - "kernel": "rbf" - }, - { - 
"algorithm": "nusvc", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "nu": 0.25, - "kernel": "sigmoid" - }, - { - "algorithm": "nusvc", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "nu": 0.5, - "kernel": "rbf" - }, - { - "algorithm": "svr", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly" - }, - { - "algorithm": "svr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "C": 10.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvr", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvr", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ], - "nu": 0.5, - "C": 10.0, - "kernel": "poly", - "degree": 2 - }, - { - "algorithm": "linear", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "alpha": 5 - }, - { - "algorithm": "ridge", - "dtype": "float32", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] 
- }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "higgs_one_m_clustering", - "training": - { - "x": "data/higgs_one_m_clustering.npy" - } - } - ], - "n-clusters": 10, - "maxiter": 100 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "higgs_one_m_clustering", - "training": - { - "x": "data/higgs_one_m_clustering.npy" - } - } - ], - "n-clusters": [100, 250], - "maxiter": 10 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "epsilon_50K_cluster", - "training": - { - "x": "data/epsilon_50K_cluster.npy" - } - } - ], - "n-clusters": [512, 1024], - "maxiter": 10 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "hepmass_1M_cluster", - "training": - { - "x": "data/hepmass_1M_cluster.npy" - } - } - ], - "n-clusters": 100, - "maxiter": 10 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "hepmass_1M_cluster", - "training": - { - "x": "data/hepmass_1M_cluster.npy" - } - } - ], - "n-clusters": 10, - "maxiter": 100 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "susy_cluster", - "training": - { - "x": "data/susy_cluster.npy" - } - } - ], - "n-clusters": 10, - "maxiter": 100 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "susy_cluster", - "training": - { - "x": "data/susy_cluster.npy" - } - } - ], - "n-clusters": [100 , 250], - "maxiter": 10 - }, - { - "algorithm": "kmeans", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "cifar_cluster", - "training": - { - "x": "data/cifar_cluster.npy" - } - } - ], - "n-clusters": [512, 1024, 2048], - "maxiter": 10 - }, - { - "algorithm": "elasticnet", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - }, - { - "algorithm": "lasso", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "alpha": 1.0, - "tol": 1e-4 - }, - { - "algorithm": "tsne", - "dtype": ["float32", "float64"], - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/skl_xpu_config.json b/configs/skl_xpu_config.json deleted file mode 100644 index 1cc591b07..000000000 --- a/configs/skl_xpu_config.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 50, - "training": { - "n_samples": 1000000 - } - } 
- ], - "n-clusters": 10 - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 50, - "training": { - "n_samples": 10000 - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 50, - "training": { - "n_samples": 1000000 - } - } - ] - }, - { - "algorithm": "log_reg", - "solver":["lbfgs", "newton-cg"], - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 100000 - } - } - ] - } - ] -} diff --git a/configs/sklearn/performance/dbscan.json b/configs/sklearn/performance/dbscan.json deleted file mode 100644 index 64dacc40a..000000000 --- a/configs/sklearn/performance/dbscan.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "dbscan", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, - "n_features": 50, - "training": { - "n_samples": 500000 - } - } - ], - "workload-size": "medium" - }, - { - "dataset": [ - { - "source": "npy", - "name": "hepmass_10K_cluster", - "training": - { - "x": "data/hepmass_10K_cluster.npy" - } - } - ], - "workload-size": "small", - "eps": 5, - "min-samples": 3 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist_10K_cluster", - "training": - { - "x": "data/mnist_10K_cluster.npy" - } - } - ], - "workload-size": "medium", - "eps": 1.7e3, - "min-samples": 3 - }, - { - "dataset": [ - { - "source": "npy", - "name": "road_network_20K_cluster", - "training": - { - "x": "data/road_network_20K_cluster.npy" - } - } - ], - "workload-size": "small", - "eps": 1.0e3, - "min-samples": 220 - } - ] -} diff --git a/configs/sklearn/performance/df_clsf.json b/configs/sklearn/performance/df_clsf.json deleted file mode 100644 index 3c0bc477c..000000000 --- a/configs/sklearn/performance/df_clsf.json +++ /dev/null @@ -1,165 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "df_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "max-features": "sqrt", - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, - { - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": - { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": - { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 50, - "max-depth": 16, - "max-leaf-nodes": 131072, - "max-features": 0.2 - }, 
- { - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 10, - "max-depth": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100, - "max-depth": 8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 20, - "max-depth": 16 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100, - "max-depth": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "hepmass_150K", - "training": - { - "x": "data/hepmass_150K_x_train.npy", - "y": "data/hepmass_150K_y_train.npy" - }, - "testing": - { - "x": "data/hepmass_150K_x_test.npy", - "y": "data/hepmass_150K_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 50, - "max-depth": 15 - } - ] -} diff --git a/configs/sklearn/performance/df_regr.json b/configs/sklearn/performance/df_regr.json deleted file mode 100644 index c757f1f02..000000000 --- a/configs/sklearn/performance/df_regr.json +++ /dev/null @@ -1,251 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "df_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "airline_regression", - "training": - { - "x": "data/airline_regression_x_train.npy", - "y": "data/airline_regression_y_train.npy" - }, - "testing": - { - "x": "data/airline_regression_x_test.npy", - "y": "data/airline_regression_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100 - }, - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 10, - "max-depth": 5 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 
100, - "max-depth": 5 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": 20, - "max-depth": 8 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100, - "max-depth": 8 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 20, - "max-depth": 16 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "workload-size": "medium", - "num-trees": [15, 20], - "max-depth": 8 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100, - "max-depth": 8 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs_10500K", - "training": - { - "x": "data/higgs_10500K_x_train.npy", - "y": "data/higgs_10500K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_10500K_x_test.npy", - "y": "data/higgs_10500K_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 100, - "max-depth": 8 - }, - { - "max-features": 0.33, - "dataset": [ - { - "source": "npy", - "name": "higgs_10500K", - "training": - { - "x": "data/higgs_10500K_x_train.npy", - "y": "data/higgs_10500K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_10500K_x_test.npy", - "y": "data/higgs_10500K_y_test.npy" - } - } - ], - "workload-size": "large", - "num-trees": 20, - "max-depth": 16 - } - ] -} diff --git a/configs/sklearn/performance/elasticnet.json b/configs/sklearn/performance/elasticnet.json deleted file mode 100644 index 896076a08..000000000 --- a/configs/sklearn/performance/elasticnet.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "elasticnet", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "medium", - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - } - ] -} diff --git 
a/configs/sklearn/performance/kmeans.json b/configs/sklearn/performance/kmeans.json deleted file mode 100644 index b466ea5d9..000000000 --- a/configs/sklearn/performance/kmeans.json +++ /dev/null @@ -1,221 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "kmeans", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "workload-size": "medium", - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 1000, - "maxiter": 50, - "tol": 0.0 - }, - { - "device": "none", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, - "n_features": 50, - "training": { - "n_samples": 10000000 - } - } - ], - "workload-size": "medium", - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 5, - "maxiter": 50, - "init": "k-means++", - "tol": 0.0 - }, - { - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - "n_features": 50, - "training": { - "n_samples": 3000000 - } - } - ], - "workload-size": "medium", - "time-method": "box_filter", - "time-limit": 50, - "n-clusters": 20, - "maxiter": 50, - "tol": 0.0 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_one_m_clustering", - "training": - { - "x": "data/higgs_one_m_clustering.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": 10, - "maxiter": 100 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_one_m_clustering", - "training": - { - "x": "data/higgs_one_m_clustering.npy" - } - } - ], - "workload-size": "small", - "n-clusters": 100, - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_one_m_clustering", - "training": - { - "x": "data/higgs_one_m_clustering.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": 250, - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_50K_cluster", - "training": - { - "x": "data/epsilon_50K_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": [512, 1024, 2048], - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "hepmass_1M_cluster", - "training": - { - "x": "data/hepmass_1M_cluster.npy" - } - } - ], - "workload-size": "small", - "n-clusters": 100, - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "hepmass_1M_cluster", - "training": - { - "x": "data/hepmass_1M_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": 250, - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "hepmass_1M_cluster", - "training": - { - "x": "data/hepmass_1M_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": 10, - "maxiter": 100 - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy_cluster", - "training": - { - "x": "data/susy_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": 10, - "maxiter": 100 - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy_cluster", - "training": - { - "x": "data/susy_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": [100 , 250], - "maxiter": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_cluster", - "training": - { - "x": "data/cifar_cluster.npy" - } - } - ], - "workload-size": "medium", - "n-clusters": [512, 1024, 2048], - "maxiter": 10 - } - ] -} diff --git 
a/configs/sklearn/performance/knn_clsf.json b/configs/sklearn/performance/knn_clsf.json deleted file mode 100644 index ac556c407..000000000 --- a/configs/sklearn/performance/knn_clsf.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - } - ], - "workload-size": "medium", - "method": "brute" - }, - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - } - ], - "workload-size": "small", - "method": "brute" - }, - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "workload-size": "large", - "method": "brute" - }, - { - "device": "none", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 3, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10, - "training": { - "n_samples": 100000 - }, - "testing": { - "n_samples": 100000 - } - } - ], - "workload-size": "medium", - "method": "kd_tree" - }, - { - "device": "none", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 50, - "training": { - "n_samples": 20000 - }, - "testing": { - "n_samples": 20000 - } - } - ], - "workload-size": "small", - "method": "kd_tree" - }, - { - "device": "none", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 10, - "n_features": 16, - "training": { - "n_samples": 250000 - }, - "testing": { - "n_samples": 250000 - } - } - ], - "workload-size": "large", - "method": "kd_tree" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": [2, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - }, - { - "source": "npy", - "name": "hepmass_150K", - "training": - { - "x": "data/hepmass_150K_x_train.npy", - "y": "data/hepmass_150K_y_train.npy" - }, - "testing": - { - "x": "data/hepmass_150K_x_test.npy", - "y": "data/hepmass_150K_y_test.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": [5, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": 
"data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": 7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - } - } - ], - "workload-size": "medium", - "task": "search", - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "workload-size": "medium", - "task": "search", - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "workload-size": "medium", - "task": "search", - "n-neighbors": 7 - }, - { - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "method": "kd_tree", - "n-neighbors": 7 - }, - { - "algorithm": "knn_clsf", - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "method": "kd_tree", - "n-neighbors": 5 - } - ] -} diff --git a/configs/sklearn/performance/knn_regr.json b/configs/sklearn/performance/knn_regr.json deleted file mode 100644 index 38c9629b9..000000000 --- a/configs/sklearn/performance/knn_regr.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "workload-size": "medium", - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } 
- ], - "workload-size": "medium", - "n-neighbors": 7 - }, - { - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "method": "kd_tree", - "n-neighbors": 5 - }, - { - "device": "none", - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "method": "kd_tree", - "n-neighbors": 7 - } - ] -} diff --git a/configs/sklearn/performance/lasso.json b/configs/sklearn/performance/lasso.json deleted file mode 100644 index 7acaffef5..000000000 --- a/configs/sklearn/performance/lasso.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "lasso", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "medium", - "alpha": 1.0, - "tol": 1e-4 - } - ] -} diff --git a/configs/sklearn/performance/linear.json b/configs/sklearn/performance/linear.json deleted file mode 100644 index 1acc165cf..000000000 --- a/configs/sklearn/performance/linear.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "linear", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "workload-size": "medium" - }, - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "workload-size": "small" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "workload-size": "small" - } - ] -} diff --git a/configs/sklearn/performance/log_reg.json b/configs/sklearn/performance/log_reg.json deleted file mode 100644 index 09abc1e02..000000000 --- a/configs/sklearn/performance/log_reg.json +++ /dev/null @@ -1,136 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "log_reg", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": 
"classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "workload-size": "medium", - "maxiter": 100, - "tol": 0 - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "workload-size": "medium", - "maxiter": "20" - }, - { - "dataset": [ - { - "source": "npy", - "name": "susy", - "training": - { - "x": "data/susy_x_train.npy", - "y": "data/susy_y_train.npy" - }, - "testing": - { - "x": "data/susy_x_test.npy", - "y": "data/susy_y_test.npy" - } - } - ], - "workload-size": "medium", - "maxiter": "10" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "workload-size": "small", - "no-fit-intercept": "", - "maxiter": "50" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "workload-size": "medium", - "maxiter": "500" - } - ] -} diff --git a/configs/sklearn/performance/nusvc.json b/configs/sklearn/performance/nusvc.json deleted file mode 100644 index 9c82f68f1..000000000 --- a/configs/sklearn/performance/nusvc.json +++ /dev/null @@ -1,96 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "nusvc", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "workload-size": "small", - "nu": 0.25, - "kernel": "sigmoid" - }, - { - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ], - "workload-size": "large", - "nu": 0.7, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "workload-size": "medium", - "nu": 0.5, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ], - "workload-size": "medium", - "nu": 0.15, - "kernel": "poly" - } - ] -} diff --git a/configs/sklearn/performance/nusvr.json b/configs/sklearn/performance/nusvr.json deleted file mode 100644 index 702303db0..000000000 --- a/configs/sklearn/performance/nusvr.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - 
"common": { - "lib": "sklearn", - "algorithm": "nusvr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "workload-size": "small", - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ], - "workload-size": "medium", - "nu": 0.5, - "C": 10.0, - "kernel": "poly", - "degree": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "yolanda", - "training": - { - "x": "data/yolanda_x_train.npy", - "y": "data/yolanda_y_train.npy" - }, - "testing": - { - "x": "data/yolanda_x_test.npy", - "y": "data/yolanda_y_test.npy" - } - } - ], - "workload-size": "large", - "nu": 0.8, - "C": 2.0, - "kernel": "rbf" - } - ] -} diff --git a/configs/sklearn/performance/pca.json b/configs/sklearn/performance/pca.json deleted file mode 100644 index 294db5076..000000000 --- a/configs/sklearn/performance/pca.json +++ /dev/null @@ -1,107 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "pca", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - }, - "testing": { - "n_samples": 100000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 2000, - "training": { - "n_samples": 10000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 1000, - "training": { - "n_samples": 30000 - } - } - ], - "workload-size": "small", - "svd-solver": "full", - "n-components": 10 - }, - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 4000, - "training": { - "n_samples": 6000 - } - } - ], - "workload-size": "medium", - "svd-solver": "full", - "n-components": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - } - } - ], - "workload-size": "small" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_30K", - "training": - { - "x": "data/epsilon_30K_x_train.npy", - "y": "data/epsilon_30K_y_train.npy" - } - } - ], - "workload-size": "small" - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - } - } - ], - "workload-size": "medium" - } - ] -} diff --git a/configs/sklearn/performance/ridge.json b/configs/sklearn/performance/ridge.json deleted file mode 100644 index 3792589ff..000000000 --- a/configs/sklearn/performance/ridge.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "ridge", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - 
"type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ], - "workload-size": "small", - "alpha": 5 - } - ] -} diff --git a/configs/sklearn/performance/svm.json b/configs/sklearn/performance/svm.json deleted file mode 100644 index c213a195c..000000000 --- a/configs/sklearn/performance/svm.json +++ /dev/null @@ -1,303 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svm", - "data-format": "pandas", - "data-order": "C", - "dtype": "float32", - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 1000.0, - "kernel": "linear" - }, - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 100.0, - "kernel": "linear" - }, - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "workload-size": "large", - "C": 500.0, - "kernel": "linear" - }, - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 1.0, - "kernel": "rbf" - }, - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "workload-size": "large", - "C": 100.0, - "kernel": "rbf" - }, - { - "device": "none", - "data-order": "F", - "dtype": "float64", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "workload-size": "small", - "C": 1.5e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "workload-size": "small", - "C": 1.5e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", 
- "y": "data/higgs_150K_y_test.npy" - } - } - ], - "workload-size": "large", - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - } - } - ], - "workload-size": "large", - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar_binary", - "training": - { - "x": "data/cifar_binary_x_train.npy", - "y": "data/cifar_binary_y_train.npy" - }, - "testing": - { - "x": "data/cifar_binary_x_test.npy", - "y": "data/cifar_binary_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 1.0e-7, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_16K", - "training": - { - "x": "data/epsilon_16K_x_train.npy", - "y": "data/epsilon_16K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_16K_x_test.npy", - "y": "data/epsilon_16K_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 9.0e2, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype_binary", - "training": - { - "x": "data/covtype_binary_x_train.npy", - "y": "data/covtype_binary_y_train.npy" - }, - "testing": - { - "x": "data/covtype_binary_x_test.npy", - "y": "data/covtype_binary_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 1000.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - } - } - ], - "workload-size": "medium", - "C": 1000.0, - "kernel": "rbf" - } - ] -} diff --git a/configs/sklearn/performance/svr.json b/configs/sklearn/performance/svr.json deleted file mode 100644 index 0bcdaf2ce..000000000 --- a/configs/sklearn/performance/svr.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "workload-size": "small", - "C": 0.1, - "kernel": "poly" - }, - { - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "workload-size": "medium", - "C": 10.0, - "kernel": "rbf" - } - ] -} diff --git a/configs/sklearn/performance/train_test_split.json b/configs/sklearn/performance/train_test_split.json deleted file mode 100644 index 1e9aaafd0..000000000 --- a/configs/sklearn/performance/train_test_split.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "train_test_split", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 5000000 - } - }, - { - "source": 
"synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 100, - "training": { - "n_samples": 1000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 10000, - "training": { - "n_samples": 10000 - } - } - ], - "workload-size": "small", - "include-y": "", - "train-size": 0.75, - "test-size": 0.25 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_10500K", - "training": - { - "x": "data/higgs_10500K_x_train.npy", - "y": "data/higgs_10500K_y_train.npy" - } - } - ], - "workload-size": "medium", - "data-format": "numpy", - "data-order": "C", - "include-y": "", - "train-size": 0.9, - "test-size": 0.1 - } - ] -} diff --git a/configs/sklearn/performance/tsne.json b/configs/sklearn/performance/tsne.json deleted file mode 100644 index 68fdc8cb7..000000000 --- a/configs/sklearn/performance/tsne.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "tsne", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": "none" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - }, - { - "source": "npy", - "name": "cifar_10", - "training": - { - "x": "data/cifar_10_x_train.npy", - "y": "data/cifar_10_y_train.npy" - } - }, - { - "source": "npy", - "name": "epsilon_30K", - "training": - { - "x": "data/epsilon_30K_x_train.npy", - "y": "data/epsilon_30K_y_train.npy" - } - } - ], - "workload-size": "medium" - } - ] -} diff --git a/configs/sklearn_example.json b/configs/sklearn_example.json new file mode 100644 index 000000000..be5a40179 --- /dev/null +++ b/configs/sklearn_example.json @@ -0,0 +1,98 @@ +{ + "PARAMETERS_SETS": { + "common": { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu", + "sklearn_context": { "assume_finite": true } + }, + "bench": { "n_runs": 10, "time_limit": 60 } + }, + "blobs data": { + "data": { + "source": "make_blobs", + "generation_kwargs": { + "centers": 2, + "n_samples": 1000, + "n_features": [16, 64] + }, + "split_kwargs": { "ignore": true } + } + }, + "binary clsf data": { + "data": [ + { + "source": "make_classification", + "generation_kwargs": { + "n_classes": 2, + "n_samples": 2000, + "n_features": "[RANGE]pow:2:5:6", + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "test_size": 0.2 } + }, + { + "source": "fetch_openml", + "id": 1430, + "split_kwargs": { + "train_size": 1000, + "test_size": 1000, + "shuffle": true, + "random_state": 42 + } + } + ] + }, + "multi clsf data": { + "data": { + "source": "make_classification", + "generation_kwargs": { + "n_classes": 4, + "n_samples": 1000, + "n_features": 10, + "n_informative": "[SPECIAL_VALUE]0.75" + }, + "split_kwargs": { "test_size": 0.1 } + } + }, + "unsupervised algorithms": { + "algorithm": + [ + { + "estimator": "PCA", + "estimator_params": { "svd_solver": "auto", "n_components": 3 } + }, + { + "estimator": "KMeans", + "estimator_params": { "n_init": 10, "n_clusters": "[SPECIAL_VALUE]auto" }, + "estimator_methods": { "inference": "predict" } + } + ] + }, + "supervised algorithms": { + "algorithm": [ + { + "estimator": ["KNeighborsClassifier", "KNeighborsRegressor"], + "estimator_params": { "n_neighbors": 5, "algorithm": "brute" } + }, + { "estimator": "ElasticNet" }, + { "estimator": "SVC" } + ] + } + }, + "TEMPLATES": { + "multi clsf": { + 
"SETS": ["common", "multi clsf data"], + "algorithm": { + "estimator": "LogisticRegression", + "online_inference_mode": true + } + }, + "supervised": { + "SETS": ["common", "binary clsf data", "supervised algorithms"] + }, + "unsupervised": { + "SETS": ["common", "blobs data", "unsupervised algorithms"] + } + } +} diff --git a/configs/spmd/dbscan.json b/configs/spmd/dbscan.json new file mode 100644 index 000000000..b3a039c87 --- /dev/null +++ b/configs/spmd/dbscan.json @@ -0,0 +1,18 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/dbscan.json"], + "PARAMETERS_SETS": { + "spmd dbscan parameters": {} + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "common dbscan parameters", + "sklearn dbscan parameters", + "dbscan datasets", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd dbscan parameters" + ] + } + } +} diff --git a/configs/spmd/ensemble.json b/configs/spmd/ensemble.json new file mode 100644 index 000000000..da8e70361 --- /dev/null +++ b/configs/spmd/ensemble.json @@ -0,0 +1,59 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/ensemble.json"], + "PARAMETERS_SETS": { + "spmd ensemble classifier params": { + "algorithm": { + "estimator": "RandomForestClassifier" + } + }, + "spmd ensemble regressor params": { + "algorithm": { + "estimator": "RandomForestRegressor" + } + }, + "ensemble classification data": { + "data": [ + { "dataset": "skin_segmentation", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, + { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, + { "dataset": "mnist", "split_kwargs": { "train_size": 20000, "test_size": null } } + ] + }, + "ensemble regression data": { + "data": [ + { + "dataset": "road_network", + "split_kwargs": { + "train_size": 200000, "test_size": null, + "shuffle": true, "random_state": 42 + } + }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, + { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 50000, "test_size": null } }, + { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } } + ] + } + }, + "TEMPLATES": { + "ensemble classification": { + "SETS": [ + "common ensemble params", + "sklearn ensemble classifier params", + "ensemble classification data", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd ensemble classifier params" + ] + }, + "ensemble regression": { + "SETS": [ + "common ensemble params", + "sklearn ensemble regressor params", + "ensemble regression data", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd ensemble regressor params" + ] + } + } +} diff --git a/configs/spmd/kmeans.json b/configs/spmd/kmeans.json new file mode 100644 index 000000000..f9e8bb752 --- /dev/null +++ b/configs/spmd/kmeans.json @@ -0,0 +1,18 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/kmeans.json"], + "PARAMETERS_SETS": { + "spmd kmeans parameters": {} + }, + "TEMPLATES": { + "kmeans": { + "SETS": [ + "common kmeans parameters", + "sklearn kmeans parameters", + "kmeans datasets", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd kmeans parameters" + ] + } + } +} diff --git a/configs/spmd/knn.json b/configs/spmd/knn.json new file mode 100644 index 000000000..f64d26b55 --- /dev/null +++ b/configs/spmd/knn.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/knn.json"], + "PARAMETERS_SETS": { + "spmd knn parameters": 
{ + "algorithm": { + "estimator_params": { + "algorithm": "brute", + "metric": "minkowski", + "p": 2, + "weights": "uniform" + } + } + } + }, + "TEMPLATES": { + "knn regressor": { + "SETS": [ + "common knn parameters", + "sklearn knn parameters", + "brute knn algorithm - regression data", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd knn parameters" + ] + } + } +} diff --git a/configs/spmd/linear_model.json b/configs/spmd/linear_model.json new file mode 100644 index 000000000..03058374a --- /dev/null +++ b/configs/spmd/linear_model.json @@ -0,0 +1,18 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/linear_model.json"], + "PARAMETERS_SETS": { + "spmd linear parameters": {} + }, + "TEMPLATES": { + "linreg": { + "SETS": [ + "common linear parameters", + "sklearn linear parameters", + "regression datasets", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd linear parameters" + ] + } + } +} diff --git a/configs/spmd/logreg.json b/configs/spmd/logreg.json new file mode 100644 index 000000000..1c825ffae --- /dev/null +++ b/configs/spmd/logreg.json @@ -0,0 +1,22 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/logreg.json"], + "PARAMETERS_SETS": { + "spmd logreg parameters": { + "algorithm": { + "estimator_params": { "solver": "newton-cg" } + } + } + }, + "TEMPLATES": { + "logreg": { + "SETS": [ + "common logreg parameters", + "sklearn logreg parameters", + "logreg datasets", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd logreg parameters" + ] + } + } +} diff --git a/configs/spmd/pca.json b/configs/spmd/pca.json new file mode 100644 index 000000000..aa3cb15c1 --- /dev/null +++ b/configs/spmd/pca.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../regular/pca.json"], + "PARAMETERS_SETS": { + "spmd pca parameters": { + "algorithm": { + "estimator_params": { + "copy": "[REMOVE]", + "svd_solver": "[REMOVE]", + "tol": "[REMOVE]", + "iterated_power": "[REMOVE]", + "random_state": "[REMOVE]", + "method": "cov" + } + } + } + }, + "TEMPLATES": { + "pca": { + "SETS": [ + "pca parameters", + "pca datasets", + "sklearnex spmd implementation", + "spmd default parameters", + "spmd pca parameters" + ] + } + } +} diff --git a/configs/spmd/stats_covariance.json b/configs/spmd/stats_covariance.json new file mode 100644 index 000000000..d9c36a2fd --- /dev/null +++ b/configs/spmd/stats_covariance.json @@ -0,0 +1,54 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "spmd basic statistics parameters": { + "algorithm": { + "estimator": "BasicStatistics", + "estimator_methods": { + "training": "compute" + } + } + }, + "spmd covariance parameters": { + "algorithm": { + "estimator": "EmpiricalCovariance", + "estimator_params": { + "bias": true + } + } + }, + "datasets": { + "data": [ + { + "dataset": ["susy", "higgs"] + }, + { + "source": "make_blobs", + "generation_kwargs": [ + { "n_samples": 400000, "n_features": 1000, "centers": 1 }, + { "n_samples": 40000000, "n_features": 10, "centers": 1 } + ], + "split_kwargs": { "ignore": true } + } + ] + } + }, + "TEMPLATES": { + "basic statistics": { + "SETS": [ + "sklearnex spmd implementation", + "spmd default parameters", + "spmd basic statistics parameters", + "datasets" + ] + }, + "covariance": { + "SETS": [ + "sklearnex spmd implementation", + "spmd default parameters", + "spmd covariance parameters", + "datasets" + ] + } + } +} diff --git a/configs/spmd_example.json b/configs/spmd_example.json new file mode 100644 index 000000000..ea8548fe4 --- 
/dev/null +++ b/configs/spmd_example.json @@ -0,0 +1,71 @@ +{ + "PARAMETERS_SETS": { + "implementations": [ + { + "algorithm": { + "library": "sklearnex.spmd", + "device": "gpu" + }, + "data": { "distributed_split": "rank_based" }, + "bench": { + "distributor": "mpi", + "mpi_params": { "n": 2, "ppn": 2 } + } + }, + { + "algorithm": { + "library": "sklearnex", + "device": "cpu" + } + } + ], + "datasets": { + "data": [ + { + "dataset": "higgs", + "split_kwargs": { "train_size": 10000, "test_size": 10000 } + }, + { + "source": "make_regression", + "generation_kwargs": { + "n_samples": 20000, + "n_features": 100, + "noise": 1.0 + }, + "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } + } + ] + }, + "linear regression": { + "algorithm": { "estimator": "LinearRegression" } + }, + "knn regression": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "brute", "n_neighbors": 5 } + } + }, + "random forest regression": { + "algorithm": { + "estimator": "RandomForestRegressor", + "estimator_params": { + "criterion": "squared_error", + "max_features": 1.0, + "n_estimators": 10, + "max_depth": 4 + } + } + } + }, + "TEMPLATES": { + "linear regression": { + "SETS": ["implementations", "datasets", "linear regression"] + }, + "knn regression": { + "SETS": ["implementations", "datasets", "knn regression"] + }, + "random forest regression": { + "SETS": ["implementations", "datasets", "random forest regression"] + } + } +} diff --git a/configs/svm/svc_proba_cuml.json b/configs/svm/svc_proba_cuml.json deleted file mode 100755 index c765a2164..000000000 --- a/configs/svm/svc_proba_cuml.json +++ /dev/null @@ -1,222 +0,0 @@ -{ - "common": { - "lib": ["cuml"], - "data-format": ["cudf"], - "data-order": ["F"], - "dtype": ["float64"], - "max-cache-size": [2], - "probability": [""] - }, - "cases": [ - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "C": [1000.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "C": [500.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": [1.5e-3], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ], - "C": [1.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": [100.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": 
"data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "C": [50.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "C": [500.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "C": [1.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "C": [100.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ], - "C": [1000.0], - "kernel": ["linear"] - } - ] -} diff --git a/configs/svm/svc_proba_sklearn.json b/configs/svm/svc_proba_sklearn.json deleted file mode 100755 index 3ded70b29..000000000 --- a/configs/svm/svc_proba_sklearn.json +++ /dev/null @@ -1,222 +0,0 @@ -{ - "common": { - "lib": ["sklearn"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float64"], - "max-cache-size": [2], - "probability": [""] - }, - "cases": [ - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "C": [1000.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "C": [500.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": [1.5e-3], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ], - "C": [1.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": [100.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": 
- { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "C": [50.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "C": [500.0], - "kernel": ["linear"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "C": [1.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "C": [100.0], - "kernel": ["rbf"] - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ], - "C": [1000.0], - "kernel": ["linear"] - } - ] -} diff --git a/configs/testing/azure-pipelines-ci.json b/configs/testing/azure-pipelines-ci.json new file mode 100644 index 000000000..ffdf261da --- /dev/null +++ b/configs/testing/azure-pipelines-ci.json @@ -0,0 +1,137 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common parameters": { + "data": { + "split_kwargs": { + "train_size": 400, + "test_size": 100, + "shuffle": true, + "random_state": 42 + }, + "preprocessing_kwargs": { + "normalize": true + } + }, + "bench": { "n_runs": 5 }, + "algorithm": { "device": "default" } + }, + "data formats": { + "data": [ + { + "format": "numpy", + "dtype": "float64", + "order": "C" + }, + { + "format": "pandas", + "dtype": "float32", + "order": "F" + } + ] + }, + "datasets": { + "data": + [ + { + "source": "fetch_openml", + "id": 1430 + }, + { + "source": "make_classification", + "generation_kwargs": { + "n_classes": 2, + "n_samples": 500, + "n_features": 16, + "n_informative": "[SPECIAL_VALUE]0.5" + } + } + ] + }, + "algorithms": [ + { + "algorithm": { + "estimator": "DBSCAN", + "estimator_params": { "algorithm": "brute" } + } + }, + { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "init": "random", + "algorithm": "lloyd", + "max_iter": 20 + } + } + }, + { + "algorithm": { + "estimator": "PCA", + "estimator_params": { "svd_solver": "full" } + } + }, + { + "algorithm": { + "estimator": "TSNE", + "estimator_params": { "n_iter": 250 } + } + }, + { + "algorithm": { + "estimator": [ + "RandomForestClassifier", "ExtraTreesClassifier", + "RandomForestRegressor", "ExtraTreesRegressor" + ], + "estimator_params": { "n_estimators": 20 } + } + }, + { + "algorithm": { + "estimator": [ + "KNeighborsClassifier", "KNeighborsRegressor" + ], + "estimator_params": { "algorithm": ["brute", "kd_tree"] } + } + }, + { + "algorithm": { + "estimator": ["LinearRegression", "Ridge", "Lasso", "ElasticNet"] + } + }, + { + "algorithm": { + "estimator": ["SVC", "SVR"], + "estimator_params": { "max_iter": 1000 } + } + 
}, + { + "algorithm": { + "estimator": ["NuSVC", "NuSVR"], + "estimator_params": { "max_iter": 1000, "nu": 0.1 } + } + }, + { + "algorithm": { + "function": "train_test_split", + "args_order": "x_train|y_train", + "kwargs": { + "random_state": 42, + "shuffle": true + } + } + } + ] + }, + "TEMPLATES": { + "test": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common parameters", + "data formats", + "datasets", + "algorithms" + ] + } + } +} diff --git a/configs/testing/daal4py.json b/configs/testing/daal4py.json deleted file mode 100755 index 9af747f16..000000000 --- a/configs/testing/daal4py.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "common": { - "lib": "daal4py", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 2, - "training": { - "n_samples": 100 - } - } - ], - "n-clusters": 10 - }, - { - "algorithm": "df_clsf", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "num-trees": 10 - }, - { - "algorithm": "df_regr", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "num-trees": 10 - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ], - "alpha": 5 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ] - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 2, - "training": { - "n_samples": 100 - } - } - ] - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ] - }, - { - "algorithm": "distances", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ] - } - ] -} diff --git a/configs/testing/daal4py_xgboost.json b/configs/testing/daal4py_xgboost.json deleted file mode 100755 index 548ec82bf..000000000 --- a/configs/testing/daal4py_xgboost.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "common": { - "lib": "modelbuilders", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "xgb_mb", - "tree-method": "hist", - "count-dmatrix":"" - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "n-estimators": 10, - "max-depth": 8, - "objective": "multi:softprob" - } - ] -} diff --git a/configs/testing/metrics/dbscan.json b/configs/testing/metrics/dbscan.json deleted file mode 100755 index 8a35d5bcd..000000000 --- a/configs/testing/metrics/dbscan.json +++ /dev/null @@ -1,159 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "dbscan", - "data-format": "pandas", - 
"data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "eps": [0.5] - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "eps": [0.5] - }, - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "eps": 18800 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "eps": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "eps": 0.5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "eps": 0.5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "eps": 0.5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "eps": 0.5 - } - ] -} diff --git a/configs/testing/metrics/elasticnet.json b/configs/testing/metrics/elasticnet.json deleted file mode 100755 index c64044f89..000000000 --- a/configs/testing/metrics/elasticnet.json +++ /dev/null @@ -1,116 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "elasticnet", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "alpha": 0.005, - "tol": 1e-4, - "l1_ratio": 0.85 - }, - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "alpha": 0.01, - "tol": 1e-4, - "l1_ratio": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "alpha": 0.0625, - "tol": 1e-4, - 
"l1_ratio": 0.75 - }, - { - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "alpha": 0.006, - "tol": 1e-4, - "l1_ratio": 0.25 - }, - { - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ], - "alpha": 0.15, - "tol": 1e-4, - "l1_ratio": 0.4 - } - ] -} diff --git a/configs/testing/metrics/kmeans.json b/configs/testing/metrics/kmeans.json deleted file mode 100755 index 427844912..000000000 --- a/configs/testing/metrics/kmeans.json +++ /dev/null @@ -1,273 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "kmeans", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "time-method": "box_filter", - "time-limit": 50, - "n_init": 10, - "maxiter": 300, - "tol": 1e-4, - "init": "k-means++" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ], - "n-clusters": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "n-clusters": 45 - }, - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "n-clusters": 70 - }, - { - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ], - "n-clusters": 60 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "n-clusters": 25 - }, - { - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ], - "n-clusters": 40 - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "n-clusters": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ], - "n-clusters": 8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": 
"data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ], - "n-clusters": 12 - }, - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "n-clusters": 3 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "n-clusters": 35 - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "n-clusters": 9 - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ], - "n-clusters": 10 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ], - "n-clusters": 20 - } - ] -} diff --git a/configs/testing/metrics/knn_brute_clsf.json b/configs/testing/metrics/knn_brute_clsf.json deleted file mode 100755 index 8903055b8..000000000 --- a/configs/testing/metrics/knn_brute_clsf.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "method": "brute" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - }, - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - }, - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - }, - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - }, - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/knn_brute_regr.json b/configs/testing/metrics/knn_brute_regr.json deleted file mode 100755 index 46edbb2fc..000000000 --- a/configs/testing/metrics/knn_brute_regr.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - 
"training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - }, - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - }, - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - }, - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/knn_kdtree_clsf.json b/configs/testing/metrics/knn_kdtree_clsf.json deleted file mode 100755 index ea15e071a..000000000 --- a/configs/testing/metrics/knn_kdtree_clsf.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "method": "kd_tree" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - }, - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - }, - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - }, - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/knn_kdtree_regr.json b/configs/testing/metrics/knn_kdtree_regr.json deleted file mode 100755 index 21a8dd3b1..000000000 --- a/configs/testing/metrics/knn_kdtree_regr.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "method": "kd_tree" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - }, - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - }, - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - 
} - }, - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - }, - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/lasso.json b/configs/testing/metrics/lasso.json deleted file mode 100755 index 53daa8daa..000000000 --- a/configs/testing/metrics/lasso.json +++ /dev/null @@ -1,107 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "lasso", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "tol": 1e-4 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "alpha": -0.0025 - }, - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "alpha": 0.015625 - }, - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "alpha": 0.0625 - }, - { - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ], - "alpha": -0.0625 - }, - { - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ], - "alpha": 0.03125 - } - ] -} diff --git a/configs/testing/metrics/linreg.json b/configs/testing/metrics/linreg.json deleted file mode 100644 index 704aa7c04..000000000 --- a/configs/testing/metrics/linreg.json +++ /dev/null @@ -1,143 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "linear", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": 
"data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "yolanda", - "training": - { - "x": "data/yolanda_x_train.npy", - "y": "data/yolanda_y_train.npy" - }, - "testing": - { - "x": "data/yolanda_x_test.npy", - "y": "data/yolanda_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/logreg.json b/configs/testing/metrics/logreg.json deleted file mode 100755 index 1f4e3bab9..000000000 --- a/configs/testing/metrics/logreg.json +++ /dev/null @@ -1,171 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "log_reg", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "maxiter": 5000, - "tol": 0 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - 
"testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/nusvc.json b/configs/testing/metrics/nusvc.json deleted file mode 100755 index 83755f2a6..000000000 --- a/configs/testing/metrics/nusvc.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "nusvc", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "nu": 0.07, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "nu": 0.25, - "kernel": "sigmoid" - }, - { - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "nu": 0.25, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "nu": 0.01, - "kernel": "rbf" - } - ] -} diff --git a/configs/testing/metrics/nusvr.json b/configs/testing/metrics/nusvr.json deleted file mode 100755 index 1f3b2981c..000000000 --- a/configs/testing/metrics/nusvr.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "nusvr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly", - "nu": 0.17 - }, - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "C": 2.0, - "kernel": "rbf", - "nu": 0.8 - } - ] -} diff --git a/configs/testing/metrics/pca.json b/configs/testing/metrics/pca.json deleted file 
mode 100755 index 7479666bc..000000000 --- a/configs/testing/metrics/pca.json +++ /dev/null @@ -1,153 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "pca", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "svd-solver": "full" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - } - } - ], - "n-components": 0.8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": - { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - } - } - ], - "n-components": 0.6 - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - } - } - ], - "n-components": 0.8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - } - } - ], - "n-components": 0.6 - }, - { - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - } - } - ], - "n-components": 0.8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - } - } - ], - "n-components": 0.6 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - "y": "data/ijcnn_y_train.npy" - } - } - ], - "n-components": 0.8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - } - } - ], - "n-components": 0.6 - }, - { - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - } - } - ], - "n-components": 0.8 - }, - { - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - } - } - ], - "n-components": 0.8 - } - ] -} diff --git a/configs/testing/metrics/rf_clsf.json b/configs/testing/metrics/rf_clsf.json deleted file mode 100755 index bfdaaf730..000000000 --- a/configs/testing/metrics/rf_clsf.json +++ /dev/null @@ -1,188 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "df_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "num-trees": 500 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "codrnanorm", - "training": - { - "x": "data/codrnanorm_x_train.npy", - "y": "data/codrnanorm_y_train.npy" - }, - "testing": - { - "x": "data/codrnanorm_x_test.npy", - "y": "data/codrnanorm_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "ijcnn", - "training": - { - "x": "data/ijcnn_x_train.npy", - 
"y": "data/ijcnn_y_train.npy" - }, - "testing": - { - "x": "data/ijcnn_x_test.npy", - "y": "data/ijcnn_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "klaverjas", - "training": - { - "x": "data/klaverjas_x_train.npy", - "y": "data/klaverjas_y_train.npy" - }, - "testing": - { - "x": "data/klaverjas_x_test.npy", - "y": "data/klaverjas_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "sensit", - "training": - { - "x": "data/sensit_x_train.npy", - "y": "data/sensit_y_train.npy" - }, - "testing": - { - "x": "data/sensit_x_test.npy", - "y": "data/sensit_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "covertype", - "training": - { - "x": "data/covertype_x_train.npy", - "y": "data/covertype_y_train.npy" - }, - "testing": - { - "x": "data/covertype_x_test.npy", - "y": "data/covertype_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/rf_regr.json b/configs/testing/metrics/rf_regr.json deleted file mode 100644 index 399dc1add..000000000 --- a/configs/testing/metrics/rf_regr.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "df_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "yolanda", - "training": - { - "x": 
"data/yolanda_x_train.npy", - "y": "data/yolanda_y_train.npy" - }, - "testing": - { - "x": "data/yolanda_x_test.npy", - "y": "data/yolanda_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/ridge.json b/configs/testing/metrics/ridge.json deleted file mode 100755 index 271e677fa..000000000 --- a/configs/testing/metrics/ridge.json +++ /dev/null @@ -1,144 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "ridge", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "alpha": 1.0 - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "twodplanes", - "training": - { - "x": "data/twodplanes_x_train.npy", - "y": "data/twodplanes_y_train.npy" - }, - "testing": - { - "x": "data/twodplanes_x_test.npy", - "y": "data/twodplanes_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "medical_charges_nominal", - "training": - { - "x": "data/medical_charges_nominal_x_train.npy", - "y": "data/medical_charges_nominal_y_train.npy" - }, - "testing": - { - "x": "data/medical_charges_nominal_x_test.npy", - "y": "data/medical_charges_nominal_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "yolanda", - "training": - { - "x": "data/yolanda_x_train.npy", - "y": "data/yolanda_y_train.npy" - }, - "testing": - { - "x": "data/yolanda_x_test.npy", - "y": "data/yolanda_y_test.npy" - } - } - ] - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/testing/metrics/svc.json b/configs/testing/metrics/svc.json deleted file mode 100755 index 60a5cd96c..000000000 --- a/configs/testing/metrics/svc.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svm", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": 
"data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 0.0015, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "a9a", - "training": - { - "x": "data/a9a_x_train.npy", - "y": "data/a9a_y_train.npy" - }, - "testing": - { - "x": "data/a9a_x_test.npy", - "y": "data/a9a_y_test.npy" - } - } - ], - "C": 500, - "kernel": "sigmoid" - }, - { - "dataset": [ - { - "source": "npy", - "name": "connect", - "training": - { - "x": "data/connect_x_train.npy", - "y": "data/connect_y_train.npy" - }, - "testing": - { - "x": "data/connect_x_test.npy", - "y": "data/connect_y_test.npy" - } - } - ], - "C": 100, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "skin_segmentation", - "training": - { - "x": "data/skin_segmentation_x_train.npy", - "y": "data/skin_segmentation_y_train.npy" - }, - "testing": - { - "x": "data/skin_segmentation_x_test.npy", - "y": "data/skin_segmentation_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "rbf" - } - ] -} diff --git a/configs/testing/metrics/svr.json b/configs/testing/metrics/svr.json deleted file mode 100755 index 7884e1edb..000000000 --- a/configs/testing/metrics/svr.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svr", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": - { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": - { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ] - }, - { - "dataset": [ - { - "source": "npy", - "name": "california_housing", - "training": - { - "x": "data/california_housing_x_train.npy", - "y": "data/california_housing_y_train.npy" - }, - "testing": - { - "x": "data/california_housing_x_test.npy", - "y": "data/california_housing_y_test.npy" - } - } - ], - "C": 0.1, - "kernel": "poly" - }, - { - "dataset": [ - { - "source": "npy", - "name": "fried", - "training": - { - "x": "data/fried_x_train.npy", - "y": "data/fried_y_train.npy" - }, - "testing": - { - "x": "data/fried_x_test.npy", - "y": "data/fried_y_test.npy" - } - } - ], - "C": 2.0, - "kernel": "rbf" - } - ] -} diff --git a/configs/testing/sklearn.json b/configs/testing/sklearn.json deleted file mode 100644 index f114ef793..000000000 --- a/configs/testing/sklearn.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64" - }, - "cases": [ - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 2, - "training": { - "n_samples": 100 - } - } - ], - "init": "k-means++", - "n-clusters": 10 - }, - { - "algorithm": "df_clsf", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "num-trees": 10 - }, - { - "algorithm": "df_regr", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "num-trees": 10 - }, - { - "algorithm": "ridge", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ], - "alpha": 5 - }, - { - "algorithm": "linear", - "dataset": [ - 
{ - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ] - }, - { - "algorithm": "log_reg", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "tol": 0.01 - }, - { - "algorithm": "svm", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "C": 10.0, - "kernel": "linear" - }, - { - "algorithm": "nusvc", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "nu": 0.1, - "kernel": "poly" - }, - { - "algorithm": "svr", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ], - "C": 10.0, - "kernel": "rbf" - }, - { - "algorithm": "nusvr", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ], - "nu": 0.1, - "C": 1.0, - "kernel": "poly", - "degree": 2 - }, - { - "algorithm": "dbscan", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 10, - "n_features": 2, - "training": { - "n_samples": 100 - } - } - ] - }, - { - "algorithm": "knn_clsf", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "method": ["brute", "kd_tree"] - }, - { - "algorithm": "knn_regr", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 200 - } - } - ], - "method": ["brute", "kd_tree"] - }, - { - "algorithm": "train_test_split", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - } - } - ], - "include-y": "", - "train-size": 0.75, - "test-size": 0.25 - }, - { - "algorithm": "lasso", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "alpha": 1.0, - "tol": 1e-4 - }, - { - "algorithm": "elasticnet", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - }, - { - "algorithm": "pca", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ] - }, - { - "algorithm": "distances", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ] - }, - { - "algorithm": "tsne", - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - 
"n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ] - } - ] -} diff --git a/configs/testing/xgboost.json b/configs/testing/xgboost.json deleted file mode 100755 index 33242a630..000000000 --- a/configs/testing/xgboost.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "common": { - "lib": "xgboost", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "gbt", - "tree-method": "hist", - "count-dmatrix":"" - }, - "cases": [ - { - "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 1000 - }, - "testing": { - "n_samples": 20 - } - } - ], - "n-estimators": 50, - "max-depth": 7, - "subsample": 0.7, - "colsample-bytree": 0.7, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_classes": 5, - "n_features": 10, - "training": { - "n_samples": 100 - }, - "testing": { - "n_samples": 20 - } - } - ], - "n-estimators": 50, - "max-depth": 8, - "learning-rate": 0.1, - "reg-alpha": 0.9, - "objective": "reg:squarederror" - } - ] -} diff --git a/configs/xgboost/xgb_cpu_additional_config.json b/configs/xgboost/xgb_cpu_additional_config.json deleted file mode 100644 index 648492c54..000000000 --- a/configs/xgboost/xgb_cpu_additional_config.json +++ /dev/null @@ -1,141 +0,0 @@ -{ - "common": { - "lib": "xgboost", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "gbt", - "tree-method": "hist", - "count-dmatrix": "", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256 - }, - "cases": [ - { - "objective": "binary:logistic", - "scale-pos-weight": 2.1067817411664587, - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 173.63348001466812, - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ] - }, - { - "objective": "multi:softmax", - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 2.0017715678375363, - "dataset": [ - { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 578.2868020304569, - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 1.8872389605086624, - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ] - }, - { - "objective": 
"reg:squarederror", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/xgboost/xgb_cpu_main_config.json b/configs/xgboost/xgb_cpu_main_config.json deleted file mode 100644 index d17585b6d..000000000 --- a/configs/xgboost/xgb_cpu_main_config.json +++ /dev/null @@ -1,197 +0,0 @@ -{ - "common": { - "lib": "xgboost", - "data-format": "pandas", - "data-order": "F", - "dtype": "float32", - "algorithm": "gbt", - "tree-method": "hist", - "count-dmatrix": "" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "reg:squarederror" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic", - "enable-experimental-json-serialization": "False", - "inplace-predict": "" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob", - "single-precision-histogram": "" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "reg:squarederror", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": 
"data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "n-estimators": 60, - "objective": "multi:softprob", - "max-depth": 7, - "subsample": 0.7, - "colsample-bytree": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - "n-estimators": 10000, - "objective": "binary:logistic", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1, - "colsample-bytree": 0.05, - "single-precision-histogram": "" - } - ] -} diff --git a/configs/xgboost/xgb_gpu_additional_config.json b/configs/xgboost/xgb_gpu_additional_config.json deleted file mode 100644 index 75036ad4b..000000000 --- a/configs/xgboost/xgb_gpu_additional_config.json +++ /dev/null @@ -1,141 +0,0 @@ -{ - "common": { - "lib": "xgboost", - "data-format": "cudf", - "data-order": "F", - "dtype": "float32", - "algorithm": "gbt", - "tree-method": "gpu_hist", - "count-dmatrix": "", - "max-depth": 8, - "learning-rate": 0.1, - "reg-lambda": 1, - "max-leaves": 256 - }, - "cases": [ - { - "objective": "binary:logistic", - "scale-pos-weight": 2.1067817411664587, - "dataset": [ - { - "source": "npy", - "name": "airline", - "training": { - "x": "data/airline_x_train.npy", - "y": "data/airline_y_train.npy" - }, - "testing": { - "x": "data/airline_x_test.npy", - "y": "data/airline_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 173.63348001466812, - "dataset": [ - { - "source": "npy", - "name": "bosch", - "training": { - "x": "data/bosch_x_train.npy", - "y": "data/bosch_y_train.npy" - }, - "testing": { - "x": "data/bosch_x_test.npy", - "y": "data/bosch_y_test.npy" - } - } - ] - }, - { - "objective": "multi:softmax", - "dataset": [ - { - "source": "npy", - "name": "covtype", - "training": { - "x": "data/covtype_x_train.npy", - "y": "data/covtype_y_train.npy" - }, - "testing": { - "x": "data/covtype_x_test.npy", - "y": "data/covtype_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 2.0017715678375363, - "dataset": [ - { - "source": "npy", - "name": "epsilon", - "training": { - "x": "data/epsilon_x_train.npy", - "y": "data/epsilon_y_train.npy" - }, - "testing": { - "x": "data/epsilon_x_test.npy", - "y": "data/epsilon_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 578.2868020304569, - "dataset": [ - { - "source": "npy", - "name": "fraud", - "training": { - "x": "data/fraud_x_train.npy", - "y": "data/fraud_y_train.npy" - }, - "testing": { - "x": "data/fraud_x_test.npy", - "y": "data/fraud_y_test.npy" - } - } - ] - }, - { - "objective": "binary:logistic", - "scale-pos-weight": 1.8872389605086624, - "dataset": [ - { - "source": "npy", - "name": "higgs", - "training": { - "x": "data/higgs_x_train.npy", - "y": "data/higgs_y_train.npy" - }, - "testing": { - "x": "data/higgs_x_test.npy", - "y": "data/higgs_y_test.npy" - } - } - ] - }, - { - "objective": "reg:squarederror", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - } - ] -} diff --git a/configs/xgboost/xgb_gpu_main_config.json b/configs/xgboost/xgb_gpu_main_config.json deleted file mode 100644 index 
11144ca35..000000000 --- a/configs/xgboost/xgb_gpu_main_config.json +++ /dev/null @@ -1,194 +0,0 @@ -{ - "common": { - "lib": "xgboost", - "data-format": "cudf", - "data-order": "F", - "dtype": "float32", - "algorithm": "gbt", - "tree-method": "gpu_hist", - "count-dmatrix": "" - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "abalone", - "training": { - "x": "data/abalone_x_train.npy", - "y": "data/abalone_y_train.npy" - }, - "testing": { - "x": "data/abalone_x_test.npy", - "y": "data/abalone_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "reg:squarederror" - }, - { - "dataset": [ - { - "source": "npy", - "name": "airline-ohe", - "training": { - "x": "data/airline-ohe_x_train.npy", - "y": "data/airline-ohe_y_train.npy" - }, - "testing": { - "x": "data/airline-ohe_x_test.npy", - "y": "data/airline-ohe_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic", - "inplace-predict": "" - }, - { - "dataset": [ - { - "source": "npy", - "name": "letters", - "training": { - "x": "data/letters_x_train.npy", - "y": "data/letters_y_train.npy" - }, - "testing": { - "x": "data/letters_x_test.npy", - "y": "data/letters_y_test.npy" - } - } - ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mlsr", - "training": { - "x": "data/mlsr_x_train.npy", - "y": "data/mlsr_y_train.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "npy", - "name": "mortgage1Q", - "training": { - "x": "data/mortgage1Q_x_train.npy", - "y": "data/mortgage1Q_y_train.npy" - } - } - ], - "n-estimators": 100, - "objective": "reg:squarederror", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 - }, - { - "dataset": [ - { - "source": "npy", - "name": "plasticc", - "training": { - "x": "data/plasticc_x_train.npy", - "y": "data/plasticc_y_train.npy" - }, - "testing": { - "x": "data/plasticc_x_test.npy", - "y": "data/plasticc_y_test.npy" - } - } - ], - "n-estimators": 60, - "objective": "multi:softprob", - "max-depth": 7, - "subsample": 0.7, - "colsample-bytree": 0.7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "santander", - "training": { - "x": "data/santander_x_train.npy", - "y": "data/santander_y_train.npy" - }, - "testing": { - "x": "data/santander_x_test.npy", - "y": "data/santander_y_test.npy" - } - } - ], - "n-estimators": 10000, - "objective": "binary:logistic", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1, - 
"colsample-bytree": 0.05 - } - ] -} diff --git a/configs/xgboost_example.json b/configs/xgboost_example.json new file mode 100644 index 000000000..de06d647f --- /dev/null +++ b/configs/xgboost_example.json @@ -0,0 +1,45 @@ +{ + "PARAMETERS_SETS": { + "common": { + "algorithm": { + "device": "cpu", + "library": "xgboost", + "estimator_params": { "max_depth": [3, 5] }, + "enable_modelbuilders": [true, false] + }, + "bench": { "n_runs": 5, "time_limit": 60 } + }, + "classification": { + "algorithm": { + "estimator": "XGBClassifier", + "estimator_params": { "scale_pos_weight": "[SPECIAL_VALUE]auto" } + }, + "data": { + "source": "make_classification", + "generation_kwargs": { + "n_classes": [2, 4], + "n_samples": 1250, + "n_features": 8, + "n_informative": "[SPECIAL_VALUE]0.75" + }, + "split_kwargs": { "test_size": 0.2 } + } + }, + "regression": { + "algorithm": { "estimator": "XGBRegressor" }, + "data": { + "source": "make_regression", + "generation_kwargs": { "n_samples": 1000, "n_features": 8 }, + "split_kwargs": { "ignore": true } + } + } + }, + "TEMPLATES": { + "classification": { + "SETS": ["common", "classification"] + }, + "regression": { + "SETS": ["common", "regression"] + } + } +} diff --git a/cuml_bench/README.md b/cuml_bench/README.md deleted file mode 100644 index edcd20735..000000000 --- a/cuml_bench/README.md +++ /dev/null @@ -1,162 +0,0 @@ - -## How to create conda environment for benchmarking -`conda create -n bench -c rapidsai -c conda-forge python=3.7 scikit-learn cuml pandas cudf tqdm` - -## Algorithms parameters - -You can launch benchmarks for each algorithm separately. The tables below list all supported parameters for each algorithm: - -- [General](#general) -- [DBSCAN](#dbscan) -- [RandomForestClassifier](#randomforestclassifier) -- [RandomForestRegressor](#randomforestregressor) -- [pairwise_distances](#pairwise_distances) -- [KMeans](#kmeans) -- [KNeighborsClassifier](#kneighborsclassifier) -- [LinearRegression](#linearregression) -- [LogisticRegression](#logisticregression) -- [PCA](#pca) -- [Ridge Regression](#ridge) -- [SVC](#svc) -- [TSNE](#tsne) -- [train_test_split](#train_test_split) - -#### General -| Parameter Name | Type | Default Value | Description | -| ----- | ---- |---- |---- | -|num-threads|int|-1| The number of threads to use| -|arch|str|?|Achine architecture, for bookkeeping| -|batch|str|?|Batch ID, for bookkeeping| -|prefix|str|sklearn|Prefix string, for bookkeeping| -|header|action|False|Output CSV header| -|verbose|action|False|Output extra debug messages| -|data-format|str|numpy|Data formats: *numpy*, *pandas* or *cudf*| -|data-order|str|C|Data order: C (row-major, default) or F (column-major)| -|dtype|np.dtype|np.float64|Data type: *float64* (default) or *float32*| -|check-finiteness|action|False|Check finiteness in sklearn input check(disabled by default)| -|output-format|str|csv|Output format: *csv* (default) or *json*'| -|time-method|str|mean_min|Method used for time mesurements| -|box-filter-measurements|int|100|Maximum number of measurements in box filter| -|inner-loops|int|100|Maximum inner loop iterations. (we take the mean over inner iterations)| -|outer-loops|int|100|Maximum outer loop iterations. (we take the min over outer iterations)| -|time-limit|float|10|Target time to spend to benchmark| -|goal-outer-loops|int|10|The number of outer loops to aim while automatically picking number of inner loops. 
If zero, do not automatically decide number of inner loops| -|seed|int|12345|Seed to pass as random_state| -|dataset-name|str|None|Dataset name| - - -#### DBSCAN -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| epsilon | float | 10 | Radius of neighborhood of a point| -| min_samples | int | 5 | The minimum number of samples required in a 'neighborhood to consider a point a core point | - -#### RandomForestClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | -|split-algorithm|str|hist|*hist* or *global_quantile*. The algorithm to determine how nodes are split in the tree| -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | store_false | True | Don't control bootstraping | - -#### RandomForestRegressor - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | -|split-algorithm|str|hist|*hist* or *global_quantile*. The algorithm to determine how nodes are split in the tree| -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | action | True | Don't control bootstraping | - -#### KMeans - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| init | str | | Initial clusters | -| tol | float | 0 | Absolute threshold | -| maxiter | int | 100 | Maximum number of iterations | -| samples-per-batch | int | 32768 | The number of samples per batch | -| n-clusters | int | | The number of clusters | - -#### KNeighborsClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-neighbors | int | 5 | The number of neighbors to use | -| weights | str | uniform | Weight function used in prediction | -| method | str | brute | Algorithm used to compute the nearest neighbors | -| metric | str | euclidean | Distance metric to use | - -#### LinearRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | -| solver | str | eig | *eig* or *svd*. Solver used for training | - -#### LogisticRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept| -| solver | str | qn | *qn*, *owl*. 
Solver to use| -| maxiter | int | 100 | Maximum iterations for the iterative solver | -| C | float | 1.0 | Regularization parameter | -| tol | float | None | Tolerance for solver | - -#### PCA - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| svd-solver | str | full | *auto*, *full* or *jacobi*. SVD solver to use | -| n-components | int | None | The number of components to find | -| whiten | action | False | Perform whitening | - -#### Ridge - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | -| solver | str | eig | *eig*, *cd* or *svd*. Solver used for training | -| alpha | float | 1.0 | Regularization strength | - -#### SVC - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| C | float | 0.01 | SVM slack parameter | -| kernel | str | linear | *linear* or *rbf*. SVM kernel function | -| gamma | float | None | Parameter for kernel="rbf" | -| max-cache-size | int | 64 | Maximum cache size for SVM. | -| tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| probability | action | True | Use probability for SVC | - -### TSNE - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-components | int | 2 | Dimension of the embedded space | -| early-exaggeration | float | 12.0 | This factor increases the attractive forces between points
and allows points to move around more freely finding their nearest neighbors more easily | -| learning-rate | float | 200.0 | The learning rate for t-SNE is usually in the range [10.0, 1000.0] | -| angle | float | 0.5 | Angular size. This is the trade-off between speed and accuracy | -| min-grad-norm | float | 1e-7 | If the gradient norm is below this threshold, the optimization is stopped | -| random-state | int | 1234 | Determines the random number generator | - -#### train_test_split - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| train-size | float | 0.75 | Size of training subset | -| test-size | float | 0.25 | Size of testing subset | -| do-not-shuffle | action | False | Do not perform data shuffle before splitting | diff --git a/cuml_bench/dbscan.py b/cuml_bench/dbscan.py deleted file mode 100644 index 03ecfc2e4..000000000 --- a/cuml_bench/dbscan.py +++ /dev/null @@ -1,52 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from cuml import DBSCAN -from sklearn.metrics.cluster import davies_bouldin_score - - -parser = argparse.ArgumentParser(description='cuML DBSCAN benchmark') -parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') -parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') -params = bench.parse_args(parser) - -# Load generated data -X, _, _, _ = bench.load_data(params) - -# Create our clustering object -dbscan = DBSCAN(eps=params.eps, - min_samples=params.min_samples) - -# Time fit -time, _ = bench.measure_function_time(dbscan.fit, X, params=params) -labels = dbscan.labels_ - -X_host = bench.convert_to_numpy(X) -labels_host = bench.convert_to_numpy(labels) - -acc = davies_bouldin_score(X_host, labels_host) -params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0) - -bench.print_output(library='cuml', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], - metrics=[acc], metric_type='davies_bouldin_score', data=[X], - alg_instance=dbscan) diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py deleted file mode 100755 index 80f659638..000000000 --- a/cuml_bench/df_clsf.py +++ /dev/null @@ -1,101 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import cuml -from cuml.ensemble import RandomForestClassifier - -parser = argparse.ArgumentParser(description='cuml random forest ' - 'classification benchmark') - -parser.add_argument('--criterion', type=str, default='gini', - choices=('gini', 'entropy'), - help='The function to measure the quality of a split') -parser.add_argument('--split-algorithm', type=str, default='hist', - choices=('hist', 'global_quantile'), - help='The algorithm to determine how ' - 'nodes are split in the tree') -parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') -parser.add_argument('--max-features', type=bench.float_or_int, default=None, - help='Upper bound on features used at each split') -parser.add_argument('--max-depth', type=int, default=None, - help='Upper bound on depth of constructed trees') -parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') -parser.add_argument('--max-leaf-nodes', type=int, default=-1, - help='Maximum leaf nodes per tree') -parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') -parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") - -params = bench.parse_args(parser) - -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True) - -if params.criterion == 'gini': - params.criterion = 0 -else: - params.criterion = 1 - -if params.split_algorithm == 'hist': - params.split_algorithm = 0 -else: - params.split_algorithm = 1 - -params.n_classes = y_train[y_train.columns[0]].nunique() - -clf = RandomForestClassifier( - split_criterion=params.criterion, - split_algo=params.split_algorithm, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaves=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, -) - - -def fit(clf, X, y): - return clf.fit(X, y) - - -def predict(clf, X): - prediction_args = {'predict_model': 'GPU'} - if int(cuml.__version__.split('.')[1]) <= 14: - prediction_args.update({'num_classes': params.n_classes}) - return clf.predict(X, **prediction_args) - - -fit_time, _ = bench.measure_function_time(fit, clf, X_train, y_train, params=params) -y_pred = predict(clf, X_train) -train_acc = 100 * bench.accuracy_score(y_pred, y_train) - -predict_time, y_pred = bench.measure_function_time(predict, clf, X_test, params=params) -test_acc = 100 * bench.accuracy_score(y_pred, y_test) - -bench.print_output(library='cuml', algorithm='df_clsf', - stages=['training', 'prediction'], - params=params, functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], metric_type='accuracy[%]', - metrics=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) diff --git a/cuml_bench/df_regr.py 
b/cuml_bench/df_regr.py deleted file mode 100644 index 1ba061468..000000000 --- a/cuml_bench/df_regr.py +++ /dev/null @@ -1,87 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import bench -from cuml.ensemble import RandomForestRegressor - -parser = argparse.ArgumentParser(description='cuml random forest ' - 'regression benchmark') - -parser.add_argument('--split-algorithm', type=str, default='hist', - choices=('hist', 'global_quantile'), - help='The algorithm to determine how ' - 'nodes are split in the tree') -parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') -parser.add_argument('--max-features', type=bench.float_or_int, default=1.0, - help='Upper bound on features used at each split') -parser.add_argument('--max-depth', type=int, default=16, - help='Upper bound on depth of constructed trees') -parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') -parser.add_argument('--max-leaf-nodes', type=int, default=-1, - help='Maximum leaf nodes per tree') -parser.add_argument('--min-impurity-decrease', type=float, default=0.0, - help='Needed impurity decrease for node splitting') -parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") - -params = bench.parse_args(parser) - -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True) - -if params.split_algorithm == 'hist': - params.split_algorithm = 0 -else: - params.split_algorithm = 1 - -# Create our random forest regressor -regr = RandomForestRegressor( - n_estimators=params.num_trees, - split_algo=params.split_algorithm, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_depth=params.max_depth, - max_leaves=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, - -) - - -def fit(regr, X, y): - return regr.fit(X, y) - - -def predict(regr, X): - return regr.predict(X, predict_model='GPU') - - -fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params) -y_pred = predict(regr, X_train) -train_rmse = bench.rmse_score(y_pred, y_train) - -predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params) -test_rmse = bench.rmse_score(y_pred, y_test) - -bench.print_output(library='cuml', algorithm='df_regr', - stages=['training', 'prediction'], params=params, - functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) diff --git a/cuml_bench/elasticnet.py b/cuml_bench/elasticnet.py deleted file mode 100755 index 2d969a886..000000000 
--- a/cuml_bench/elasticnet.py +++ /dev/null @@ -1,61 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from cuml.linear_model import ElasticNet - - -parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') -parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') -params = bench.parse_args(parser) - -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params) - -# Create our regression object -regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, - alpha=params.alpha, tol=params.tol, max_iter=params.maxiter) - -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, pred_train = bench.measure_function_time(regr.predict, X_train, - params=params) - -train_rmse = bench.rmse_score(pred_train, y_train) -pred_test = regr.predict(X_test) -test_rmse = bench.rmse_score(pred_test, y_test) - -bench.print_output(library='cuml', algorithm='elasticnet', - stages=['training', 'prediction'], params=params, - functions=['ElasticNet.fit', 'ElasticNet.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py deleted file mode 100644 index 2e3e9d9ff..000000000 --- a/cuml_bench/kmeans.py +++ /dev/null @@ -1,93 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse -import warnings -from typing import Any - -import bench -import numpy as np -from cuml import KMeans -from sklearn.metrics.cluster import davies_bouldin_score - -warnings.filterwarnings('ignore', category=FutureWarning) -parser = argparse.ArgumentParser(description='cuML K-means benchmark') -parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') -parser.add_argument('-t', '--tol', type=float, default=0., - help='Absolute threshold') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') -parser.add_argument('--samples-per-batch', type=int, default=32768, - help='Maximum number of iterations') -parser.add_argument('--n-clusters', type=int, help='Number of clusters') -params = bench.parse_args(parser, prefix='cuml', loop_types=('fit', 'predict')) - -# Load and convert generated data -X_train, X_test, _, _ = bench.load_data(params) - -X_init: Any -if params.filei == 'k-means++': - X_init = 'k-means++' -# Load initial centroids from specified path -elif params.filei is not None: - X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} - if isinstance(X_init, np.ndarray): - params.n_clusters = X_init.shape[0] -# or choose random centroids from training data -else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(low=0, high=X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].to_pandas().values - else: - X_init = X_train[centroids_idx] - - -# Workaround for cuML kmeans fail -# when second call of 'fit' method causes AttributeError -def kmeans_fit(X): - alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, - max_samples_per_batch=params.samples_per_batch) - alg.fit(X) - return alg - - -# Time fit -fit_time, kmeans = bench.measure_function_time(kmeans_fit, X_train, params=params) -train_predict = kmeans.predict(X_train) - -# Time predict -predict_time, test_predict = bench.measure_function_time(kmeans.predict, X_test, - params=params) - -X_train_host = bench.convert_to_numpy(X_train) -train_predict_host = bench.convert_to_numpy(train_predict) -acc_train = davies_bouldin_score(X_train_host, train_predict_host) - -X_test_host = bench.convert_to_numpy(X_test) -test_predict_host = bench.convert_to_numpy(test_predict) - -acc_test = davies_bouldin_score(X_test_host, test_predict_host) - -bench.print_output(library='cuml', algorithm='kmeans', - stages=['training', 'prediction'], params=params, - functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], metric_type='davies_bouldin_score', - metrics=[acc_train, acc_test], data=[X_train, X_test], - alg_instance=kmeans) diff --git a/cuml_bench/knn_clsf.py b/cuml_bench/knn_clsf.py deleted file mode 100755 index ec7d21490..000000000 --- a/cuml_bench/knn_clsf.py +++ /dev/null @@ -1,80 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from cuml.neighbors import KNeighborsClassifier - - -parser = argparse.ArgumentParser( - description='cuML kNN classifier benchmark') - -parser.add_argument('--task', default='classification', type=str, - choices=('search', 'classification'), - help='kNN task: search or classification') -parser.add_argument('--n-neighbors', default=5, type=int, - help='Number of neighbors to use') -parser.add_argument('--weights', type=str, default='uniform', - help='Weight function used in prediction') -parser.add_argument('--method', type=str, default='brute', - help='Algorithm used to compute the nearest neighbors') -parser.add_argument('--metric', type=str, default='euclidean', - help='Distance metric to use') -params = bench.parse_args(parser) - -# Load generated data -X_train, X_test, y_train, y_test = bench.load_data(params) -params.n_classes = y_train[y_train.columns[0]].nunique() - -# Create classification object -knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors, - weights=params.weights, - algorithm=params.method, - metric=params.metric) - -# Measure time and accuracy on fitting -train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train, - params=params) -if params.task == 'classification': - y_pred = knn_clsf.predict(X_train) - train_acc = 100 * bench.accuracy_score(y_pred, y_train) - -# Measure time and accuracy on prediction -if params.task == 'classification': - predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, - params=params) - test_acc = 100 * bench.accuracy_score(yp, y_test) -else: - predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, - params=params) - -if params.task == 'classification': - bench.print_output(library='cuml', - algorithm=knn_clsf.algorithm + '_knn_clsf', - stages=['training', 'prediction'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.predict'], - times=[train_time, predict_time], - metrics=[train_acc, test_acc], metric_type='accuracy[%]', - data=[X_train, X_test], alg_instance=knn_clsf) -else: - bench.print_output(library='cuml', - algorithm=knn_clsf.algorithm + '_knn_search', - stages=['training', 'search'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], - times=[train_time, predict_time], - metrics=[], metric_type=None, - data=[X_train, X_test], alg_instance=knn_clsf) diff --git a/cuml_bench/lasso.py b/cuml_bench/lasso.py deleted file mode 100755 index 9dc9e9e1c..000000000 --- a/cuml_bench/lasso.py +++ /dev/null @@ -1,58 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from cuml.linear_model import Lasso - - -parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') -parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') -params = bench.parse_args(parser) - -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params) - -# Create our regression object -regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, - tol=params.tol, max_iter=params.maxiter) -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, pred_train = bench.measure_function_time(regr.predict, X_train, - params=params) - -train_rmse = bench.rmse_score(pred_train, y_train) -pred_test = regr.predict(X_test) -test_rmse = bench.rmse_score(pred_test, y_test) - -bench.print_output(library='sklearn', algorithm='lasso', - stages=['training', 'prediction'], - params=params, functions=['Lasso.fit', 'Lasso.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) diff --git a/cuml_bench/linear.py b/cuml_bench/linear.py deleted file mode 100644 index 714454cfc..000000000 --- a/cuml_bench/linear.py +++ /dev/null @@ -1,55 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml import LinearRegression - - -parser = argparse.ArgumentParser(description='cuML linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--solver', default='eig', choices=('eig', 'svd'), - help='Solver used for training') -params = bench.parse_args(parser, prefix='cuml') - -# Load data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = LinearRegression(fit_intercept=params.fit_intercept, - algorithm=params.solver) - -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - -test_rmse = bench.rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = bench.rmse_score(yp, y_train) - -bench.print_output(library='cuml', algorithm='lin_reg', - stages=['training', 'prediction'], params=params, - functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) diff --git a/cuml_bench/log_reg.py b/cuml_bench/log_reg.py deleted file mode 100644 index 5dda0611a..000000000 --- a/cuml_bench/log_reg.py +++ /dev/null @@ -1,66 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from cuml import LogisticRegression - - -parser = argparse.ArgumentParser(description='cuML logistic ' - 'regression benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', - action='store_false', default=True, - help="Don't fit intercept") -parser.add_argument('--solver', default='qn', choices=('qn', 'owl'), - help='Solver to use.') -parser.add_argument('--linesearch-max-iter', type=int, default=50, - help='Maximum iterations per solver outer iteration') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum iterations for the iterative solver') -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=1e-10, - help='Tolerance for solver. 
Default is 1e-10.') -params = bench.parse_args(parser) - -# Load generated data -X_train, X_test, y_train, y_test = bench.load_data(params) - -params.n_classes = y_train[y_train.columns[0]].nunique() - -# Create our classifier object -clf = LogisticRegression(penalty='l2', C=params.C, - linesearch_max_iter=params.linesearch_max_iter, - fit_intercept=params.fit_intercept, verbose=params.verbose, - tol=params.tol, - max_iter=params.maxiter, solver=params.solver) - -# Time fit and predict -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -y_pred = clf.predict(X_train) -train_acc = 100 * bench.accuracy_score(y_pred, y_train) - -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) -test_acc = 100 * bench.accuracy_score(y_pred, y_test) - -bench.print_output(library='cuml', algorithm='log_reg', - stages=['training', 'prediction'], params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], metric_type='accuracy[%]', - metrics=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) diff --git a/cuml_bench/pca.py b/cuml_bench/pca.py deleted file mode 100644 index bf9b9a878..000000000 --- a/cuml_bench/pca.py +++ /dev/null @@ -1,56 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml import PCA - - -parser = argparse.ArgumentParser(description='cuML PCA benchmark') -parser.add_argument('--svd-solver', type=str, default='full', - choices=['auto', 'full', 'jacobi'], - help='SVD solver to use') -parser.add_argument('--n-components', type=int, default=None, - help='Number of components to find') -parser.add_argument('--whiten', action='store_true', default=False, - help='Perform whitening') -params = bench.parse_args(parser) - -# Load random data -X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) - -if params.n_components is None: - p, n = X_train.shape - params.n_components = min((n, (2 + min((n, p))) // 3)) - -# Create our PCA object -pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten, - n_components=params.n_components) - -# Time fit -fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params) - -# Time transform -transform_time, _ = bench.measure_function_time( - pca.transform, X_train, params=params) - -bench.print_output(library='cuml', algorithm='PCA', - stages=['training', 'transformation'], - params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], metric_type=None, - metrics=[None, None], data=[X_train, X_test], - alg_instance=pca) diff --git a/cuml_bench/ridge.py b/cuml_bench/ridge.py deleted file mode 100644 index caf80392b..000000000 --- a/cuml_bench/ridge.py +++ /dev/null @@ -1,57 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml import Ridge - - -parser = argparse.ArgumentParser(description='cuML ridge regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--solver', default='eig', choices=('eig', 'cd', 'svd'), - help='Solver used for training') -parser.add_argument('--alpha', type=float, default=1.0, - help='Regularization strength') -params = bench.parse_args(parser) - -# Load data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha, - solver=params.solver) - -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - -test_rmse = bench.rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = bench.rmse_score(yp, y_train) - -bench.print_output(library='cuml', algorithm='ridge_regr', - stages=['training', 'prediction'], params=params, - functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py deleted file mode 100644 index 0b2c0020a..000000000 --- a/cuml_bench/svm.py +++ /dev/null @@ -1,87 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml.svm import SVC - - -parser = argparse.ArgumentParser(description='cuML SVM benchmark') - -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='SVM regularization parameter') -parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='SVM kernel function') -parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') -parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') -parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for SVM.') -parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.SVC') -parser.add_argument('--probability', action='store_true', default=False, - dest='probability', help="Use probability for SVC") - -params = bench.parse_args(parser) - -X_train, X_test, y_train, y_test = bench.load_data(params) - -if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - -cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) -params.cache_size_mb = cache_size_bytes / 1024**2 -params.n_classes = y_train[y_train.columns[0]].nunique() - -clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, probability=params.probability, - degree=params.degree) - -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) - -if params.probability: - state_predict = 'predict_proba' - metric_type = 'log_loss' - clf_predict = clf.predict_proba - - def metric_call(x, y): - return bench.log_loss(x, y) -else: - state_predict = 'prediction' - metric_type = 'accuracy[%]' - clf_predict = clf.predict - - def metric_call(x, y): - return 100 * bench.accuracy_score(x, y) - - -predict_train_time, y_pred = bench.measure_function_time( - clf_predict, X_train, params=params) -train_acc = metric_call(y_train, y_pred) - -predict_test_time, y_pred = bench.measure_function_time( - clf_predict, X_test, params=params) -test_acc = metric_call(y_test, y_pred) - -bench.print_output(library='cuml', algorithm='SVC', - stages=['training', state_predict], params=params, - functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_train_time], metric_type=metric_type, - metrics=[train_acc, test_acc], data=[X_train, X_train], - alg_instance=clf) diff --git a/cuml_bench/svr.py b/cuml_bench/svr.py deleted file mode 100644 index 616b5bcbd..000000000 --- a/cuml_bench/svr.py +++ /dev/null @@ -1,71 +0,0 @@ -# =============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml.svm import SVR - - -parser = argparse.ArgumentParser(description='cuML SVR benchmark') - -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='SVR regularization parameter') -parser.add_argument('--epsilon', dest='epsilon', type=float, default=.1, - help='Epsilon in the epsilon-SVR model') -parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='SVR kernel function') -parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') -parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') -parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for SVR.') -parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.SVR') - -params = bench.parse_args(parser) - -X_train, X_test, y_train, y_test = bench.load_data(params) - -if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - -cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) -params.cache_size_mb = cache_size_bytes / 1024**2 -params.n_classes = y_train[y_train.columns[0]].nunique() - -regr = SVR(C=params.C, epsilon=params.epsilon, kernel=params.kernel, - cache_size=params.cache_size_mb, tol=params.tol, gamma=params.gamma, - degree=params.degree) - -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -predict_train_time, y_pred = bench.measure_function_time( - regr.predict, X_train, params=params) -train_rmse = bench.rmse_score(y_train, y_pred) - -predict_test_time, y_pred = bench.measure_function_time( - regr.predict, X_test, params=params) -test_rmse = bench.rmse_score(y_test, y_pred) - -bench.print_output(library='cuml', algorithm='SVR', - stages=['training', 'prediction'], params=params, - functions=['SVR.fit', 'SVR.predict'], - times=[fit_time, predict_train_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) diff --git a/cuml_bench/train_test_split.py b/cuml_bench/train_test_split.py deleted file mode 100644 index d8f70f7e6..000000000 --- a/cuml_bench/train_test_split.py +++ /dev/null @@ -1,48 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from cuml import train_test_split - - -parser = argparse.ArgumentParser( - description='cuml train_test_split benchmark') -parser.add_argument('--train-size', type=float, default=0.75, - help='Size of training subset') -parser.add_argument('--test-size', type=float, default=0.25, - help='Size of testing subset') -parser.add_argument('--do-not-shuffle', default=False, action='store_true', - help='Do not perform data shuffle before splitting') -params = bench.parse_args(parser) - -# Load generated data -X, y, _, _ = bench.load_data(params) - -tts_params = { - 'train_size': params.train_size, - 'test_size': params.test_size, - 'shuffle': not params.do_not_shuffle, - 'random_state': params.seed -} - -time, _ = bench.measure_function_time(train_test_split, X=X, y=y, params=params) - -bench.print_output(library='cuml', algorithm='train_test_split', - stages=['training'], params=params, - functions=['train_test_split'], times=[time], metrics=[None], - metric_type=None, data=[X], alg_params=tts_params) diff --git a/cuml_bench/tsne.py b/cuml_bench/tsne.py deleted file mode 100644 index 6af329a35..000000000 --- a/cuml_bench/tsne.py +++ /dev/null @@ -1,39 +0,0 @@ -import argparse -import bench -from cuml.manifold import TSNE - -parser = argparse.ArgumentParser(description='cuml tsne') - -parser.add_argument('--n-components', type=int, default=2, - help='The dimension of the embedded space.') -parser.add_argument('--early-exaggeration', type=float, default=12.0, - help='This factor increases the attractive forces between points ' - 'and allows points to move around more freely, ' - 'finding their nearest neighbors more easily.') -parser.add_argument('--learning-rate', type=float, default=200.0, - help='The learning rate for t-SNE is usually in the range [10.0, 1000.0].') -parser.add_argument('--angle', type=float, default=0.5, - help='Angular size. This is the trade-off between speed and accuracy.') -parser.add_argument('--min-grad-norm', type=float, default=1e-7, - help='If the gradient norm is below this threshold,' - 'the optimization is stopped.') -parser.add_argument('--random-state', type=int, default=1234) -params = bench.parse_args(parser) - -# Load and convert data -X, _, _, _ = bench.load_data(params) - -# Create our random forest regressor -tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration, - learning_rate=params.learning_rate, angle=params.angle, - min_grad_norm=params.min_grad_norm, random_state=params.random_state) - -fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params) -# Need to investigate how to compare sklearn and cuml metrics for tsne - -bench.print_output(library='cuml', algorithm='tsne', - stages=['training'], params=params, - functions=['tsne.fit'], - times=[fit_time], metric_type=None, - metrics=None, data=[X], - alg_instance=tsne) diff --git a/daal4py_bench/README.md b/daal4py_bench/README.md deleted file mode 100644 index c1c940ef0..000000000 --- a/daal4py_bench/README.md +++ /dev/null @@ -1,159 +0,0 @@ - -## How to create conda environment for benchmarking - -`conda create -n bench -c intel python=3.7 daal4py pandas scikit-learn tqdm` - -## Algorithms parameters - -You can launch benchmarks for each algorithm separately. 
The tables below list all supported parameters for each algorithm: - -- [General](#general) -- [DBSCAN](#dbscan) -- [RandomForestClassifier](#randomforestclassifier) -- [RandomForestRegressor](#randomforestregressor) -- [pairwise_distances](#pairwise_distances) -- [KMeans](#kmeans) -- [KNeighborsClassifier](#kneighborsclassifier) -- [LinearRegression](#linearregression) -- [LogisticRegression](#logisticregression) -- [PCA](#pca) -- [Ridge Regression](#ridge) -- [SVC](#svc) -- [train_test_split](#train_test_split) - -#### General -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -|num-threads|int|-1| The number of threads to use| -|arch|str|?|Achine architecture, for bookkeeping| -|batch|str|?|Batch ID, for bookkeeping| -|prefix|str|sklearn|Prefix string, for bookkeeping| -|header|action|False|Output CSV header| -|verbose|action|False|Output extra debug messages| -|data-format|str|numpy|Data formats: *numpy*, *pandas* or *cudf*| -|data-order|str|C|Data order: C (row-major, default) or F (column-major)| -|dtype|np.dtype|np.float64|Data type: *float64* (default) or *float32*| -|check-finiteness|action|False|Check finiteness in sklearn input check(disabled by default)| -|output-format|str|csv|Output format: *csv* (default) or *json*'| -|time-method|str|mean_min|*box_filter* or *mean_min*. Method used for time mesurements| -|box-filter-measurements|int|100|Maximum number of measurements in box filter| -|inner-loops|int|100|Maximum inner loop iterations. (we take the mean over inner iterations)| -|outer-loops|int|100|Maximum outer loop iterations. (we take the min over outer iterations)| -|time-limit|float|10|Target time to spend to benchmark| -|goal-outer-loops|int|10|The number of outer loops to aim while automatically picking number of inner loops. If zero, do not automatically decide number of inner loops| -|seed|int|12345|Seed to pass as random_state| -|dataset-name|str|None|Dataset name| - - -#### DBSCAN -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| epsilon | float | 10 | Radius of neighborhood of a point| -| min_samples | int | 5 | The minimum number of samples required in a 'neighborhood to consider a point a core point | - -#### RandomForestClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | store_false | True | Don't control bootstraping | -| use-sklearn-class | store_true | | Force use of sklearn.ensemble.RandomForestClassifier | - -#### RandomForestRegressor - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. 
The function to measure the quality of a split | -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | action | True | Don't control bootstraping | -| use-sklearn-class | action | | Force use of sklearn.ensemble.RandomForestClassifier | - -#### pairwise_distances - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| metric | str | cosine | *cosine* or *correlation* Metric to test for pairwise distances | - -#### KMeans - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| init | str | | Initial clusters | -| tol | float | 0 | Absolute threshold | -| maxiter | inte | 100 | Maximum number of iterations | -| n-clusters | int | | The number of clusters | - -#### KNeighborsClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-neighbors | int | 5 | The number of neighbors to use | -| weights | str | uniform | Weight function used in prediction | -| method | str | brute | Algorithm used to compute the nearest neighbors | -| metric | str | euclidean | Distance metric to use | - -#### LinearRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | - -#### LogisticRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept| -| multiclass | str | auto | *auto*, *ovr* or *multinomial*. How to treat multi class data| -| solver | str | lbfgs | *lbfgs*, *newton-cg* or *saga*. Solver to use| -| maxiter | int | 100 | Maximum iterations for the iterative solver | -| C | float | 1.0 | Regularization parameter | -| tol | float | None | Tolerance for solver | - -#### PCA - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| svd-solver | str | daal | *daal*, *full*. SVD solver to use | -| n-components | int | None | The number of components to find | -| whiten | action | False | Perform whitening | - -#### Ridge - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | -| solver | str | auto | Solver used for training | -| alpha | float | 1.0 | Regularization strength | - -#### SVC - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| C | float | 0.01 | SVM slack parameter | -| kernel | str | linear | *linear* or *rbf*. SVM kernel function | -| gamma | float | None | Parameter for kernel="rbf" | -| maxiter | int | 2000 | Maximum iterations for the iterative solver | -| max-cache-size | int | 64 | Maximum cache size for SVM. 
| -| tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| no-shrinking | action | True | Don't use shrinking heuristic | - -#### train_test_split - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| train-size | float | 0.75 | Size of training subset | -| test-size | float | 0.25 | Size of testing subset | -| do-not-shuffle | action | False | Do not perform data shuffle before splitting | -| include-y | action | False | Include label (Y) in splitting | -| rng | str | None | *MT19937*, *SFMT19937*, *MT2203*, *R250*, *WH*, *MCG31*, *MCG59*, *MRG32K3A*, *PHILOX4X32X10*, *NONDETERM* or None. Random numbers generator for shuffling.(only for IDP scikit-learn)| diff --git a/daal4py_bench/dbscan.py b/daal4py_bench/dbscan.py deleted file mode 100644 index d5010cfd0..000000000 --- a/daal4py_bench/dbscan.py +++ /dev/null @@ -1,54 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from daal4py import dbscan -from daal4py.sklearn._utils import getFPType - - -parser = argparse.ArgumentParser(description='daal4py DBSCAN clustering ' - 'benchmark') -parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') -parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') -params = bench.parse_args(parser, prefix='daal4py') - -# Load generated data -X, _, _, _ = bench.load_data(params, add_dtype=True) - - -# Define functions to time -def test_dbscan(X): - algorithm = dbscan( - fptype=getFPType(X), - epsilon=params.eps, - minObservations=params.min_samples, - resultsToCompute='computeCoreIndices' - ) - return algorithm.compute(X) - - -# Time clustering -time, result = bench.measure_function_time(test_dbscan, X, params=params) -params.n_clusters = int(result.nClusters[0, 0]) - -bench.print_output(library='daal4py', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], - metrics=[None], metric_type=None, data=[X]) diff --git a/daal4py_bench/df_clsf.py b/daal4py_bench/df_clsf.py deleted file mode 100644 index 0e32a1185..000000000 --- a/daal4py_bench/df_clsf.py +++ /dev/null @@ -1,130 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np -from daal4py import (decision_forest_classification_prediction, - decision_forest_classification_training, engines_mt2203) -from daal4py.sklearn._utils import getFPType -from sklearn.metrics import accuracy_score - - -def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345, - n_features_per_node=0, max_depth=0, min_impurity=0, - bootstrap=True, verbose=False): - - fptype = getFPType(X) - - features_per_node = X.shape[1] - if n_features_per_node > 0 and n_features_per_node < features_per_node: - features_per_node = n_features_per_node - - engine = engines_mt2203(seed=seed, fptype=fptype) - - algorithm = decision_forest_classification_training( - nClasses=n_classes, - fptype=fptype, - method='defaultDense', - nTrees=n_trees, - observationsPerTreeFraction=1., - featuresPerNode=features_per_node, - maxTreeDepth=max_depth, - minObservationsInLeafNode=1, - engine=engine, - impurityThreshold=min_impurity, - varImportance='MDI', - resultsToCompute='', - memorySavingMode=False, - bootstrap=bootstrap - ) - - df_clsf_result = algorithm.compute(X, y) - - return df_clsf_result - - -def df_clsf_predict(X, training_result, n_classes, verbose=False): - - algorithm = decision_forest_classification_prediction( - nClasses=n_classes, - fptype='float', # we give float here specifically to match sklearn - ) - - result = algorithm.compute(X, training_result.model) - - return result.prediction - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='daal4py random forest ' - 'classification benchmark') - - parser.add_argument('--criterion', type=str, default='gini', - choices=('gini'), - help='The function to measure the quality of a split') - parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') - parser.add_argument('--max-features', type=bench.float_or_int, default=0, - help='Upper bound on features used at each split') - parser.add_argument('--max-depth', type=int, default=0, - help='Upper bound on depth of constructed trees') - parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') - parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Maximum leaf nodes per tree') - parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') - parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', - help="Don't control bootstraping") - - params = bench.parse_args(parser, prefix='daal4py') - - # Load data - X_train, X_test, y_train, y_test = bench.load_data( - params, add_dtype=True, label_2d=True) - - params.n_classes = len(np.unique(y_train)) - if isinstance(params.max_features, float): - params.max_features = int(X_train.shape[1] * params.max_features) - - # Time fit and predict - fit_time, res = bench.measure_function_time( - df_clsf_fit, X_train, y_train, - params.n_classes, - n_trees=params.num_trees, - 
n_features_per_node=params.max_features, - max_depth=params.max_depth, - min_impurity=params.min_impurity_decrease, - bootstrap=params.bootstrap, - seed=params.seed, - params=params) - - yp = df_clsf_predict(X_train, res, params.n_classes) - train_acc = 100 * accuracy_score(yp, y_train) - - predict_time, yp = bench.measure_function_time( - df_clsf_predict, X_test, res, params.n_classes, params=params) - test_acc = 100 * accuracy_score(yp, y_test) - - bench.print_output(library='daal4py', algorithm='decision_forest_classification', - stages=['training', 'prediction'], params=params, - functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], metric_type='accuracy[%]', - metrics=[train_acc, test_acc], data=[X_train, X_test]) diff --git a/daal4py_bench/df_regr.py b/daal4py_bench/df_regr.py deleted file mode 100644 index 5ff2beb9b..000000000 --- a/daal4py_bench/df_regr.py +++ /dev/null @@ -1,127 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -from daal4py import (decision_forest_regression_prediction, - decision_forest_regression_training, engines_mt2203) -from daal4py.sklearn._utils import getFPType - - -def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0, - max_depth=0, min_impurity=0, bootstrap=True): - - fptype = getFPType(X) - - features_per_node = X.shape[1] - if n_features_per_node > 0 and n_features_per_node <= features_per_node: - features_per_node = n_features_per_node - - engine = engines_mt2203(seed=seed, fptype=fptype) - - algorithm = decision_forest_regression_training( - fptype=fptype, - method='defaultDense', - nTrees=n_trees, - observationsPerTreeFraction=1., - featuresPerNode=features_per_node, - maxTreeDepth=max_depth, - minObservationsInLeafNode=1, - engine=engine, - impurityThreshold=min_impurity, - varImportance='MDI', - resultsToCompute='', - memorySavingMode=False, - bootstrap=bootstrap - ) - - df_regr_result = algorithm.compute(X, y) - - return df_regr_result - - -def df_regr_predict(X, training_result): - - algorithm = decision_forest_regression_prediction( - fptype='float' - ) - - result = algorithm.compute(X, training_result.model) - - return result.prediction - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='daal4py random forest ' - 'regression benchmark') - - parser.add_argument('--criterion', type=str, default='mse', - choices=('mse'), - help='The function to measure the quality of a split') - parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') - parser.add_argument('--max-features', type=bench.float_or_int, default=0, - help='Upper bound on features used at each split') - parser.add_argument('--max-depth', type=int, default=0, - help='Upper bound on depth of constructed trees') - 
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') - parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Grow trees with max_leaf_nodes in best-first fashion' - 'if it is not None') - parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') - parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") - - parser.add_argument('--use-sklearn-class', action='store_true', - help='Force use of ' - 'sklearn.ensemble.RandomForestRegressor') - params = bench.parse_args(parser, prefix='daal4py') - - # Load data - X_train, X_test, y_train, y_test = bench.load_data( - params, add_dtype=True, label_2d=True) - - if isinstance(params.max_features, float): - params.max_features = int(X_train.shape[1] * params.max_features) - - # Time fit and predict - fit_time, res = bench.measure_function_time( - df_regr_fit, X_train, y_train, - n_trees=params.num_trees, - n_features_per_node=params.max_features, - max_depth=params.max_depth, - min_impurity=params.min_impurity_decrease, - bootstrap=params.bootstrap, - seed=params.seed, - params=params) - - yp = df_regr_predict(X_train, res) - train_rmse = bench.rmse_score(yp, y_train) - - predict_time, yp = bench.measure_function_time( - df_regr_predict, X_test, res, params=params) - - test_rmse = bench.rmse_score(yp, y_test) - - bench.print_output(library='daal4py', algorithm='decision_forest_regression', - stages=['training', 'prediction'], params=params, - functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py deleted file mode 100644 index 82d2c5ec8..000000000 --- a/daal4py_bench/distances.py +++ /dev/null @@ -1,48 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from daal4py import cosine_distance, correlation_distance -from daal4py.sklearn._utils import getFPType - - -def compute_distances(pairwise_distances, X): - algorithm = pairwise_distances(fptype=getFPType(X)) - return algorithm.compute(X) - - -parser = argparse.ArgumentParser(description='daal4py pairwise distances ' - 'benchmark') -parser.add_argument('--metric', default='cosine', - choices=['cosine', 'correlation'], - help='Metric to test for pairwise distances') -params = bench.parse_args(parser) - -# Load data -X, _, _, _ = bench.load_data(params, generated_data=[ - 'X_train'], add_dtype=True) - -pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance - -time, _ = bench.measure_function_time( - compute_distances, pairwise_distances, X, params=params) - -bench.print_output(library='daal4py', algorithm='distances', stages=['computation'], - params=params, functions=[params.metric.capitalize()], times=[time], - metric_type=None, metrics=[None], data=[X], - alg_params={'metric': params.metric}) diff --git a/daal4py_bench/gbt.py b/daal4py_bench/gbt.py deleted file mode 100644 index 02aa60514..000000000 --- a/daal4py_bench/gbt.py +++ /dev/null @@ -1,178 +0,0 @@ -# =============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse -import logging -import sys - -import bench -import numpy as np -from sklearn.utils import check_random_state -from sklearn import preprocessing -import daal4py as d4p - - -parser = argparse.ArgumentParser( - description='daal4py gradient boosted trees benchmark') - -parser.add_argument('--max_bins', type=int, default=256, - help='Maximum number of discrete bins to ' - 'bucket continuous features') -parser.add_argument('--min_bin_size', type=int, default=5, - help='Minimum size of discrete bins') -parser.add_argument('--max_tree_depth', type=int, default=8, - help='Maximum depth of a tree') -parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, - help='Minimum loss reduction required to make' - ' partition on a leaf node') -parser.add_argument('--n_estimators', type=int, default=100, - help='The number of gradient boosted trees') -parser.add_argument('--reg_lambda', type=float, default=1, - help='L2 regularization term on weights') -parser.add_argument('--split_method', type=str, required=False, - default='inexact', - help='The split algorithm used in daal4py') -parser.add_argument('--shrinkage', type=float, default=0.3, - help='Shrinkage rate') -parser.add_argument('--min_split_loss', type=float, default=0, - help='Minimal spilit loss') -parser.add_argument('--observations_per_tree_fraction', type=int, default=1, - help='Observations per tree fraction') -parser.add_argument('--features_per_node', type=int, default=0, - help='Features per node') -parser.add_argument('--min_observations_in_leaf_node', type=int, default=5, - help='Min observations in leaf node') -parser.add_argument('--memory_saving_mode', type=bool, default=False, - help='Enable memory saving mode') -parser.add_argument('--random_state', type=str, default=None, - help='Pass random state') -parser.add_argument('--objective', type=str, default="reg:squarederror", - help='Objective function') -parser.add_argument('--fptype', type=str, default="float", - help='FPType to use') - -params = bench.parse_args(parser) - -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params) - -if np.isnan(X_test.values).any(): - logging.warning('Nan values aren not supported in GBT DAAL fit yet') - sys.exit(1) - -# Get random seed -rs_ = check_random_state(params.random_state) -seed_ = rs_.randint(0, 2**31) - -d4p_params = { - 'split_method': params.split_method, - 'n_estimators': params.n_estimators, - 'max_tree_depth': params.max_tree_depth, - 'shrinkage': params.shrinkage, - 'min_split_loss': params.min_split_loss, - 'reg_lambda': params.reg_lambda, - 'objective': params.objective, - 'fptype': params.fptype, - 'observations_per_tree_fraction': params.observations_per_tree_fraction, - 'features_per_node': params.features_per_node, - 'min_observations_in_leaf_node': params.min_observations_in_leaf_node, - 'memory_saving_mode': params.memory_saving_mode, - 'max_bins': params.max_bins, - 'min_bin_size': params.min_bin_size, - 'random_state': params.random_state} - -if d4p_params["objective"].startswith('reg'): - task = "regression" - metric_name, metric_func = 'rmse', bench.rmse_score - train_algo = d4p.gbt_regression_training( - fptype=d4p_params["fptype"], - splitMethod=d4p_params["split_method"], - maxIterations=d4p_params["n_estimators"], - maxTreeDepth=d4p_params["max_tree_depth"], - shrinkage=d4p_params["shrinkage"], - minSplitLoss=d4p_params["min_split_loss"], - lambda_=d4p_params["reg_lambda"], 
- observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], - featuresPerNode=d4p_params["features_per_node"], - minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], - memorySavingMode=d4p_params["memory_saving_mode"], - maxBins=d4p_params["max_bins"], - minBinSize=d4p_params["min_bin_size"], - engine=d4p.engines_mcg59(seed=seed_)) -else: - task = "classification" - metric_name = 'accuracy' - metric_func = bench.accuracy_score - le = preprocessing.LabelEncoder() - le.fit(y_train) - - n_classes = len(le.classes_) - # Covtype has one class more than there is in train - if params.dataset_name == 'covtype': - n_classes += 1 - n_iterations = d4p_params["n_estimators"] - train_algo = d4p.gbt_classification_training( - fptype=d4p_params["fptype"], - nClasses=n_classes, - splitMethod=d4p_params["split_method"], - maxIterations=n_iterations, - maxTreeDepth=d4p_params["max_tree_depth"], - shrinkage=d4p_params["shrinkage"], - minSplitLoss=d4p_params["min_split_loss"], - lambda_=d4p_params["reg_lambda"], - observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], - featuresPerNode=d4p_params["features_per_node"], - minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], - memorySavingMode=d4p_params["memory_saving_mode"], - maxBins=d4p_params["max_bins"], - minBinSize=d4p_params["min_bin_size"], - engine=d4p.engines_mcg59(seed=seed_)) - - -def fit(X_train, y_train): - return train_algo.compute(X_train, y_train).model - - -def predict(X_test): # type: ignore - if task == "regression": - predict_algo = d4p.gbt_regression_prediction( - fptype=d4p_params["fptype"]) - else: - predict_algo = d4p.gbt_classification_prediction( - fptype=d4p_params["fptype"], - nClasses=n_classes, - resultsToEvaluate="computeClassLabels") - return predict_algo.compute(X_test, booster).prediction.ravel() - - -fit_time, booster = bench.measure_function_time( - fit, X_train, y_train, params=params) -train_metric = metric_func( - predict(X_train), y_train) - -predict_time, y_pred = bench.measure_function_time( - predict, X_test, params=params) -test_metric = metric_func(y_pred, y_test) - -bench.print_output( - library='daal4py', - algorithm=f'gradient_boosted_trees_{task}', - stages=['training', 'prediction'], - params=params, functions=['gbt.fit', 'gbt.predict'], - times=[fit_time, predict_time], metric_type=metric_name, - metrics=[train_metric, test_metric], - data=[X_train, X_test], - alg_params=d4p_params) diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py deleted file mode 100644 index bc9b1afe4..000000000 --- a/daal4py_bench/kmeans.py +++ /dev/null @@ -1,91 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse -from typing import Any - -import bench -import numpy as np -from daal4py import kmeans -from daal4py.sklearn._utils import getFPType - -parser = argparse.ArgumentParser(description='daal4py K-Means clustering ' - 'benchmark') -parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') -parser.add_argument('-t', '--tol', default=0., type=float, - help='Absolute threshold') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') -parser.add_argument('--n-clusters', type=int, help='Number of clusters') -params = bench.parse_args(parser, prefix='daal4py') - -# Load generated data -X_train, X_test, _, _ = bench.load_data(params, add_dtype=True) - -X_init: Any -# Load initial centroids from specified path -if params.filei is not None: - X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} - params.n_clusters = X_init.shape[0] -# or choose random centroids from training data -else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(low=0, high=X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].values - else: - X_init = X_train[centroids_idx] - - -# Define functions to time -def test_fit(X, X_init): - algorithm = kmeans( - fptype=getFPType(X), - nClusters=params.n_clusters, - maxIterations=params.maxiter, - assignFlag=True, - accuracyThreshold=params.tol - ) - return algorithm.compute(X, X_init) - - -def test_predict(X, X_init): - algorithm = kmeans( - fptype=getFPType(X), - nClusters=params.n_clusters, - maxIterations=0, - assignFlag=True, - accuracyThreshold=0.0 - ) - return algorithm.compute(X, X_init) - - -# Time fit -fit_time, res = bench.measure_function_time(test_fit, X_train, X_init, params=params) -train_inertia = float(res.objectiveFunction[0, 0]) - -# Time predict -predict_time, res = bench.measure_function_time( - test_predict, X_test, X_init, params=params) -test_inertia = float(res.objectiveFunction[0, 0]) - -bench.print_output(library='daal4py', algorithm='kmeans', - stages=['training', 'prediction'], - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], metric_type='inertia', - metrics=[train_inertia, test_inertia], data=[X_train, X_test]) diff --git a/daal4py_bench/linear.py b/daal4py_bench/linear.py deleted file mode 100644 index 8cf076427..000000000 --- a/daal4py_bench/linear.py +++ /dev/null @@ -1,72 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -from daal4py import linear_regression_prediction, linear_regression_training -from daal4py.sklearn._utils import getFPType - - -parser = argparse.ArgumentParser(description='daal4py linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--method', default='normEqDense', - choices=('normEqDense', 'qrDense'), - help='Training method used by DAAL. "normEqDense" selects' - 'the normal equations method, while "qrDense" selects' - 'the method based on QR decomposition.') - -params = bench.parse_args(parser, prefix='daal4py') - -# Generate random data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train'], add_dtype=True, - label_2d=True if params.file_X_train is not None else False) - - -# Create our regression objects -def test_fit(X, y): - regr_train = linear_regression_training(fptype=getFPType(X), - method=params.method, - interceptFlag=params.fit_intercept) - return regr_train.compute(X, y) - - -def test_predict(Xp, model): - regr_predict = linear_regression_prediction(fptype=getFPType(Xp)) - return regr_predict.compute(Xp, model) - - -# Time fit -fit_time, res = bench.measure_function_time( - test_fit, X_train, y_train, params=params) - -# Time predict -predict_time, pres = bench.measure_function_time( - test_predict, X_test, res.model, params=params) - -test_rmse = bench.rmse_score(pres.prediction, y_test) -pres = test_predict(X_train, res.model) -train_rmse = bench.rmse_score(pres.prediction, y_train) - -bench.print_output(library='daal4py', algorithm='linear_regression', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py deleted file mode 100644 index f2b4ab4d3..000000000 --- a/daal4py_bench/pca.py +++ /dev/null @@ -1,147 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -import numpy as np -from daal4py import normalization_zscore, pca, pca_transform -from daal4py.sklearn._utils import getFPType -from sklearn.utils.extmath import svd_flip - - -parser = argparse.ArgumentParser(description='daal4py PCA benchmark') -parser.add_argument('--svd-solver', type=str, - choices=['daal', 'full', 'correlation'], - default='daal', help='SVD solver to use') -parser.add_argument('--n-components', type=int, default=None, - help='Number of components to find') -parser.add_argument('--whiten', action='store_true', default=False, - help='Perform whitening') -parser.add_argument('--write-results', action='store_true', default=False, - help='Write results to disk for verification') -params = bench.parse_args(parser) - -# Load data -X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'], - add_dtype=True) - -if params.n_components is None: - p, n = X_train.shape - params.n_components = min((n, (2 + min((n, p))) // 3)) - - -# Define how to do our scikit-learn PCA using DAAL... -def pca_fit_daal(X, n_components, method): - - if n_components < 1: - n_components = min(X.shape) - - fptype = getFPType(X) - - centering_algo = normalization_zscore( - fptype=fptype, - doScale=False - ) - - pca_algorithm = pca( - fptype=fptype, - method=method, - normalization=centering_algo, - resultsToCompute='mean|variance|eigenvalue', - isDeterministic=True, - nComponents=n_components - ) - - pca_result = pca_algorithm.compute(X) - eigenvectors = pca_result.eigenvectors - eigenvalues = pca_result.eigenvalues.ravel() - singular_values = np.sqrt((X.shape[0] - 1) * eigenvalues) - - return pca_result, eigenvalues, eigenvectors, singular_values - - -def pca_transform_daal(pca_result, X, n_components, fit_n_samples, - eigenvalues, eigenvectors, - whiten=False, scale_eigenvalues=False): - - fptype = getFPType(X) - - tr_data = {} - tr_data['mean'] = pca_result.dataForTransform['mean'] - - if whiten: - if scale_eigenvalues: - tr_data['eigenvalue'] = (fit_n_samples - 1) \ - * pca_result.eigenvalues - else: - tr_data['eigenvalue'] = pca_result.eigenvalues - elif scale_eigenvalues: - tr_data['eigenvalue'] = np.full((1, pca_result.eigenvalues.size), - fit_n_samples - 1, dtype=X.dtype) - - transform_algorithm = pca_transform( - fptype=fptype, - nComponents=n_components - ) - transform_result = transform_algorithm.compute(X, pca_result.eigenvectors, - tr_data) - return transform_result.transformedData - - -def pca_fit_full_daal(X, n_components): - - fit_result, eigenvalues, eigenvectors, S = pca_fit_daal(X, min(X.shape), 'svdDense') - U = pca_transform_daal(fit_result, X, min(X.shape), X.shape[0], - eigenvalues, eigenvectors, - whiten=True, scale_eigenvalues=True) - V = fit_result.eigenvectors - - U, V = svd_flip(U, V) - - eigenvalues = fit_result.eigenvalues[:n_components].copy() - eigenvectors = fit_result.eigenvectors[:n_components].copy() - - return fit_result, eigenvalues, eigenvectors, U, S, V - - -def test_fit(X): - if params.svd_solver == 'full': - return pca_fit_full_daal(X, params.n_components) - method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense' - return pca_fit_daal(X, params.n_components, method) - - -def test_transform(Xp, pca_result, eigenvalues, eigenvectors): - return pca_transform_daal(pca_result, Xp, params.n_components, - X_train.shape[0], eigenvalues, - eigenvectors, whiten=params.whiten) - - -# Time fit -fit_time, res = 
bench.measure_function_time(test_fit, X_train, params=params) - -# Time transform -transform_time, tr = bench.measure_function_time( - test_transform, X_test, *res[:3], params=params) - -bench.print_output(library='daal4py', algorithm='pca', - stages=['training', 'transformation'], - params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], metric_type=None, - metrics=[None, None], data=[X_train, X_test], - alg_params={'svd_solver': params.svd_solver, - 'n_components': params.n_components}) diff --git a/daal4py_bench/requirements.txt b/daal4py_bench/requirements.txt deleted file mode 100644 index 0600c14d3..000000000 --- a/daal4py_bench/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -daal4py==2023.0.0 -dpcpp-cpp-rt==2023.0.0 diff --git a/daal4py_bench/ridge.py b/daal4py_bench/ridge.py deleted file mode 100644 index 38722cb60..000000000 --- a/daal4py_bench/ridge.py +++ /dev/null @@ -1,68 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np -from daal4py import ridge_regression_prediction, ridge_regression_training -from daal4py.sklearn._utils import getFPType - -parser = argparse.ArgumentParser(description='daal4py ridge regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--alpha', type=float, default=1.0, - help='Regularization strength') -params = bench.parse_args(parser, prefix='daal4py') - -# Generate random data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train'], add_dtype=True, - label_2d=True if params.file_X_train is not None else False) - - -# Create our regression objects -def test_fit(X, y): - regr_train = ridge_regression_training( - fptype=getFPType(X), ridgeParameters=np.array([[params.alpha]]), - interceptFlag=params.fit_intercept) - return regr_train.compute(X, y) - - -def test_predict(Xp, model): - regr_predict = ridge_regression_prediction(fptype=getFPType(Xp)) - return regr_predict.compute(Xp, model) - - -# Time fit -fit_time, res = bench.measure_function_time( - test_fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = bench.measure_function_time( - test_predict, X_test, res.model, params=params) - -test_rmse = bench.rmse_score(yp.prediction, y_test) -pres = test_predict(X_train, res.model) -train_rmse = bench.rmse_score(pres.prediction, y_train) - -bench.print_output(library='daal4py', algorithm='ridge_regression', - stages=['training', 'prediction'], params=params, - functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], metric_type='rmse', - metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git 
a/datasets/load_datasets.py b/datasets/load_datasets.py deleted file mode 100644 index 202b67b21..000000000 --- a/datasets/load_datasets.py +++ /dev/null @@ -1,131 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Callable, Dict - -from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, cifar_binary, codrnanorm, covtype_binary, creditcard, - epsilon, epsilon_16K, epsilon_30K, epsilon_80K, epsilon_100K, - fraud, gisette, hepmass_150K, - higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas, - santander, skin_segmentation, susy) -from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr, - mnist, msrank, plasticc, sensit) -from .loader_regression import (abalone, california_housing, fried, higgs_10500K, - medical_charges_nominal, mortgage_first_q, - twodplanes, year_prediction_msd, yolanda, airline_regression) -from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering, - hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster, - road_network_20K_cluster, susy_cluster) - -dataset_loaders: Dict[str, Callable[[Path], bool]] = { - "a9a": a_nine_a, - "abalone": abalone, - "airline": airline, - "airline-ohe": airline_ohe, - "airline_regression": airline_regression, - "bosch": bosch, - "california_housing": california_housing, - "census": census, - "cifar_binary": cifar_binary, - "cifar_cluster": cifar_cluster, - "cifar_10": cifar_10, - "codrnanorm": codrnanorm, - "connect": connect, - "covertype": covertype, - "covtype_binary": covtype_binary, - "covtype": covtype, - "creditcard": creditcard, - "epsilon": epsilon, - "epsilon_16K": epsilon_16K, - "epsilon_30K": epsilon_30K, - "epsilon_80K": epsilon_80K, - "epsilon_100K": epsilon_100K, - "epsilon_50K_cluster": epsilon_50K_cluster, - "fraud": fraud, - "fried": fried, - "gisette": gisette, - "hepmass_150K": hepmass_150K, - "hepmass_1M_cluster": hepmass_1M_cluster, - "hepmass_10K_cluster": hepmass_10K_cluster, - "higgs": higgs, - "higgs1m": higgs_one_m, - "higgs_150K": higgs_150K, - "higgs_10500K": higgs_10500K, - "higgs_one_m_clustering": higgs_one_m_clustering, - "ijcnn": ijcnn, - "klaverjas": klaverjas, - "letters": letters, - "mlsr": mlsr, - "medical_charges_nominal": medical_charges_nominal, - "mnist": mnist, - "mnist_10K_cluster": mnist_10K_cluster, - "mortgage1Q": mortgage_first_q, - "msrank": msrank, - "plasticc": plasticc, - "road_network_20K_cluster": road_network_20K_cluster, - "santander": santander, - "sensit": sensit, - "skin_segmentation": skin_segmentation, - "susy_cluster": susy_cluster, - "susy": susy, - "twodplanes": twodplanes, - "year_prediction_msd": year_prediction_msd, - "yolanda": yolanda, -} - - -def try_load_dataset(dataset_name: 
str, output_directory: Path) -> bool: - if dataset_name in dataset_loaders: - try: - return dataset_loaders[dataset_name](output_directory) - except BaseException as ex: - logging.warning(f"Internal error loading dataset:\n{ex}") - return False - else: - logging.warning(f"There is no script to download the dataset: {dataset_name}. " - "You need to add a dataset or script to load it.") - return False - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Use \'-d\' or \'--datasets\' option to enumerate ' - 'dataset(s) that should be downloaded') - parser.add_argument('-l', '--list', action='store_const', - const=True, help='The list of available datasets') - parser.add_argument('-d', '--datasets', type=str, nargs='*', - help='The datasets that should be downloaded.') - args = parser.parse_args() - - if args.list: - for key in dataset_loaders: - print(key) - sys.exit(0) - - root_dir = Path(os.environ['DATASETSROOT']) - - if args.datasets is not None: - for val in dataset_loaders.values(): - val(root_dir) - else: - logging.warning( - 'Warning: Enumerate dataset(s) that should be downloaded') diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py deleted file mode 100644 index 9ca1016cc..000000000 --- a/datasets/loader_classification.py +++ /dev/null @@ -1,1003 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import logging -import os -import subprocess -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype -from sklearn.model_selection import train_test_split - -from .loader_utils import retrieve - - -def a_nine_a(dataset_dir: Path) -> bool: - """ - Author: Ronny Kohavi","Barry Becker - libSVM","AAD group - Source: original - Date unknown - Site: http://archive.ics.uci.edu/ml/datasets/Adult - - Classification task. n_classes = 2. 
- a9a X train dataset (39073, 123) - a9a y train dataset (39073, 1) - a9a X test dataset (9769, 123) - a9a y test dataset (9769, 1) - """ - dataset_name = 'a9a' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='a9a', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - y[y == -1] = 0 - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=11) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def airline(dataset_dir: Path) -> bool: - """ - Airline dataset - http://kt.ijs.si/elena_ikonomovska/data.html - - Classification task. n_classes = 2. - airline X train dataset (92055213, 13) - airline y train dataset (92055213, 1) - airline X test dataset (23013804, 13) - airline y test dataset (23013804, 1) - """ - dataset_name = 'airline' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - cols = [ - "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", - "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", - "Origin", "Dest", "Distance", "Diverted", "ArrDelay" - ] - - # load the data as int16 - dtype = np.int16 - - dtype_columns = { - "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, - "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, - "ActualElapsedTime": dtype, "Distance": - dtype, - "Diverted": dtype, "ArrDelay": dtype, - } - - df: Any = pd.read_csv(local_url, names=cols, dtype=dtype_columns) - - # Encode categoricals as numeric - for col in df.select_dtypes(['object']).columns: - df[col] = df[col].astype("category").cat.codes - - # Turn into binary classification problem - df["ArrDelayBinary"] = 1 * (df["ArrDelay"] > 0) - - X = df[df.columns.difference(["ArrDelay", "ArrDelayBinary"]) - ].to_numpy(dtype=np.float32) - y = df["ArrDelayBinary"].to_numpy(dtype=np.float32) - del df - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def airline_ohe(dataset_dir: Path) -> bool: - """ - Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - - Classification task. n_classes = 2. 
- airline-ohe X train dataset (1000000, 692) - airline-ohe y train dataset (1000000, 1) - airline-ohe X test dataset (100000, 692) - airline-ohe y test dataset (100000, 1) - """ - dataset_name = 'airline-ohe' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv' - url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name} train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name} test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - - sets = [] - labels = [] - - categorical_names = ["Month", "DayofMonth", - "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] - - for local_url in [local_url_train, local_url_test]: - df = pd.read_csv(local_url, nrows=1000000 - if local_url.endswith('train-10m.csv') else None) - X = df.drop(labels=['dep_delayed_15min'], axis=1) - y: Any = df["dep_delayed_15min"] - - y_num = np.where(y == "Y", 1, 0) - - sets.append(X) - labels.append(y_num) - - n_samples_train = sets[0].shape[0] - - X_final: Any = pd.concat(sets) - X_final = pd.get_dummies(X_final, columns=categorical_names) - sets = [X_final[:n_samples_train], X_final[n_samples_train:]] - - for data, name in zip((sets[0], sets[1], labels[0], labels[1]), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) # type: ignore - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def bosch(dataset_dir: Path) -> bool: - """ - Bosch Production Line Performance data set - https://www.kaggle.com/c/bosch-production-line-performance - - Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api) - Contains missing values as NaN. - - TaskType:binclass - NumberOfFeatures:968 - NumberOfInstances:1.184M - """ - dataset_name = 'bosch' - os.makedirs(dataset_dir, exist_ok=True) - - filename = "train_numeric.csv.zip" - local_url = os.path.join(dataset_dir, filename) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - args = ["kaggle", "competitions", "download", "-c", - "bosch-production-line-performance", "-f", filename, "-p", - str(dataset_dir)] - _ = subprocess.check_output(args) - logging.info(f'{dataset_name} is loaded, started parsing...') - X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32) - y = X.iloc[:, -1].to_numpy(dtype=np.float32) - X.drop(labels=[X.columns[-1]], axis=1, inplace=True) - X_np = X.to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X_np, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def census(dataset_dir: Path) -> bool: - """ - # TODO: add an loading instruction - """ - return False - - -def codrnanorm(dataset_dir: Path) -> bool: - """ - Abstract: Detection of non-coding RNAs on the basis of predicted secondary - structure formation free energy change. 
- Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews. - Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - - Classification task. n_classes = 2. - codrnanorm X train dataset (390852, 8) - codrnanorm y train dataset (390852, 1) - codrnanorm X test dataset (97713, 8) - codrnanorm y test dataset (97713, 1) - """ - dataset_name = 'codrnanorm' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='codrnaNorm', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def covtype_binary(dataset_dir: Path) -> bool: - """ - Cover type dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/covertype - - y contains 7 unique class labels from 1 to 7 inclusive. - Classification task. n_classes = 7. - covtype X train dataset (464809, 54) - covtype y train dataset (464809, 1) - covtype X test dataset (116203, 54) - covtype y test dataset (116203, 1) - """ - dataset_name = 'covtype_binary' - os.makedirs(dataset_dir, exist_ok=True) - - nrows_train, nrows_test = 100000, 100000 - logging.info(f'Started loading {dataset_name}') - X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg - logging.info(f'{dataset_name} is loaded, started parsing...') - - y = (y > 3).astype(int) - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - train_size=nrows_train, - test_size=nrows_test, - shuffle=False - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def creditcard(dataset_dir: Path) -> bool: - """ - Classification task. n_classes = 2. - creditcard X train dataset (256326, 29) - creditcard y train dataset (256326, 1) - creditcard X test dataset (28481, 29) - creditcard y test dataset (28481, 1) - """ - dataset_name = 'creditcard' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='creditcard', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=777) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon X train dataset (400000, 2000) - epsilon y train dataset (400000, 1) - epsilon X test dataset (100000, 2000) - epsilon y test dataset (100000, 1) - """ - dataset_name = 'epsilon' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - X_train = X_train.toarray() - X_test = X_test.toarray() - y_train[y_train <= 0] = 0 - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_16K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. - epsilon_16K x train dataset (16000, 2000) - epsilon_16K y train dataset (16000, 1) - epsilon_16K x test dataset (16000, 2000) - epsilon_16K y test dataset (16000, 1) - """ - dataset_name = 'epsilon_16K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 16000, 16000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_30K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_30K x train dataset (30000, 2000) - epsilon_30K y train dataset (30000, 2000) - """ - dataset_name = 'epsilon_30K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - num_train, dtype = 30000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - y_train = y_train[:num_train] - - for data, name in zip((X_train, y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_100K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. - epsilon_100K x train dataset (50000, 2000) - epsilon_100K y train dataset (50000, 1) - """ - dataset_name = 'epsilon_100K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - num_train, dtype = 50000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - - for data, name in zip((X_train, y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_80K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_80K x train dataset (80000, 2000) - epsilon_80K y train dataset (80000, 1) - """ - dataset_name = 'epsilon_80K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - num_train, dtype = 80000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - - for data, name in zip((X_train, y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def fraud(dataset_dir: Path) -> bool: - """ - Credit Card Fraud Detection contest - https://www.kaggle.com/mlg-ulb/creditcardfraud - - Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api) - Contains missing values as NaN. - - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:285K - """ - dataset_name = 'fraud' - os.makedirs(dataset_dir, exist_ok=True) - - filename = "creditcard.csv" - local_url = os.path.join(dataset_dir, filename) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - args = ["kaggle", "datasets", "download", "mlg-ulb/creditcardfraud", "-f", - filename, "-p", str(dataset_dir)] - _ = subprocess.check_output(args) - logging.info(f'{dataset_name} is loaded, started parsing...') - - df = pd.read_csv(local_url + ".zip", dtype=np.float32) - X = df[[col for col in df.columns if col.startswith('V')]].to_numpy(dtype=np.float32) - y = df['Class'].to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def gisette(dataset_dir: Path) -> bool: - """ - GISETTE is a handwritten digit recognition problem. - The problem is to separate the highly confusable digits '4' and '9'. - This dataset is one of five datasets of the NIPS 2003 feature selection challenge. - - Classification task. n_classes = 2. 
- gisette X train dataset (6000, 5000) - gisette y train dataset (6000, 1) - gisette X test dataset (1000, 5000) - gisette y test dataset (1000, 1) - """ - dataset_name = 'gisette' - os.makedirs(dataset_dir, exist_ok=True) - - cache_dir = os.path.join(dataset_dir, '_gisette') - os.makedirs(cache_dir, exist_ok=True) - - domen_hhtp = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' - - gisette_train_data_url = domen_hhtp + '/gisette/GISETTE/gisette_train.data' - filename_train_data = os.path.join(cache_dir, 'gisette_train.data') - if not os.path.exists(filename_train_data): - retrieve(gisette_train_data_url, filename_train_data) - - gisette_train_labels_url = domen_hhtp + '/gisette/GISETTE/gisette_train.labels' - filename_train_labels = os.path.join(cache_dir, 'gisette_train.labels') - if not os.path.exists(filename_train_labels): - retrieve(gisette_train_labels_url, filename_train_labels) - - gisette_test_data_url = domen_hhtp + '/gisette/GISETTE/gisette_valid.data' - filename_test_data = os.path.join(cache_dir, 'gisette_valid.data') - if not os.path.exists(filename_test_data): - retrieve(gisette_test_data_url, filename_test_data) - - gisette_test_labels_url = domen_hhtp + '/gisette/gisette_valid.labels' - filename_test_labels = os.path.join(cache_dir, 'gisette_valid.labels') - if not os.path.exists(filename_test_labels): - retrieve(gisette_test_labels_url, filename_test_labels) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - num_cols = 5000 - - df_train = pd.read_csv(filename_train_data, header=None) - df_labels = pd.read_csv(filename_train_labels, header=None) - num_train = 6000 - x_train_arr = df_train.iloc[:num_train].values - x_train = pd.DataFrame(np.array([np.fromstring( - elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) # type: ignore - y_train_arr = df_labels.iloc[:num_train].values - y_train = pd.DataFrame((y_train_arr > 0).astype(int)) - - num_train = 1000 - df_test = pd.read_csv(filename_test_data, header=None) - df_labels = pd.read_csv(filename_test_labels, header=None) - x_test_arr = df_test.iloc[:num_train].values - x_test = pd.DataFrame(np.array( - [np.fromstring( - elem[0], - dtype=int, count=num_cols, sep=' ') # type: ignore - for elem in x_test_arr])) - y_test_arr = df_labels.iloc[:num_train].values - y_test = pd.DataFrame((y_test_arr > 0).astype(int)) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data.to_numpy()) - logging.info('dataset gisette is ready.') - return True - - -def hepmass_150K(dataset_dir: Path) -> bool: - """ - HEPMASS dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/HEPMASS). - - Classification task. n_classes = 2. 
- hepmass_150K X train dataset (100000, 28) - hepmass_150K y train dataset (100000, 1) - hepmass_150K X test dataset (50000, 28) - hepmass_150K y test dataset (50000, 1) - """ - dataset_name = 'hepmass_150K' - os.makedirs(dataset_dir, exist_ok=True) - - url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz' - url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz' - - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 100000, 50000, np.float32 - data_test: Any = pd.read_csv(local_url_test, delimiter=",", - compression="gzip", dtype=dtype, - nrows=nrows_test) - data_train: Any = pd.read_csv(local_url_train, delimiter=",", - compression="gzip", dtype=dtype, - nrows=nrows_train) - - x_test = np.ascontiguousarray(data_test.values[:nrows_test, 1:], dtype=dtype) - y_test = np.ascontiguousarray(data_test.values[:nrows_test, 0], dtype=dtype) - x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def higgs(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Classification task. n_classes = 2. - higgs X train dataset (8799999, 28) - higgs y train dataset (8799999, 1) - higgs X test dataset (2200000, 28) - higgs y test dataset (2200000, 1) - """ - dataset_name = 'higgs' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - higgs = pd.read_csv(local_url) - X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) - y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def higgs_one_m(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Only first 1.5M samples is taken - - Classification task. n_classes = 2. 
- higgs1m X train dataset (1000000, 28) - higgs1m y train dataset (1000000, 1) - higgs1m X test dataset (500000, 28) - higgs1m y test dataset (500000, 1) - """ - dataset_name = 'higgs1m' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 1000000, 500000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - data = data[list(data.columns[1:]) + list(data.columns[0:1])] - n_features = data.shape[1] - 1 - train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features], dtype=dtype) - train_label = np.ascontiguousarray(data.values[:nrows_train, n_features], dtype=dtype) - test_data = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, : n_features], - dtype=dtype) - test_label = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, n_features], - dtype=dtype) - for data, name in zip((train_data, test_data, train_label, test_label), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def higgs_150K(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Classification task. n_classes = 2. - higgs_150K X train dataset (100000, 28) - higgs_150K y train dataset (50000, 1) - higgs_150K X test dataset (100000, 28) - higgs_150K y test dataset (50000, 1) - """ - dataset_name = 'higgs_150K' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 100000, 50000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - x_train = np.ascontiguousarray(data.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(data.values[:nrows_train, 0], dtype=dtype) - x_test = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, 1:], - dtype=dtype) - y_test = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, 0], - dtype=dtype) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def ijcnn(dataset_dir: Path) -> bool: - """ - Author: Danil Prokhorov. - libSVM,AAD group - Cite: Danil Prokhorov. IJCNN 2001 neural network competition. - Slide presentation in IJCNN'01, - Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. - - Classification task. n_classes = 2. 
- ijcnn X train dataset (153344, 22) - ijcnn y train dataset (153344, 1) - ijcnn X test dataset (38337, 22) - ijcnn y test dataset (38337, 1) - """ - dataset_name = 'ijcnn' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='ijcnn', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - y[y == -1] = 0 - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def klaverjas(dataset_dir: Path) -> bool: - """ - Abstract: - Klaverjas is an example of the Jack-Nine card games, - which are characterized as trick-taking games where the the Jack - and nine of the trump suit are the highest-ranking trumps, and - the tens and aces of other suits are the most valuable cards - of these suits. It is played by four players in two teams. - - Task Information: - Classification task. n_classes = 2. - klaverjas X train dataset (196308, 32) - klaverjas y train dataset (196308, 1) - klaverjas X test dataset (785233, 32) - klaverjas y test dataset (785233, 1) - """ - dataset_name = 'klaverjas' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='Klaverjas2018', return_X_y=True, - as_frame=True, data_home=dataset_dir) - - y = y.cat.codes - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def santander(dataset_dir: Path) -> bool: - """ - # TODO: add an loading instruction - """ - return False - - -def skin_segmentation(dataset_dir: Path) -> bool: - """ - Abstract: - The Skin Segmentation dataset is constructed over B, G, R color space. - Skin and Nonskin dataset is generated using skin textures from - face images of diversity of age, gender, and race people. - Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. - - Classification task. n_classes = 2. 
- skin_segmentation X train dataset (196045, 3) - skin_segmentation y train dataset (196045, 1) - skin_segmentation X test dataset (49012, 3) - skin_segmentation y test dataset (49012, 1) - """ - dataset_name = 'skin_segmentation' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='skin-segmentation', - return_X_y=True, as_frame=True, data_home=dataset_dir) - y = y.astype(int) - y[y == 2] = 0 - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def cifar_binary(dataset_dir: Path) -> bool: - """ - Cifar dataset from LIBSVM Datasets ( - https://www.cs.toronto.edu/~kriz/cifar.html#cifar) - TaskType: Classification - cifar_binary x train dataset (50000, 3072) - cifar_binary y train dataset (50000, 1) - cifar_binary x test dataset (10000, 3072) - cifar_binary y test dataset (10000, 1) - """ - dataset_name = 'cifar_binary' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - - x_train = x_train.toarray() - y_train = (y_train > 0).astype(int) - - x_test = x_test.toarray() - y_test = (y_test > 0).astype(int) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - return True - - -def susy(dataset_dir: Path) -> bool: - """ - SUSY dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/SUSY). - - Classification task. n_classes = 2. 
- susy X train dataset (4500000, 28) - susy y train dataset (4500000, 1) - susy X test dataset (500000, 28) - susy y test dataset (500000, 1) - """ - dataset_name = 'susy' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 4500000, 500000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - X = data[data.columns[1:]] - y = data[data.columns[0:1]].values.ravel() - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True diff --git a/datasets/loader_clustering.py b/datasets/loader_clustering.py deleted file mode 100644 index 8749194fc..000000000 --- a/datasets/loader_clustering.py +++ /dev/null @@ -1,295 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import logging -import os -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file -from sklearn.model_selection import train_test_split - -from .loader_utils import retrieve - - -def epsilon_50K_cluster(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Clustering task. n_classes = 2. 
- epsilon_50K x cluster dataset (50000, 2001) - """ - dataset_name = 'epsilon_50K_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - - num_train = 50000 - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_train, y_train = load_svmlight_file(local_url, - dtype=np.float32) - - x_train = x_train.toarray()[:num_train] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def cifar_cluster(dataset_dir: Path) -> bool: - """ - Cifar dataset from LIBSVM Datasets ( - https://www.cs.toronto.edu/~kriz/cifar.html#cifar) - TaskType: Clustering - cifar x cluster dataset (50000, 3073) - """ - dataset_name = 'cifar_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_train, y_train = load_svmlight_file(local_url, - dtype=np.float32) - - x_train = x_train.toarray() - y_train = (y_train > 0).astype(int) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def higgs_one_m_clustering(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Clustering task. n_classes = 2. - higgs1m X cluster dataset (1000000, 29) - """ - dataset_name = 'higgs_one_m_clustering' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 1000000, 500000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - X = data[data.columns[1:]] - y = data[data.columns[0:1]] - - x_train, _, y_train, _ = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def hepmass_1M_cluster(dataset_dir: Path) -> bool: - """ - HEPMASS dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/HEPMASS). - - Clustering task. n_classes = 2. 
- hepmass_10K X cluster dataset (1000000, 29) - """ - dataset_name = 'hepmass_1M_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz' - - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, dtype = 1000000, np.float32 - data_train: Any = pd.read_csv(local_url_train, delimiter=",", - compression="gzip", dtype=dtype, - nrows=nrows_train) - - x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def hepmass_10K_cluster(dataset_dir: Path) -> bool: - """ - HEPMASS dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/HEPMASS). - - Clustering task. n_classes = 2. - hepmass_10K X cluster dataset (10000, 29) - """ - dataset_name = 'hepmass_10K_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz' - - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, dtype = 10000, np.float32 - data_train: Any = pd.read_csv(local_url_train, delimiter=",", - compression="gzip", dtype=dtype, - nrows=nrows_train) - - x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def susy_cluster(dataset_dir: Path) -> bool: - """ - SUSY dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/SUSY). - - Clustering task. n_classes = 2. 
- susy X cluster dataset (4500000, 29) - """ - dataset_name = 'susy_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, dtype = 4500000, np.float32 - data_raw: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train) - - X = data_raw.iloc[:nrows_train, 1:].values - y = data_raw.iloc[:nrows_train, 0].values - data = np.concatenate((X, y[:, None]), axis=1) - - filename = f'{dataset_name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def mnist_10K_cluster(dataset_dir: Path) -> bool: - """ - Abstract: - The MNIST database of handwritten digits with 784 features. - It can be split in a training set of the first 60,000 examples, - and a test set of 10,000 examples - Source: - Yann LeCun, Corinna Cortes, Christopher J.C. Burges - http://yann.lecun.com/exdb/mnist/ - - Clustering task. n_classes = 10. - mnist x clustering dataset (10000, 785) - """ - dataset_name = 'mnist_10K_cluster' - - os.makedirs(dataset_dir, exist_ok=True) - - nrows_train, dtype = 10000, np.float32 - X, y = fetch_openml(name='mnist_784', return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train = np.ascontiguousarray(X.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(y.values[:nrows_train], dtype=dtype) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def road_network_20K_cluster(dataset_dir: Path) -> bool: - """ - 3DRoadNetwork dataset from UCI repository ( - http://archive.ics.uci.edu/ml/datasets/3D+Road+Network+%28North+Jutland%2c+Denmark%29#) - road_network x cluster dataset (20000, 4) - """ - dataset_name = 'road_network_20K_cluster' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt' - - local_url = os.path.join(dataset_dir, os.path.basename(url)) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, dtype = 20000, np.float32 - data_train: Any = pd.read_csv(local_url, dtype=dtype, - nrows=nrows_train) - - x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) - y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) - - filename = f'{dataset_name}.npy' - data = np.concatenate((x_train, y_train[:, None]), axis=1) - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py deleted file mode 100644 index 874db9939..000000000 --- a/datasets/loader_multiclass.py +++ /dev/null @@ -1,333 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache 
License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import logging -import os -import tarfile -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from sklearn.datasets import fetch_covtype, fetch_openml -from sklearn.model_selection import train_test_split - -from .loader_utils import count_lines, read_libsvm_msrank, retrieve - - -def cifar_10(dataset_dir: Path) -> bool: - """ - Source: - University of Toronto - Collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton - https://www.cs.toronto.edu/~kriz/cifar.html - - Classification task. n_classes = 10 - cifar_10 x train dataset (54000, 3072) - cifar_10 y train dataset (54000, 1) - cifar_10 x test dataset (6000, 3072) - cifar_10 y test dataset (6000, 1) - - """ - dataset_name = 'cifar_10' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(data_id=40927, return_X_y=True, - as_frame=False, data_home=dataset_dir) - - X = pd.DataFrame(X) - y = pd.DataFrame(y) - y = y.astype(int) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def connect(dataset_dir: Path) -> bool: - """ - Source: - UC Irvine Machine Learning Repository - http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm - - Classification task. n_classes = 3. - connect X train dataset (60801, 126) - connect y train dataset (60801, 1) - connect X test dataset (6756, 126) - connect y test dataset (6756, 1) - """ - dataset_name = 'connect' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='connect-4', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - y = y.astype(int) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def covertype(dataset_dir: Path) -> bool: - """ - Abstract: This is the original version of the famous - covertype dataset in ARFF format. - Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson - Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) - - Classification task. n_classes = 7. 
- covertype X train dataset (390852, 54) - covertype y train dataset (390852, 1) - covertype X test dataset (97713, 54) - covertype y test dataset (97713, 1) - """ - dataset_name = 'covertype' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='covertype', version=3, return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def covtype(dataset_dir: Path) -> bool: - """ - Cover type dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/covertype - - y contains 7 unique class labels from 1 to 7 inclusive. - Classification task. n_classes = 7. - covtype X train dataset (464809, 54) - covtype y train dataset (464809, 1) - covtype X test dataset (116203, 54) - covtype y test dataset (116203, 1) - """ - dataset_name = 'covtype' - os.makedirs(dataset_dir, exist_ok=True) - - logging.info(f'Started loading {dataset_name}') - X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg - logging.info(f'{dataset_name} is loaded, started parsing...') - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def letters(dataset_dir: Path) -> bool: - """ - http://archive.ics.uci.edu/ml/datasets/Letter+Recognition - - Classification task. n_classes = 26. - letters X train dataset (16000, 16) - letters y train dataset (16000, 1) - letters X test dataset (4000, 16) - letters y test dataset (4000, 1) - """ - dataset_name = 'letters' - os.makedirs(dataset_dir, exist_ok=True) - - url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/' + - 'letter-recognition/letter-recognition.data') - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - letters = pd.read_csv(local_url, header=None) - X = letters.iloc[:, 1:].values - y: Any = letters.iloc[:, 0] - y = y.astype('category').cat.codes.values - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=0) - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def mlsr(dataset: Path) -> bool: - """ - # TODO: add an loading instruction - """ - return False - - -def mnist(dataset_dir: Path) -> bool: - """ - Abstract: - The MNIST database of handwritten digits with 784 features. - It can be split in a training set of the first 60,000 examples, - and a test set of 10,000 examples - Source: - Yann LeCun, Corinna Cortes, Christopher J.C. Burges - http://yann.lecun.com/exdb/mnist/ - - Classification task. 
n_classes = 10. - mnist X train dataset (60000, 784) - mnist y train dataset (60000, 1) - mnist X test dataset (10000, 784) - mnist y test dataset (10000, 1) - """ - dataset_name = 'mnist' - - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='mnist_784', return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - X = X / 255 - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=10000, shuffle=False) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def msrank(dataset_dir: Path) -> bool: - """ - Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - - Classification task. n_classes = 5. - msrank X train dataset (958671, 137) - msrank y train dataset (958671, 1) - msrank X test dataset (241521, 137) - msrank y test dataset (241521, 1) - """ - dataset_name = 'msrank' - os.makedirs(dataset_dir, exist_ok=True) - url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz" - local_url = os.path.join(dataset_dir, os.path.basename(url)) - unzipped_url = os.path.join(dataset_dir, "MSRank") - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - if not os.path.isdir(unzipped_url): - logging.info(f'{dataset_name} is loaded, unzipping...') - tar = tarfile.open(local_url, "r:gz") - tar.extractall(dataset_dir) - tar.close() - logging.info(f'{dataset_name} is unzipped, started parsing...') - - sets = [] - labels = [] - n_features = 137 - - for set_name in ['train.txt', 'vali.txt', 'test.txt']: - file_name = os.path.join(unzipped_url, set_name) - - n_samples = count_lines(file_name) - with open(file_name, 'r') as file_obj: - X, y = read_libsvm_msrank(file_obj, n_samples, n_features, np.float32) - - sets.append(X) - labels.append(y) - - sets[0] = np.vstack((sets[0], sets[1])) - labels[0] = np.hstack((labels[0], labels[1])) - - sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]] - labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]] - - for data, name in zip((sets[0], sets[1], labels[0], labels[1]), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def plasticc(dataset_dir: Path) -> bool: - """ - # TODO: add an loading instruction - """ - return False - - -def sensit(dataset_dir: Path) -> bool: - """ - Abstract: Vehicle classification in distributed sensor networks. - Author: M. Duarte, Y. H. Hu - Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - - Classification task. n_classes = 3. 
- sensit X train dataset (78822, 100) - sensit y train dataset (78822, 1) - sensit X test dataset (19706, 100) - sensit y test dataset (19706, 1) - """ - dataset_name = 'sensit' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='SensIT-Vehicle-Combined', - return_X_y=True, as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - y = y.astype(int) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py deleted file mode 100644 index e2fd31c47..000000000 --- a/datasets/loader_regression.py +++ /dev/null @@ -1,337 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import logging -import os -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.datasets import fetch_openml, fetch_california_housing -from sklearn.preprocessing import StandardScaler - -from .loader_utils import retrieve - - -def abalone(dataset_dir: Path) -> bool: - """ - https://archive.ics.uci.edu/ml/machine-learning-databases/abalone - - abalone x train dataset (3341, 8) - abalone y train dataset (3341, 1) - abalone x test dataset (836, 8) - abalone y train dataset (836, 1) - """ - dataset_name = 'abalone' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - abalone: Any = pd.read_csv(local_url, header=None) - abalone[0] = abalone[0].astype('category').cat.codes - X = abalone.iloc[:, :-1].values - y = abalone.iloc[:, -1].values - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=0) - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def california_housing(dataset_dir: Path) -> bool: - """ - california_housing x train dataset (18576, 8) - california_housing y train dataset (18576, 1) - california_housing x test dataset (2064, 8) - 
california_housing y train dataset (2064, 1) - """ - dataset_name = 'california_housing' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_california_housing(return_X_y=True, as_frame=False, - data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=42) - - scaler = StandardScaler().fit(x_train, y_train) - x_train = scaler.transform(x_train) - x_test = scaler.transform(x_test) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def fried(dataset_dir: Path) -> bool: - """ - fried x train dataset (32614, 10) - fried y train dataset (32614, 1) - fried x test dataset (8154, 10) - fried y train dataset (8154, 1) - """ - dataset_name = 'fried' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml( - name='fried', return_X_y=True, as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def medical_charges_nominal(dataset_dir: Path) -> bool: - """ - medical_charges_nominal x train dataset (130452, 11) - medical_charges_nominal y train dataset (130452, 1) - medical_charges_nominal x test dataset (32613, 11) - medical_charges_nominal y train dataset (32613, 1) - """ - dataset_name = 'medical_charges_nominal' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='medical_charges_nominal', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - - scaler = StandardScaler().fit(x_train, y_train) - x_train = scaler.transform(x_train) - x_test = scaler.transform(x_test) - - scaler = StandardScaler().fit(y_train) - y_train = scaler.transform(y_train) - y_test = scaler.transform(y_test) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def mortgage_first_q(dataset_dir: Path) -> bool: - """ - # TODO: add an loading instruction - """ - return False - - -def twodplanes(dataset_dir: Path) -> bool: - """ - twodplanes x train dataset (106288, 10) - twodplanes y train dataset (106288, 1) - twodplanes x test dataset (70859, 10) - twodplanes y train dataset (70859, 1) - """ - dataset_name = 'twodplanes' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml( - name='BNG(2dplanes)', return_X_y=True, as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - 
X, y, test_size=0.4, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def year_prediction_msd(dataset_dir: Path) -> bool: - """ - YearPredictionMSD dataset from UCI repository - https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd - - year_prediction_msd x train dataset (463715, 90) - year_prediction_msd y train dataset (463715, 1) - year_prediction_msd x test dataset (51630, 90) - year_prediction_msd y train dataset (51630, 1) - """ - dataset_name = 'year_prediction_msd' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/' \ - 'YearPredictionMSD.txt.zip' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - year = pd.read_csv(local_url, header=None) - X = year.iloc[:, 1:].to_numpy(dtype=np.float32) - y = year.iloc[:, 0].to_numpy(dtype=np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, - train_size=463715, - test_size=51630) - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def yolanda(dataset_dir: Path) -> bool: - """ - yolanda x train dataset (130452, 11) - yolanda y train dataset (130452, 1) - yolanda x test dataset (32613, 11) - yolanda y train dataset (32613, 1) - """ - dataset_name = 'yolanda' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='yolanda', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - - scaler = StandardScaler().fit(x_train, y_train) - x_train = scaler.transform(x_train) - x_test = scaler.transform(x_test) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def airline_regression(dataset_dir: Path) -> bool: - """ - yolanda x train dataset (8500000, 9) - yolanda y train dataset (8500000, 1) - yolanda x test dataset (1500000, 9) - yolanda y train dataset (1500000, 1) - """ - dataset_name = 'airline_regression' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='Airlines_DepDelay_10M', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} is loaded, started parsing...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - - scaler = StandardScaler().fit(x_train, y_train) - x_train = scaler.transform(x_train) - x_test = scaler.transform(x_test) - - scaler = StandardScaler().fit(y_train) - y_train = scaler.transform(y_train) - y_test = scaler.transform(y_test) - - for data, name in zip((x_train, x_test, y_train, y_test), - 
('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def higgs_10500K(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Classification task. n_classes = 2. - higgs_10500K X train dataset (10500000, 28) - higgs_10500K y train dataset (10500000, 1) - higgs_10500K X test dataset (500000, 28) - higgs_10500K y test dataset (500000, 1) - """ - dataset_name = 'higgs_10500K' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 10500000, 500000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - X = data[data.columns[1:]] - y = data[data.columns[0:1]] - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py deleted file mode 100755 index 4385e3dda..000000000 --- a/datasets/loader_utils.py +++ /dev/null @@ -1,82 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import re -import requests -import os -from shutil import copyfile -import numpy as np -from tqdm import tqdm - - -def retrieve(url: str, filename: str) -> None: - # rewritting urlretrieve without using urllib library, - # otherwise it would fail codefactor test due to security issues. 
- if os.path.isfile(url): - # reporthook is ignored for local urls - copyfile(url, filename) - elif url.startswith('http'): - response = requests.get(url, stream=True) - if response.status_code != 200: - raise AssertionError(f"Failed to download from {url},\n" + - "Response returned status code {response.status_code}") - total_size = int(response.headers.get('content-length', 0)) - block_size = 8192 - pbar = tqdm(total=total_size/1024, unit='kB') - with open(filename, 'wb+') as file: - for data in response.iter_content(block_size): - pbar.update(len(data)/1024) - file.write(data) - pbar.close() - if total_size != 0 and pbar.n != total_size/1024: - raise AssertionError( - "Some content was present but not downloaded/written") - - -def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): - X = np.zeros((n_samples, n_features)) - y = np.zeros((n_samples,)) - - counter = 0 - - regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)') - - for line in file_obj: - line = str(line).replace("\\n'", "") - line = regexp.sub(r'\g<1>', line) - line = line.rstrip(" \n\r").split(' ') - - y[counter] = int(line[0]) - X[counter] = [float(i) for i in line[1:]] - - counter += 1 - if counter == n_samples: - break - - return np.array(X, dtype=dtype), np.array(y, dtype=dtype) - - -def _make_gen(reader): - b = reader(1024 * 1024) - while b: - yield b - b = reader(1024 * 1024) - - -def count_lines(filename): - with open(filename, 'rb') as f: - f_gen = _make_gen(f.read) - return sum(buf.count(b'\n') for buf in f_gen) diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py deleted file mode 100644 index 0add738cb..000000000 --- a/datasets/make_datasets.py +++ /dev/null @@ -1,141 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse -import logging -import os -import numpy as np -from sklearn.datasets import make_classification, make_regression, make_blobs -from sklearn.utils import check_random_state -import sys - - -def try_gen_dataset(args, folder): - try: - if args.type == 'regression': - gen_regression(args, folder) - elif args.type == 'classification': - gen_classification(args, folder) - elif args.type == 'blobs': - gen_blobs(args, folder) - else: - raise ValueError(f'{args.type} is unknown dataset type') - return True - except BaseException as ex: - logging.warning(f"Internal error generating dataset:\n{ex}") - return False - - -def gen_blobs(args, folder): - os.makedirs(os.path.join(folder, "data"), exist_ok=True) - X, y = make_blobs(n_samples=args.samples + args.test_samples, - n_features=args.features, - centers=args.clusters, - center_box=(-32, 32), - shuffle=True, - random_state=args.seed) - np.save(os.path.join(folder, args.filex), X[:args.samples]) - if args.test_samples != 0: - np.save(os.path.join(folder, args.filextest), X[args.samples:]) - return 0 - - -def gen_regression(args, folder): - os.makedirs(os.path.join(folder, "data"), exist_ok=True) - rs = check_random_state(args.seed) - X, y = make_regression(n_targets=1, - n_samples=args.samples + args.test_samples, - n_features=args.features, - n_informative=args.features, - bias=rs.normal(0, 3), - random_state=rs) - np.save(os.path.join(folder, args.filex), X[:args.samples]) - np.save(os.path.join(folder, args.filey), y[:args.samples]) - if args.test_samples != 0: - np.save(os.path.join(folder, args.filextest), X[args.samples:]) - np.save(os.path.join(folder, args.fileytest), y[args.samples:]) - return 0 - - -def gen_classification(args, folder): - os.makedirs(os.path.join(folder, "data"), exist_ok=True) - X, y = make_classification(n_samples=args.samples + args.test_samples, - n_features=args.features, - n_informative=args.features, - n_repeated=0, - n_redundant=0, - n_classes=args.classes, - random_state=args.seed) - np.save(os.path.join(folder, args.filex), X[:args.samples]) - np.save(args.filey, y[:args.samples]) - if args.test_samples != 0: - np.save(os.path.join(folder, args.filextest), X[args.samples:]) - np.save(os.path.join(folder, args.fileytest), y[args.samples:]) - return 0 - - -def main(): - parser = argparse.ArgumentParser( - description='Dataset generator using scikit-learn') - parser.add_argument('-f', '--features', type=int, default=1000, - help='Number of features in dataset') - parser.add_argument('-s', '--samples', type=int, default=10000, - help='Number of samples in dataset') - parser.add_argument('--ts', '--test-samples', type=int, default=0, - dest='test_samples', - help='Number of test samples in dataset') - parser.add_argument('-d', '--seed', type=int, default=0, - help='Seed for random state') - subparsers = parser.add_subparsers(dest='problem') - subparsers.required = True - - regr_parser = subparsers.add_parser('regression', - help='Regression data') - regr_parser.set_defaults(func=gen_regression) - regr_parser.add_argument('-x', '--filex', '--fileX', type=str, - required=True, help='Path to save matrix X') - regr_parser.add_argument('-y', '--filey', '--fileY', type=str, - required=True, help='Path to save vector y') - regr_parser.add_argument('--xt', '--filextest', '--fileXtest', type=str, - dest='filextest', - help='Path to save test matrix X') - regr_parser.add_argument('--yt', '--fileytest', '--fileYtest', type=str, - 
dest='fileytest',
-                             help='Path to save test vector y')
-
-    clsf_parser = subparsers.add_parser('classification',
-                                        help='Classification data')
-    clsf_parser.set_defaults(func=gen_classification)
-    clsf_parser.add_argument('-c', '--classes', type=int, default=5,
-                             help='Number of classes')
-    clsf_parser.add_argument('-x', '--filex', '--fileX', type=str,
-                             required=True, help='Path to save matrix X')
-    clsf_parser.add_argument('-y', '--filey', '--fileY', type=str,
-                             required=True,
-                             help='Path to save label vector y')
-    clsf_parser.add_argument('--xt', '--filextest', '--fileXtest', type=str,
-                             dest='filextest',
-                             help='Path to save test matrix X')
-    clsf_parser.add_argument('--yt', '--fileytest', '--fileYtest', type=str,
-                             dest='fileytest',
-                             help='Path to save test vector y')
-
-    args = parser.parse_args()
-    return args.func(args)
-
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..7cc473140
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,69 @@
+# Developer Guide
+
+This document covers topics useful for contributors to Scikit-learn_bench:
+
+- [Developer Guide](#developer-guide)
+  - [High-Level Workflow of Scikit-learn\_bench](#high-level-workflow-of-scikit-learn_bench)
+  - [Configs Parser Workflow](#configs-parser-workflow)
+
+## High-Level Workflow of Scikit-learn_bench
+
+```mermaid
+stateDiagram-v2
+    classDef inputOutput fill:#33b,color:white,stroke-width:2px,stroke:white;
+
+    user_arguments:::inputOutput --> ArgumentParser
+    BenchmarksRunner --> raw_results[JSON]:::inputOutput
+    raw_results[JSON] --> ReportGenerator
+    ReportGenerator --> benchmarks_report[Excel]:::inputOutput
+
+    state BenchmarksRunner {
+        ArgumentParser --> ConfigParser: config_arguments
+        ArgumentParser --> Benchmarks: other_arguments
+        ConfigParser --> Benchmarks: benchmark_cases\n[JSON-formatted string]
+        ConfigParser --> Benchmarks: benchmark_filters\n[JSON-formatted string]
+
+        state Benchmarks {
+            SklearnLikeEstimator --> raw_results[JSON]
+            ... --> raw_results[JSON]
+            Functional --> raw_results[JSON]
+        }
+    }
+```
+
+Scikit-learn_bench consists of three main parts:
+ - **Benchmarks runner**, which:
+    1. Consumes user-provided high-level arguments (argument parser).
+    2. Transforms arguments into benchmark cases as parameters for individual benchmarks (config parser).
+    3. Combines the raw outputs.
+ - **Individual benchmarks** wrapping specific entities or workloads (sklearn-like estimators, custom functions, etc.).
+ - **Report generator**, which consumes benchmarks' outputs and generates a high-level report with aggregated stats.
+
+The runner is responsible for orchestrating benchmarking cases, the individual benchmarks for the actual run of each case, and the report generator for the human-readable output.
+
+## Configs Parser Workflow
+
+Benchmarking configuration exists in two stages:
+1. **Benchmarking template**, where parameters or groups of them may be defined as a *range of values*
+2. **Benchmarking case**, with deduced *scalar values* of parameters
+
+In other words, the template has the `Dict[str, AnyJSONSerializable]` type, while the case has `Dict[str, Dict[str, ... [str, Scalar] ... ]]`.
+
+Configs parser steps:
+1. Find all config files from the user-provided `config` argument or use globally defined `parameters` as a standalone config
+2. Convert configs to templates
+3. Expand template-special values and ranges to all possible cases
+4. Remove duplicated cases and assign case-special values if possible
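+For illustration only, the expansion from a template into cases can be sketched in plain Python. The parameter names below are hypothetical and this flat dictionary is a simplification of the nested config structure described above; any list value in the template is treated as a range to expand:
+
+```python
+from itertools import product
+
+# Hypothetical flat template: list values stand for ranges of parameter values.
+template = {
+    "algorithm": ["KMeans"],
+    "n_clusters": [10, 100],
+    "dtype": ["float32", "float64"],
+}
+
+# Expand every combination of range values into a benchmark case
+# that holds scalar parameter values only.
+cases = [dict(zip(template, values)) for values in product(*template.values())]
+
+for case in cases:
+    print(case)
+# -> 4 cases, e.g. {'algorithm': 'KMeans', 'n_clusters': 10, 'dtype': 'float32'}
+```
+
+The actual config parser additionally handles nested parameter groups and template-special values, as outlined in the steps above.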
+
+Special values might be assigned at three stages:
+ - During template reading in runner
+ - During benchmarking cases generation in runner
+ - During run of individual benchmark
+
+Benchmark parameters have the following overwriting priority:
+1. CLI parameters
+2. Config template parameters
+3. Parameters set
+
+---
+[Documentation tree](../README.md#-documentation)
diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml
new file mode 100644
index 000000000..d72aa2d83
--- /dev/null
+++ b/envs/conda-env-rapids.yml
@@ -0,0 +1,21 @@
+channels:
+  - rapidsai
+  - conda-forge
+  - nvidia
+  - nodefaults
+dependencies:
+  - python=3.10
+  - rapids
+  - cudatoolkit
+  # sklbench dependencies
+  - scikit-learn
+  - pandas
+  - tabulate
+  - fastparquet
+  - h5py
+  - kaggle
+  - openpyxl
+  - tqdm
+  - psutil
+  - requests
+  - py-cpuinfo
diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml
new file mode 100644
index 000000000..13b2bdd33
--- /dev/null
+++ b/envs/conda-env-sklearn.yml
@@ -0,0 +1,23 @@
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  # additional frameworks
+  - xgboost
+  - catboost
+  - lightgbm
+  - faiss-cpu
+  - modin-all
+  - scikit-learn-intelex
+  # sklbench dependencies
+  - scikit-learn<1.5
+  - pandas
+  - tabulate
+  - fastparquet
+  - h5py
+  - kaggle
+  - openpyxl
+  - tqdm
+  - psutil
+  - requests
+  - py-cpuinfo
diff --git a/envs/requirements-sklearn.txt b/envs/requirements-sklearn.txt
new file mode 100644
index 000000000..cc2dcc81f
--- /dev/null
+++ b/envs/requirements-sklearn.txt
@@ -0,0 +1,21 @@
+# additional frameworks
+xgboost
+catboost
+lightgbm
+faiss-cpu
+modin[all]
+scikit-learn-intelex
+dpctl
+dpnp
+# sklbench dependencies
+scikit-learn<1.5
+pandas
+tabulate
+fastparquet
+h5py
+kaggle
+openpyxl
+tqdm
+psutil
+requests
+py-cpuinfo
diff --git a/modelbuilders_bench/__init__.py b/modelbuilders_bench/__init__.py
deleted file mode 100755
index e69de29bb..000000000
diff --git a/modelbuilders_bench/catboost_mb.py b/modelbuilders_bench/catboost_mb.py
deleted file mode 100644
index 62c5068af..000000000
--- a/modelbuilders_bench/catboost_mb.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# ===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================== - -import argparse - -import bench -import numpy as np -import catboost as cb -import daal4py -import typing as tp - - -def convert_probs_to_classes(y_prob, class_labels): - return np.array([class_labels[np.argmax(y_prob[i])] - for i in range(y_prob.shape[0])]) - - -def convert_cb_predictions(y_pred, objective, metric_name, class_labels): - if objective != 'RMSE': - if metric_name == 'accuracy': - y_pred = convert_probs_to_classes(y_pred, class_labels) - return y_pred - - -parser = argparse.ArgumentParser( - description='catboost gbt + model transform + daal predict benchmark') - -parser.add_argument('--count-pool', default=False, action='store_true', - help='Count Pool creation in time measurements') - -parser.add_argument('--grow-policy', type=str, default='Depthwise', - help='Controls a way new nodes are added to the tree') - -parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, - help='Step size shrinkage used in update ' - 'to prevents overfitting') - -parser.add_argument('--max-bin', type=int, default=256, - help='Maximum number of discrete bins to ' - 'bucket continuous features') - -parser.add_argument('--max-depth', type=int, default=6, - help='Maximum depth of a tree') - -parser.add_argument('--max-leaves', type=int, default=0, - help='Maximum number of nodes to be added') - -parser.add_argument('--n-estimators', type=int, default=100, - help='Number of gradient boosted trees') - -parser.add_argument('--objective', type=str, required=True, - choices=('RMSE', 'Logloss', - 'multi:softmax', 'multi:softprob'), - help='Control a balance of positive and negative weights') - -parser.add_argument('--reg-lambda', type=float, default=1, - help='L2 regularization term on weights') - -parser.add_argument('--scale-pos-weight', type=float, default=1, - help='Controls a balance of positive and negative weights') - -parser.add_argument('--subsample', type=float, default=1, - help='Subsample ratio of the training instances') - -params = bench.parse_args(parser) - -X_train, X_test, y_train, y_test = bench.load_data(params) - -cb_params = { - 'verbose': 0, - 'learning_rate': params.learning_rate, - 'max_depth': params.max_depth, - 'subsample': params.subsample, - 'colsample_bylevel': 1, - 'reg_lambda': params.reg_lambda, - 'grow_policy': params.grow_policy, - 'max_bin': params.max_bin, - 'objective': params.objective, - 'random_seed': params.seed, - 'iterations': params.n_estimators, -} - -# CatBoost restriction -if cb_params['grow_policy'] == 'Lossguide': - cb_params['max_leaves'] = params.max_leaves - -if params.threads != -1: - cb_params.update({'thread_count': params.threads}) - -metric_name: tp.List[str] -metric_func: tp.List[tp.Callable] - -class_labels = None - -if params.objective == "RMSE": - task = 'regression' - metric_name = ['rmse', 'r2'] - metric_func = [bench.rmse_score, bench.r2_score] -else: - task = 'classification' - class_labels = sorted(np.unique(y_train)) - if params.objective.startswith('multi'): - metric_name = ['accuracy'] - metric_func = [bench.accuracy_score] - else: - metric_name = ['accuracy', 'log_loss'] - metric_func = [bench.accuracy_score, bench.log_loss] - - if 'cudf' in str(type(y_train)): - params.n_classes = y_train[y_train.columns[0]].nunique() - else: - params.n_classes = len(np.unique(y_train)) - unique_y_train = np.unique(y_train) - params.n_classes = len(unique_y_train) - - if params.n_classes > 2: - cb_params['bootstrap_type'] = 'Bernoulli' - 
cb_params['objective'] = 'MultiClass' - else: - cb_params['scale_pos_weight'] = params.scale_pos_weight - cb_params['objective'] = 'Logloss' - -t_create_train, dtrain = bench.measure_function_time( - cb.Pool, X_train, params=params, label=y_train) - -t_create_test, dtest = bench.measure_function_time( - cb.Pool, X_test, params=params, label=y_test) - - -def fit(pool): - if pool is None: - pool = cb.Pool(X_train, label=y_train) - return cb.CatBoost(cb_params).fit(pool) - - -if params.objective.startswith('multi'): - def predict(pool): - if pool is None: - pool = cb.Pool(X_test, label=y_test) - return booster.predict(pool, prediction_type='Probability') -else: - if cb_params['objective'] == 'Logloss': - def predict(pool): - if pool is None: - pool = cb.Pool(X_test, label=y_test) - return booster.predict(pool, prediction_type='Probability') - else: - def predict(pool): - if pool is None: - pool = cb.Pool(X_test, label=y_test) - return booster.predict(pool) - - -fit_time, booster = bench.measure_function_time( - fit, None if params.count_pool else dtrain, params=params) - -# Create array where each metric has all the stages -metrics = [[None] * 6 for i in range(len(metric_name))] - -# Metrics for training -for i, func in enumerate(metric_func): - metrics[i][1] = func( - y_train, - convert_cb_predictions( - predict(dtrain), - params.objective, - metric_name[i], - class_labels)) - -predict_time, y_pred = bench.measure_function_time( - predict, None if params.count_pool else dtest, params=params) - -# Metrics for _prediction -for i, func in enumerate(metric_func): - metrics[i][3] = func(y_test, convert_cb_predictions( - y_pred, params.objective, metric_name[i], class_labels)) - -transform_time, model_daal = bench.measure_function_time( - daal4py.get_gbt_model_from_catboost, booster, params=params) - -if hasattr(params, 'n_classes'): - predict_algo = daal4py.gbt_classification_prediction( - nClasses=params.n_classes, - resultsToEvaluate='computeClassProbabilities', - fptype='float') - predict_time_daal, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - daal_pred_value = daal_pred.probabilities -else: - predict_algo = daal4py.gbt_regression_prediction() - predict_time_daal, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - daal_pred_value = daal_pred.prediction - -# Metrics for alternative_prediction -for i, func in enumerate(metric_func): - metrics[i][5] = func(y_test, convert_cb_predictions( - daal_pred_value, params.objective, metric_name[i], class_labels)) - -bench.print_output( - library='modelbuilders', - algorithm=f'catboost_{task}_and_modelbuilder', - stages=[ - 'training_preparation', - 'training', - 'prediction_preparation', - 'prediction', - 'transformation', - 'alternative_prediction'], - params=params, - functions=[ - 'cb.Pool.train', - 'cb.fit', - 'cb.Pool.test', - 'cb.predict', - 'daal4py.get_gbt_model_from_catboost', - 'daal4py.compute'], - times=[ - t_create_train, - fit_time, - t_create_test, - predict_time, - transform_time, - predict_time_daal], - metric_type=metric_name, - metrics=metrics, - data=[ - X_train, - X_train, - X_test, - X_test, - X_test, - X_test]) diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py deleted file mode 100644 index f263d419c..000000000 --- a/modelbuilders_bench/lgbm_mb.py +++ /dev/null @@ -1,158 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import os - -import bench -import daal4py -import lightgbm as lgbm -import numpy as np - -import modelbuilders_bench.mb_utils as utils - -parser = argparse.ArgumentParser( - description='lightgbm gbt + model transform + daal predict benchmark') - -parser.add_argument('--colsample-bytree', type=float, default=1, - help='Subsample ratio of columns ' - 'when constructing each tree') -parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, - help='Step size shrinkage used in update ' - 'to prevents overfitting') -parser.add_argument('--max-bin', type=int, default=256, - help='Maximum number of discrete bins to ' - 'bucket continuous features') -parser.add_argument('--max-delta-step', type=float, default=0, - help='Maximum delta step we allow each leaf output to be') -parser.add_argument('--max-depth', type=int, default=6, - help='Maximum depth of a tree') -parser.add_argument('--max-leaves', type=int, default=0, - help='Maximum number of nodes to be added') -parser.add_argument('--min-child-weight', type=float, default=1, - help='Minimum sum of instance weight needed in a child') -parser.add_argument('--min-split-gain', '--gamma', type=float, default=0, - help='Minimum loss reduction required to make' - ' partition on a leaf node') -parser.add_argument('--n-estimators', type=int, default=100, - help='Number of gradient boosted trees') -parser.add_argument('--objective', type=str, required=True, - choices=('regression', 'binary', 'multiclass'), - help='Control a balance of positive and negative weights') -parser.add_argument('--reg-alpha', type=float, default=0, - help='L1 regularization term on weights') -parser.add_argument('--reg-lambda', type=float, default=1, - help='L2 regularization term on weights') -parser.add_argument('--scale-pos-weight', type=float, default=1, - help='Controls a balance of positive and negative weights') -parser.add_argument('--subsample', type=float, default=1, - help='Subsample ratio of the training instances') - -params = bench.parse_args(parser) - -X_train, X_test, y_train, y_test = bench.load_data(params) - -lgbm_params = { - 'verbosity': -1, - 'learning_rate': params.learning_rate, - 'min_split_gain': params.min_split_gain, - 'max_depth': params.max_depth, - 'min_child_weight': params.min_child_weight, - 'max_delta_step': params.max_delta_step, - 'subsample': params.subsample, - 'colsample_bytree': params.colsample_bytree, - 'colsample_bynode': 1, - 'reg_lambda': params.reg_lambda, - 'reg_alpha': params.reg_alpha, - 'scale_pos_weight': params.scale_pos_weight, - 'max_leaves': params.max_leaves, - 'max_bin': params.max_bin, - 'objective': params.objective, - 'seed': params.seed -} - -if params.threads != -1: - lgbm_params.update({'nthread': params.threads}) - -if 'OMP_NUM_THREADS' in os.environ.keys(): - lgbm_params['nthread'] = int(os.environ['OMP_NUM_THREADS']) - -if 
params.objective.startswith('reg'): - task = 'regression' - metric_name, metric_func = 'rmse', bench.rmse_score -else: - task = 'classification' - metric_name, metric_func = 'accuracy[%]', utils.get_accuracy - if 'cudf' in str(type(y_train)): - params.n_classes = y_train[y_train.columns[0]].nunique() - else: - unique_y_train = np.unique(y_train) - params.n_classes = len(unique_y_train) - if max(unique_y_train) != len(unique_y_train) - 1: - params.n_classes = int(max(unique_y_train)) + 1 - - if params.n_classes > 2: - lgbm_params['num_class'] = params.n_classes - -t_creat_train, lgbm_train = bench.measure_function_time(lgbm.Dataset, X_train, - y_train, params=params, - free_raw_data=False) - -t_creat_test, lgbm_test = bench.measure_function_time(lgbm.Dataset, X_test, y_test, - params=params, reference=lgbm_train, - free_raw_data=False) - -t_train, model_lgbm = bench.measure_function_time(lgbm.train, lgbm_params, lgbm_train, - params=params, - num_boost_round=params.n_estimators, - valid_sets=lgbm_train, - verbose_eval=False) -train_metric = None -if not X_train.equals(X_test): - y_train_pred = model_lgbm.predict(X_train) - train_metric = metric_func(y_train, y_train_pred) - -t_lgbm_pred, y_test_pred = bench.measure_function_time(model_lgbm.predict, X_test, - params=params) -test_metric_lgbm = metric_func(y_test, y_test_pred) - -t_trans, model_daal = bench.measure_function_time( - daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) - -if hasattr(params, 'n_classes'): - predict_algo = daal4py.gbt_classification_prediction( - nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') - t_daal_pred, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - test_metric_daal = metric_func(y_test, daal_pred.prediction) -else: - predict_algo = daal4py.gbt_regression_prediction() - t_daal_pred, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - test_metric_daal = metric_func(y_test, daal_pred.prediction) - -utils.print_output( - library='modelbuilders', - algorithm=f'lightgbm_{task}_and_modelbuilder', - stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'], - params=params, - functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', - 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], - times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], - metric_type=metric_name, - metrics=[train_metric, test_metric_lgbm, test_metric_daal], - data=[X_train, X_test, X_test], -) diff --git a/modelbuilders_bench/mb_utils.py b/modelbuilders_bench/mb_utils.py deleted file mode 100644 index 0c8d29eb6..000000000 --- a/modelbuilders_bench/mb_utils.py +++ /dev/null @@ -1,71 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import json - -import numpy as np - - -def get_accuracy(true_labels, prediction): - errors = 0 - for i, true_label in enumerate(true_labels): - pred_label = 0 - if isinstance(prediction[i], (np.float32, np.float64)): - pred_label = prediction[i] > 0.5 - elif prediction[i].shape[0] == 1: - pred_label = prediction[i][0] - else: - pred_label = np.argmax(prediction[i]) - if true_label != pred_label: - errors += 1 - return 100 * (1 - errors / len(true_labels)) - - -def print_output(library, algorithm, stages, params, functions, - times, metric_type, metrics, data): - if params.output_format == 'json': - output = [] - output.append({ - 'library': library, - 'algorithm': algorithm, - 'input_data': { - 'data_format': params.data_format, - 'data_order': params.data_order, - 'data_type': str(params.dtype), - 'dataset_name': params.dataset_name, - 'rows': data[0].shape[0], - 'columns': data[0].shape[1] - } - }) - if hasattr(params, 'n_classes'): - output[-1]['input_data'].update({'classes': params.n_classes}) - for i, stage in enumerate(stages): - result = { - 'stage': stage, - } - if 'daal' in stage: - result.update({'conversion_to_daal4py': times[2 * i], - 'prediction_time': times[2 * i + 1]}) - elif 'train' in stage: - result.update({'matrix_creation_time': times[2 * i], - 'training_time': times[2 * i + 1]}) - else: - result.update({'matrix_creation_time': times[2 * i], - 'prediction_time': times[2 * i + 1]}) - if metrics[i] is not None: - result.update({f'{metric_type}': metrics[i]}) - output.append(result) - print(json.dumps(output, indent=4)) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py deleted file mode 100644 index 3c7caaec0..000000000 --- a/modelbuilders_bench/xgb_mb.py +++ /dev/null @@ -1,377 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import argparse - -import bench -import daal4py -import numpy as np -import xgboost as xgb - - -def convert_probs_to_classes(y_prob): - return np.array([np.argmax(y_prob[i]) for i in range(y_prob.shape[0])]) - - -def convert_xgb_predictions(y_pred, objective): - if objective == "multi:softprob": - y_pred = convert_probs_to_classes(y_pred) - elif objective == "binary:logistic": - y_pred = (y_pred >= 0.5).astype(np.int32) - return y_pred - - -def shap_accuracy(new, ref): - # broadcast all values into single column and calculate RMSE - return bench.rmse_score( - new.reshape( - -1, - ), - ref.reshape( - -1, - ), - ) - - -parser = argparse.ArgumentParser( - description="xgboost gbt + model transform + daal predict benchmark" -) - -parser.add_argument( - "--colsample-bytree", - type=float, - default=1, - help="Subsample ratio of columns " "when constructing each tree", -) -parser.add_argument( - "--count-dmatrix", - default=False, - action="store_true", - help="Count DMatrix creation in time measurements", -) -parser.add_argument( - "--enable-experimental-json-serialization", - default=True, - choices=("True", "False"), - help="Use JSON to store memory snapshots", -) -parser.add_argument( - "--grow-policy", - type=str, - default="depthwise", - help="Controls a way new nodes are added to the tree", -) -parser.add_argument( - "--inplace-predict", - default=False, - action="store_true", - help="Perform inplace_predict instead of default", -) -parser.add_argument( - "--learning-rate", - "--eta", - type=float, - default=0.3, - help="Step size shrinkage used in update " "to prevents overfitting", -) -parser.add_argument( - "--max-bin", - type=int, - default=256, - help="Maximum number of discrete bins to " "bucket continuous features", -) -parser.add_argument( - "--max-delta-step", - type=float, - default=0, - help="Maximum delta step we allow each leaf output to be", -) -parser.add_argument("--max-depth", type=int, default=6, help="Maximum depth of a tree") -parser.add_argument( - "--max-leaves", type=int, default=0, help="Maximum number of nodes to be added" -) -parser.add_argument( - "--min-child-weight", - type=float, - default=1, - help="Minimum sum of instance weight needed in a child", -) -parser.add_argument( - "--min-split-loss", - "--gamma", - type=float, - default=0, - help="Minimum loss reduction required to make" " partition on a leaf node", -) -parser.add_argument( - "--n-estimators", type=int, default=100, help="Number of gradient boosted trees" -) -parser.add_argument( - "--objective", - type=str, - required=True, - choices=("reg:squarederror", "binary:logistic", "multi:softmax", "multi:softprob"), - help="Control a balance of positive and negative weights", -) -parser.add_argument( - "--reg-alpha", type=float, default=0, help="L1 regularization term on weights" -) -parser.add_argument( - "--reg-lambda", type=float, default=1, help="L2 regularization term on weights" -) -parser.add_argument( - "--scale-pos-weight", - type=float, - default=1, - help="Controls a balance of positive and negative weights", -) -parser.add_argument( - "--single-precision-histogram", - default=False, - action="store_true", - help="Build histograms instead of double precision", -) -parser.add_argument( - "--subsample", - type=float, - default=1, - help="Subsample ratio of the training instances", -) -parser.add_argument( - "--tree-method", - type=str, - required=True, - help="The tree construction algorithm used in XGBoost", 
-) - -params = bench.parse_args(parser) - - -X_train, X_test, y_train, y_test = bench.load_data(params) - -xgb_params = { - "booster": "gbtree", - "verbosity": 0, - "learning_rate": params.learning_rate, - "min_split_loss": params.min_split_loss, - "max_depth": params.max_depth, - "min_child_weight": params.min_child_weight, - "max_delta_step": params.max_delta_step, - "subsample": params.subsample, - "sampling_method": "uniform", - "colsample_bytree": params.colsample_bytree, - "colsample_bylevel": 1, - "colsample_bynode": 1, - "reg_lambda": params.reg_lambda, - "reg_alpha": params.reg_alpha, - "tree_method": params.tree_method, - "scale_pos_weight": params.scale_pos_weight, - "grow_policy": params.grow_policy, - "max_leaves": params.max_leaves, - "max_bin": params.max_bin, - "objective": params.objective, - "seed": params.seed, - "single_precision_histogram": params.single_precision_histogram, - "enable_experimental_json_serialization": params.enable_experimental_json_serialization, -} - -xgb_params.update({"nthread": params.threads}) -daal4py.daalinit(params.threads) - -if params.objective.startswith("reg"): - task = "regression" - metric_name, metric_func = "rmse", bench.rmse_score -else: - task = "classification" - metric_name = "accuracy" - metric_func = bench.accuracy_score - if "cudf" in str(type(y_train)): - params.n_classes = y_train[y_train.columns[0]].nunique() - else: - params.n_classes = len(np.unique(y_train)) - - # Covtype has one class more than there is in train - if params.dataset_name == "covtype": - params.n_classes += 1 - - if params.n_classes > 2: - xgb_params["num_class"] = params.n_classes - -t_creat_train, dtrain = bench.measure_function_time( - xgb.DMatrix, X_train, params=params, label=y_train -) -t_creat_test, dtest = bench.measure_function_time( - xgb.DMatrix, X_test, params=params, label=y_test -) - -# SHAP interactions are very expensive - cap the number of rows -interaction_n_rows = max(2_000, 200_000 // (X_test.shape[0] * X_test.shape[1])) - -# not benchmarked, but required for SHAP interactions -dtest_interactions = xgb.DMatrix(X_test[:interaction_n_rows]) - - -def fit(dmatrix): - if dmatrix is None: - dmatrix = xgb.DMatrix(X_train, y_train) - return xgb.train(xgb_params, dmatrix, params.n_estimators) - - -if params.inplace_predict: - - def predict(*args): - return booster.inplace_predict( - np.ascontiguousarray(X_test.values, dtype=np.float32) - ) - -else: - - def predict(dmatrix, **kwargs): # type: ignore - if dmatrix is None: - dmatrix = xgb.DMatrix(X_test, y_test) - return booster.predict(dmatrix, **kwargs) - - -fit_time, booster = bench.measure_function_time( - fit, None if params.count_dmatrix else dtrain, params=params -) -train_metric = metric_func( - convert_xgb_predictions(booster.predict(dtrain), params.objective), y_train -) - -predict_time, y_pred = bench.measure_function_time( - predict, - None if params.inplace_predict or params.count_dmatrix else dtest, - params=params, -) -test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) - -shap_contrib_time, shap_contribs = bench.measure_function_time( - predict, dtest, pred_contribs=True, params=params -) - -shap_interaction_time, shap_interactions = bench.measure_function_time( - predict, dtest_interactions, pred_interactions=True, params=params -) - -transform_time, model_daal = bench.measure_function_time( - daal4py.mb.convert_model, booster, params=params -) - -predict_time_daal, daal_pred = bench.measure_function_time( - model_daal.predict, X_test, params=params 
-) -test_metric_daal = metric_func(y_test, daal_pred) - -if model_daal._is_regression: - shap_contrib_time_daal, daal_contribs = bench.measure_function_time( - model_daal.predict, X_test, pred_contribs=True, params=params - ) - - shap_interaction_time_daal, daal_interactions = bench.measure_function_time( - model_daal.predict, - X_test[:interaction_n_rows], - pred_interactions=True, - params=params, - ) - - contrib_accuracy = shap_accuracy(shap_contribs, daal_contribs) - - interaction_accuracy = shap_accuracy(shap_interactions, daal_interactions) - -else: - # classification currently does not support SHAP values - ( - shap_contrib_time_daal, - shap_interaction_time_daal, - contrib_accuracy, - interaction_accuracy, - ) = [0] * 4 - -bench.print_output( - library="modelbuilders", - algorithm=f"xgboost_{task}_and_modelbuilder", - alg_instance=booster, - alg_params={ - "max-depth": getattr(params, "max_depth", None), - "objective": getattr(params, "objective", None), - }, - stages=[ - "training_preparation", - "training", - "prediction_preparation", - "prediction", - "transformation", - "alternative_prediction", - "shap_contrib_prediction", - "alternative_shap_contrib_prediction", - "shap_interaction_prediction", - "alternative_shap_interaction_prediction", - ], - data=[X_train] * 2 + [X_test] * 2 + [X_train] + [X_test] * 5, - params=params, - functions=[ - "xgb.dmatrix.train", - "xgb.train", - "xgb.dmatrix.test", - "xgb.predict", - "daal4py.get_gbt_model_from_xgboost", - "daal4py.predict", - "xgb.predict(pred_contribs=True)", - "daal4py.predict(pred_contribs=True)", - "xgb.predict(pred_interactions=True)", - "daal4py.predict(pred_interactions=True)", - ], - times=[ - t_creat_train, - fit_time, - t_creat_test, - predict_time, - transform_time, - predict_time_daal, - shap_contrib_time, - shap_contrib_time_daal, - shap_interaction_time, - shap_interaction_time_daal, - ], - metric_type=[metric_name, "rmse"], - metrics=[ - [ - None, - train_metric, - None, - test_metric, - None, - test_metric_daal, - None, - None, - None, - None, - ], - [ - None, - None, - None, - None, - None, - None, - None, - contrib_accuracy, - None, - interaction_accuracy, - ], - ], -) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..68b3e5dc0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + + +[tool.black] +line-length = 90 +target-version = ['py39', 'py310', 'py311', 'py312'] +extend-ignore = 'E203' + +[tool.isort] +profile = "black" +line_length = 90 diff --git a/report_generator/README.md b/report_generator/README.md deleted file mode 100755 index a5871d512..000000000 --- a/report_generator/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Report generator for scikit-learn_bench - -Report generator produces Excel table file from json benchmark log files. - -Run `python report_generator.py --result-files bench_log_1.json,bench_log_2.json [--report-file new_report.xlsx --generation-config default_report_gen_config.json]` to launch report generation. - -runner options: -* ``result-files`` : comma-separated benchmark json result file paths -* ``report-file`` : report file path -* ``generation-config`` : generation configuration file path - -config parameters: -* ``header``: The column names in the table header. These parameters are also used to compare reports. If a name is compound, use the ``:`` symbol to separate its parts. -* ``comparison_method``: The formula for the comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``, where ``1`` is the first result and ``2`` is the second result. The default is ``2 / 1``, which returns the ratio of the second result to the first one. -* ``aggregation_metrics``: The metrics applied to the columns with the comparisons of two reports. You can use multiple metrics. For each of these metrics, a separate sheet with a summary is compiled. The metrics should be Excel functions. For example: ``"geomean", "average"``. diff --git a/report_generator/default_report_gen_config.json b/report_generator/default_report_gen_config.json deleted file mode 100755 index 08de6dcf4..000000000 --- a/report_generator/default_report_gen_config.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "header": [ - "algorithm", - "stage", - "device", - "input_data:data_order", - "input_data:data_type", - "input_data:dataset_name", - "input_data:rows", - "input_data:columns", - "input_data:classes", - "input_data:n_clusters", - "algorithm_parameters:max_features", - "algorithm_parameters:n_estimators", - "algorithm_parameters:max_depth", - "algorithm_parameters:method", - "algorithm_parameters:n_neighbors", - "algorithm_parameters:n-neighbors", - "algorithm_parameters:max_iter" - ], - "comparison_method": { - "default": "2 / 1" - }, - "aggregation_metrics": [ - "geomean" - ] -} \ No newline at end of file diff --git a/report_generator/model_builder_report_gen_config.json b/report_generator/model_builder_report_gen_config.json deleted file mode 100755 index ef2768ffc..000000000 --- a/report_generator/model_builder_report_gen_config.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "header": [ - "algorithm", - "stage", - "device", - "input_data:data_order", - "input_data:data_type", - "input_data:dataset_name", - "input_data:rows", - "input_data:columns", - "input_data:classes", - "input_data:n_clusters", - "algorithm_parameters:max-depth", - "algorithm_parameters:objective" - ], - "comparison_method": { - "default": "2 / 1" - }, - "aggregation_metrics": [ - "geomean" - ] -} diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py deleted file mode 100755 index bad2f3c76..000000000 --- a/report_generator/report_generator.py +++ /dev/null @@ -1,516 +0,0 @@ -# =============================================================================== -# Copyright 
2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import datetime -import hashlib -import json -from typing import Any, List, Dict -from openpyxl.formatting.rule import ColorScaleRule -from openpyxl.styles import Font -from openpyxl.utils import get_column_letter - -import openpyxl - - -def get_property(entry: Dict[str, Any], prop: str): - keys = prop.split(':') - value = entry - for key in keys: - if key not in value: - return None - value = value[key] - if (not value): - return "null" - return value - - -def xy_to_excel_cell(x: int, y: int) -> str: - return '{}{}'.format(get_column_letter(x + 1), y + 1) - - -def get_excel_cell(work_sheet, x: int, y: int): - return work_sheet[xy_to_excel_cell(x, y)] - - -def write_cell( - work_sheet, - x: int, - y: int, - value: str, - *, - bold=False, - number_format='General', -) -> None: - work_sheet[xy_to_excel_cell(x, y)] = value - work_sheet[xy_to_excel_cell(x, y)].number_format = number_format - if bold: - work_sheet[xy_to_excel_cell(x, y)].font = Font(bold=True) - - -def is_equal_dict(a: Dict[str, Any], b: Dict[str, Any], props: List[str]) -> bool: - for prop in props: - if get_property(a, prop) != get_property(b, prop): - return False - return True - - -def get_metrics(report: Dict[str, Any]) -> List[str]: - metrics = list() - was = False - for i in report: - if i == "time[s]": - was = True - continue - if was: - metrics.append(i) - return metrics - - -def make_unique(a: List[Any]) -> List[Any]: - result = list() - d = dict() - for i in a: - if i in d: - continue - d[i] = 1 - result.append(i) - return result - - -def get_range( - start_x: int, - finish_x: int, - start_y: int, - finish_y: int, -) -> str: - return xy_to_excel_cell(start_x, start_y) + ':' + \ - xy_to_excel_cell(finish_x, finish_y) - - -def can_convert_to_float(string: str) -> bool: - try: - float(string) - except ValueError: - return False - return True - - -def write_aggregation_metric( - ws, - write_x: int, - write_y: int, - metric_range: str, - metric_name: str, -) -> None: - metric_string = '=' + metric_name + '(' + metric_range + ')' - write_cell( - ws, - write_x, - write_y, - metric_string, - number_format='0.00', - ) - - -def write_header_of_sheet( - work_sheet, - algorithm: str, - header_columns: List[str], - y_offset: int, - metrics: List[str], - agg_offset: int, - agg_metrics: List[str], - json_results: List[Dict[str, Any]], - left_offset: int, -) -> None: - # write header - for ind, val in enumerate(header_columns): - write_cell(work_sheet, ind, y_offset, val.split(':')[-1], bold=True) - # write aggregation metrics - if len(json_results) >= 2: - for ind, val in enumerate(agg_metrics): - write_cell( - work_sheet, - left_offset + len(json_results) - 1, - agg_offset + ind, - val, - bold=True, - ) - # write names of metrics and jsons - metric_offset = 0 - json_results_len = len(json_results) - for metric in metrics: - write_cell( - work_sheet, - 
left_offset + metric_offset, - y_offset - 1, - metric, - bold=True, - ) - for json_res in json_results: - write_cell( - work_sheet, - left_offset + metric_offset, - y_offset, - json_res["file_name"], - bold=True, - ) - metric_offset += 1 - for i in range(json_results_len): - for j in range(i + 1, json_results_len): - write_cell( - work_sheet, - left_offset + metric_offset, - y_offset, - json_results[i]['file_name'] + ' vs ' + json_results[j]['file_name'], - bold=True, - ) - metric_offset += 1 - - -def get_color_rule(metric: str) -> Any: - red = 'F85D5E' - yellow = 'FAF52E' - green = '58C144' - if metric in ['geomean', 'time[s]']: - return ColorScaleRule( - start_type='num', start_value=0.5, start_color=red, - mid_type='num', mid_value=1, mid_color=yellow, - end_type='num', end_value=5, end_color=green) - if metric == 'average': - return ColorScaleRule( - start_type='num', start_value=-3, start_color=red, - mid_type='num', mid_value=0, mid_color=yellow, - end_type='num', end_value=3, end_color=green) - return ColorScaleRule( - start_type='percentile', start_value=10, start_color=red, - mid_type='percentile', mid_value=50, mid_color=yellow, - end_type='percentile', end_value=90, end_color=green) - - -def get_comparison_method(config: Dict[str, str], metric: str) -> str: - return config[metric] if metric in config else config['default'] - - -def get_ratio_string(a: str, b: str, comparison_method: str, num_digits=3) -> str: - splitted_comparison_method = comparison_method.split(' ') - if splitted_comparison_method[0] == "2": - a, b = b, a - return '=ROUND(' + a + splitted_comparison_method[1] + b + f',{num_digits})' - - -def get_header_parameters( - json_results: List[Dict[str, Any]], - full_header_parameters: List[str], - algorithm: str, -) -> List[str]: - for json_res in json_results: - for report in json_res['results']: - if report['algorithm'] != algorithm: - continue - result = list() - for param in full_header_parameters: - if get_property(report, param) is not None: - result.append(param) - return result - raise ValueError(f'There is no {algorithm} in input json(s)') - - -parser = argparse.ArgumentParser() -parser.add_argument('--result-files', type=str, required=True, - help='Benchmark result file names separated by commas') -parser.add_argument('--report-file', type=str, - default=f'report_{str(datetime.date.today())}.xlsx') -parser.add_argument('--generation-config', type=str, - default='default_report_gen_config.json') -args = parser.parse_args() - -# Read input json(s) -json_results: List[Dict[str, Any]] = list() -for file_name in args.result_files.split(','): - with open(file_name, 'r') as file: - res = json.load(file) - res['file_name'] = file_name - json_results.append(res) - -# Read config -with open(args.generation_config, 'r') as file: - gen_config = json.load(file) - -# compute hash for software and hardware configurations -HASH_LIMIT = 8 -for i, json_res in enumerate(json_results): - for ware in ['software', 'hardware']: - h = hashlib.sha256() - h.update(bytes(str(json_res[ware]), encoding='utf-8')) - json_res[f'{ware}_hash'] = h.hexdigest()[:HASH_LIMIT] - -# getting metrics for each algorithm -available_algos_and_metrics: Dict[str, List[str]] = dict() -for json_res in json_results: - for report in json_res['results']: - metrics: List[str] = get_metrics(report) - if report['algorithm'] in available_algos_and_metrics: - available_algos_and_metrics[report['algorithm']] += metrics - else: - available_algos_and_metrics[report['algorithm']] = metrics - -for ind, val in 
enumerate(available_algos_and_metrics): - available_algos_and_metrics[val] = ['time[s]'] + make_unique(available_algos_and_metrics[val]) - - -HEAD_OFFSET = 4 -JSON_RESULTS_LEN = len(json_results) - -stages: List[str] = [ - 'training_preparation', - 'training', - 'computation', - 'prediction_preparation', - 'prediction', - 'alternative_prediction', - 'transformation', - 'search', - 'predict_proba', -] - -summary: Dict[str, Dict[str, Dict[str, Dict[str, str]]]] = dict() -wb = openpyxl.Workbook() - -for algo in available_algos_and_metrics: - # algo[:31] because excel warning about length of sheet name no more than 31 symbols - ws = wb.create_sheet(title=f'{algo[:31]}') - header_params = get_header_parameters(json_results, gen_config['header'], algo) - LEFT_OFFSET = len(header_params) - # writing table header - for offset, val in enumerate(['file_name', 'software_hash', 'hardware_hash']): - write_cell(ws, 0, offset, val) - for i, json_res in enumerate(json_results): - write_cell(ws, i + 1, offset, json_res[val]) - - y_offset = 0 - for stage_key in stages: - # list of already used results - used = [ - [False for j in range(len(json_results[i]['results']))] - for i in range(len(json_results)) - ] - begin_y_offset = y_offset - for json_res_ind, json_res in enumerate(json_results): - for report_ind, report in enumerate(json_res['results']): - if report['stage'] != stage_key or \ - report['algorithm'] != algo or \ - used[json_res_ind][report_ind] is True: - continue - # write parameters - for offset, config in enumerate(header_params): - write_cell(ws, offset, HEAD_OFFSET + 1 + y_offset, get_property(report, config)) - # write all metrics in report - metric_offset = 0 - for metric in available_algos_and_metrics[algo]: - write_cell( - ws, - LEFT_OFFSET + metric_offset + json_res_ind, HEAD_OFFSET + 1 + y_offset, - get_property(report, metric), - number_format='0.00', - ) - metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 - used[json_res_ind][report_ind] = True - - # try to find in other configs report with same parameters - for json_res_comp_ind, json_res_comp in enumerate(json_results[json_res_ind + 1:]): - original_index = json_res_ind + 1 + json_res_comp_ind - for report_comp_ind, report_comp in enumerate(json_res_comp['results']): - if report_comp['stage'] != stage_key or \ - report_comp['algorithm'] != algo or \ - used[original_index][report_comp_ind] is True or \ - not is_equal_dict(report, report_comp, header_params): - continue - metric_offset = 0 - for metric in available_algos_and_metrics[algo]: - write_cell( - ws, - LEFT_OFFSET + original_index + metric_offset, - HEAD_OFFSET + y_offset + 1, - get_property(report_comp, metric), - number_format='0.00', - ) - metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 - used[original_index][report_comp_ind] = True - y_offset += 1 - - if y_offset == begin_y_offset: - # nothing was written, so do not have to write header & do comparison - continue - write_header_of_sheet( - ws, - algo, - header_params, - HEAD_OFFSET + begin_y_offset, - available_algos_and_metrics[algo], - HEAD_OFFSET + y_offset + 1, - gen_config['aggregation_metrics'], - json_results, - LEFT_OFFSET, - ) - # write aggregation metric & save info for summary - metric_offset = JSON_RESULTS_LEN - for metric in available_algos_and_metrics[algo]: - comparison_offset = 0 - for i in range(JSON_RESULTS_LEN): - for j in range(i + 1, JSON_RESULTS_LEN): - # comprasion - for y in range(HEAD_OFFSET + begin_y_offset + 1, HEAD_OFFSET + y_offset + 1): - first_offset = 
LEFT_OFFSET + i + metric_offset - JSON_RESULTS_LEN - second_offset = LEFT_OFFSET + j + metric_offset - JSON_RESULTS_LEN - first_cell = get_excel_cell(ws, first_offset, y) - second_cell = get_excel_cell(ws, second_offset, y) - - if first_cell.value is None or\ - second_cell.value is None or \ - not can_convert_to_float(str(first_cell.value)) or \ - not can_convert_to_float(str(second_cell.value)): - continue - write_cell( - ws, - LEFT_OFFSET + metric_offset + comparison_offset, - y, - get_ratio_string( - xy_to_excel_cell(first_offset, y), - xy_to_excel_cell(second_offset, y), - get_comparison_method(gen_config['comparison_method'], metric), - ), - number_format='0.000', - ) - # fill comparison range by color rule - ws.conditional_formatting.add( - get_range( - LEFT_OFFSET + metric_offset + comparison_offset, - LEFT_OFFSET + metric_offset + comparison_offset, - HEAD_OFFSET + 1 + begin_y_offset, - HEAD_OFFSET + y_offset, - ), - get_color_rule(metric), - ) - # write aggregation metric - for agg_offset, agg_metric in enumerate(gen_config['aggregation_metrics']): - write_aggregation_metric( - ws, - LEFT_OFFSET + metric_offset + comparison_offset, - HEAD_OFFSET + 1 + y_offset + agg_offset, - get_range( - LEFT_OFFSET + metric_offset + comparison_offset, - LEFT_OFFSET + metric_offset + comparison_offset, - HEAD_OFFSET + 1 + begin_y_offset, - HEAD_OFFSET + y_offset, - ), - agg_metric, - ) - - column_name = \ - json_results[i]['file_name'] + \ - ' vs ' + \ - json_results[j]['file_name'] + \ - ' (' + stage_key + ')' - - cell_name_to_summary = \ - '=' + algo[:31] + '!' + \ - xy_to_excel_cell(LEFT_OFFSET + metric_offset + comparison_offset, - HEAD_OFFSET + 1 + y_offset + agg_offset) - if agg_metric not in summary: - summary[agg_metric] = dict() - if column_name not in summary[agg_metric]: - summary[agg_metric][column_name] = dict() - if algo not in summary[agg_metric][column_name]: - summary[agg_metric][column_name][algo] = dict() - summary[agg_metric][column_name][algo].update( - {f'{metric}': cell_name_to_summary}) - comparison_offset += 1 - metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 - # for comfortable view - y_offset += len(gen_config['aggregation_metrics']) + 3 - -# write summary for each aggregation metric -for agg_metric in gen_config['aggregation_metrics']: - if JSON_RESULTS_LEN == 1: - continue - y_offset = 0 - # write summary - ws = wb.create_sheet('Summary' + f' ({agg_metric})', 0) - for name_ind, name in enumerate(summary[agg_metric]): - # write table name - write_cell(ws, 0, y_offset, name, bold=True) - # getting unique list of metrics on current comparison - metrics_in_current_summary = list() - for algo in summary[agg_metric][name]: - for metric in summary[agg_metric][name][algo]: - metrics_in_current_summary.append(metric) - metrics_in_current_summary = make_unique(metrics_in_current_summary) - - # fill table - for metric_ind, metric in enumerate(metrics_in_current_summary): - # write metric name - write_cell(ws, metric_ind + 1, y_offset + 1, metric) - for algo_ind, algo in enumerate(summary[agg_metric][name]): - if metric not in summary[agg_metric][name][algo]: - continue - # write algorithm name - write_cell( - ws, - 0, - y_offset + algo_ind + 2, - algo - ) - # write geomean - write_cell( - ws, - metric_ind + 1, - y_offset + algo_ind + 2, - summary[agg_metric][name][algo][metric], - number_format='0.00', - ) - - # color some range by color rule - ws.conditional_formatting.add( - get_range( - 1, - len(metrics_in_current_summary), - y_offset + 2, - y_offset + 
len(summary[agg_metric][name]) + 1, - ), - get_color_rule(agg_metric), - ) - y_offset += len(summary[agg_metric][name]) + 3 - -# write hardware & software configs -for i, json_res in enumerate(json_results): - ws = wb.create_sheet(title=f"SW config n{i}_{json_res['software_hash']}") - ws[xy_to_excel_cell(0, 0)] = \ - f"Software configuration {i} (hash: {json_res['software_hash']})" - sw_conf = json.dumps(json_res['software'], indent=4).split('\n') - for j, val in enumerate(sw_conf): - ws[xy_to_excel_cell(0, 1 + j)] = val - - ws = wb.create_sheet(title=f"HW config n{i}_{json_res['hardware_hash']}") - ws[xy_to_excel_cell(0, 0)] = \ - f"Hardware configuration {i} (hash: {json_res['hardware_hash']})" - hw_conf = json.dumps(json_res['hardware'], indent=4).split('\n') - for j, val in enumerate(hw_conf): - ws[xy_to_excel_cell(0, 1 + j)] = val - -wb.remove(wb['Sheet']) -wb.save(args.report_file) diff --git a/report_generator/sklearn_metrics_report_gen_config.json b/report_generator/sklearn_metrics_report_gen_config.json deleted file mode 100644 index 5adf457f7..000000000 --- a/report_generator/sklearn_metrics_report_gen_config.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "header": [ - "stage", - "device", - "input_data:data_order", - "input_data:data_type", - "input_data:dataset_name", - "input_data:rows", - "input_data:columns", - "input_data:classes", - "input_data:n_clusters", - "n_clusters", - "algorithm_parameters:algorithm", - "algorithm_parameters:tol", - "algorithm_parameters:max_iter", - "algorithm_parameters:init", - "algorithm_parameters:n_init", - "algorithm_parameters:alpha", - "algorithm_parameters:l1_ratio", - "algorithm_parameters:solver", - "algorithm_parameters:C", - "algorithm_parameters:cache_size", - "algorithm_parameters:kernel", - "algorithm_parameters:nu", - "algorithm_parameters:eps", - "algorithm_parameters:n_neighbors", - "algorithm_parameters:metric", - "algorithm_parameters:n_estimators" - ], - "comparison_method": { - "default": "2 / 1" - }, - "aggregation_metrics": [ - "geomean" - ] -} diff --git a/report_generator/sklearn_performance_report_gen_config.json b/report_generator/sklearn_performance_report_gen_config.json deleted file mode 100644 index 23c30c243..000000000 --- a/report_generator/sklearn_performance_report_gen_config.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "header": [ - "algorithm", - "stage", - "device", - "input_data:data_order", - "input_data:data_type", - "input_data:dataset_name", - "input_data:rows", - "input_data:columns", - "input_data:classes", - "algorithm_parameters:tol", - "algorithm_parameters:max_iter", - "algorithm_parameters:solver", - "algorithm_parameters:C", - "algorithm_parameters:kernel", - "algorithm_parameters:nu", - "algorithm_parameters:eps", - "algorithm_parameters:n_neighbors", - "algorithm_parameters:n_estimators", - "algorithm_parameters:n_clusters", - "algorithm_parameters:min_samples", - "algorithm_parameters:fit_intercept", - "algorithm_parameters:max_depth", - "algorithm_parameters:max_features" - ], - "comparison_method": { - "default": "2 / 1" - }, - "aggregation_metrics": [ - "geomean" - ] -} diff --git a/requirements-common.txt b/requirements-common.txt deleted file mode 100644 index fd8816bd0..000000000 --- a/requirements-common.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-learn==1.2.0 -pandas==1.5.2 -openpyxl -tqdm -requests diff --git a/runner.py b/runner.py deleted file mode 100755 index 6de494785..000000000 --- a/runner.py +++ /dev/null @@ -1,329 +0,0 @@ -# 
=============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import json -import logging -import os -import socket -import sys -from typing import Any, Dict, List, Union - -import utils -from pathlib import Path - - -def get_configs(path: Path) -> List[str]: - result = list() - for dir_or_file in os.listdir(path): - new_path = Path(path, dir_or_file) - if dir_or_file.endswith('.json'): - result.append(str(new_path)) - elif os.path.isdir(new_path): - result += get_configs(new_path) - return result - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--configs', metavar='ConfigPath', type=str, - default='configs/config_example.json', - help='The path to a configuration file or ' - 'a directory that contains configuration files') - parser.add_argument('--device', '--devices', default='host cpu gpu none', type=str, nargs='+', - choices=('host', 'cpu', 'gpu', 'none'), - help='Availible execution context devices. ' - 'This parameter only marks devices as available, ' - 'make sure to add the device to the config file ' - 'to run it on a specific device') - parser.add_argument('--dummy-run', default=False, action='store_true', - help='Run configuration parser and datasets generation ' - 'without benchmarks running') - parser.add_argument('--dtype', '--dtypes', type=str, default="float32 float64", nargs='+', - choices=("float32", "float64"), - help='Available floating point data types' - 'This parameter only marks dtype as available, ' - 'make sure to add the dtype parameter to the config file ') - parser.add_argument('--workload-size', type=str, default="small medium large", nargs='+', - choices=("small", "medium", "large"), - help='Available workload sizes,' - 'make sure to add the workload-size parameter to the config file ' - 'unmarked workloads will be launched anyway') - parser.add_argument('--no-intel-optimized', default=False, action='store_true', - help='Use Scikit-learn without Intel optimizations') - parser.add_argument('--output-file', default='results.json', - type=argparse.FileType('w'), - help='Output file of benchmarks to use with their runner') - parser.add_argument('--verbose', default='INFO', type=str, - choices=("ERROR", "WARNING", "INFO", "DEBUG"), - help='Print additional information during benchmarks running') - parser.add_argument('--report', nargs='?', default=None, metavar='ConfigPath', type=str, - const='report_generator/default_report_gen_config.json', - help='Create an Excel report based on benchmarks results. ' - 'If the parameter is not set, the reporter will not be launched. ' - 'If the parameter is set and the config is not specified, ' - 'the default config will be used. 
' - 'Need "openpyxl" library') - args = parser.parse_args() - - logging.basicConfig( - stream=sys.stdout, format='%(levelname)s: %(message)s', level=args.verbose) - hostname = socket.gethostname() - - env = os.environ.copy() - if 'DATASETSROOT' in env: - datasets_root = env['DATASETSROOT'] - logging.info(f'Datasets folder at {datasets_root}') - elif 'DAAL_DATASETS' in env: - datasets_root = env['DAAL_DATASETS'] - logging.info(f'Datasets folder at {datasets_root}') - else: - datasets_root = '' - logging.info('Datasets folder is not set, using local folder') - - json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = { - 'hardware': utils.get_hw_parameters(), - 'software': utils.get_sw_parameters(), - 'results': [] - } - is_successful = True - # getting jsons from folders - paths_to_configs: List[str] = list() - for config_name in args.configs.split(','): - if os.path.isdir(config_name): - config_name = get_configs(Path(config_name)) - else: - config_name = [config_name] - paths_to_configs += config_name - args.configs = ','.join(paths_to_configs) - - for config_name in args.configs.split(','): - logging.info(f'Config: {config_name}') - with open(config_name, 'r') as config_file: - config = json.load(config_file) - - # get parameters that are common for all cases - common_params = config['common'] - for params_set in config['cases']: - params = common_params.copy() - params.update(params_set.copy()) - - if 'workload-size' in params: - if params['workload-size'] not in args.workload_size: - continue - del params['workload-size'] - - device = [] - if 'device' not in params: - if 'sklearn' in params['lib']: - logging.info('The device parameter value is not defined in config, ' - 'none is used') - device = ['none'] - elif not isinstance(params['device'], list): - device = [params['device']] - else: - device = params['device'] - params["device"] = [dv for dv in device if dv in args.device] - - dtype = [] - if 'dtype' not in params: - dtype = ['float64'] - elif not isinstance(params['dtype'], list): - dtype = [params['dtype']] - else: - dtype = params['dtype'] - params['dtype'] = [dt for dt in dtype if dt in args.dtype] - - algorithm = params['algorithm'] - libs = params['lib'] - if not isinstance(libs, list): - libs = [libs] - del params['dataset'], params['algorithm'], params['lib'] - cases = utils.generate_cases(params) - logging.info(f'{algorithm} algorithm: {len(libs) * len(cases)} case(s),' - f' {len(params_set["dataset"])} dataset(s)\n') - - if (len(libs) * len(cases) == 0): - continue - - for dataset in params_set['dataset']: - if dataset['source'] in ['csv', 'npy']: - dataset_name = dataset['name'] if 'name' in dataset else 'unknown' - if 'training' not in dataset or 'x' not in dataset['training']: - logging.warning( - f'Dataset {dataset_name} could not be loaded. \n' - 'Training data for algorithm is not specified' - ) - continue - - files = {} - - files['file-X-train'] = dataset['training']["x"] - if 'y' in dataset['training']: - files['file-y-train'] = dataset['training']["y"] - if 'testing' in dataset: - files['file-X-test'] = dataset["testing"]["x"] - if 'y' in dataset['testing']: - files['file-y-test'] = dataset["testing"]["y"] - - dataset_path = utils.find_the_dataset(dataset_name, datasets_root, - files.values()) - if dataset_path is None: - logging.warning( - f'Dataset {dataset_name} could not be loaded. \n' - 'Check the correct name or expand the download in ' - 'the folder dataset.' 
- ) - continue - elif not dataset_path and datasets_root: - logging.info( - f'{dataset_name} is taken from local folder' - ) - - paths = '' - for data_path, data_file in files.items(): - paths += f'--{data_path} {os.path.join(dataset_path, data_file)} ' - - elif dataset['source'] == 'synthetic': - class GenerationArgs: - classes: int - clusters: int - features: int - filex: str - filextest: str - filey: str - fileytest: str - samples: int - seed: int - test_samples: int - type: str - gen_args = GenerationArgs() - - if 'seed' in params_set: - gen_args.seed = params_set['seed'] - else: - gen_args.seed = 777 - - # default values - gen_args.clusters = 10 - gen_args.type = dataset['type'] - gen_args.samples = dataset['training']['n_samples'] - gen_args.features = dataset['n_features'] - if 'n_classes' in dataset: - gen_args.classes = dataset['n_classes'] - cls_num_for_file = f'-{dataset["n_classes"]}' - elif 'n_clusters' in dataset: - gen_args.clusters = dataset['n_clusters'] - cls_num_for_file = f'-{dataset["n_clusters"]}' - else: - cls_num_for_file = '' - - file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-' - file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy' - - files = {} - gen_args.filex = f'{file_prefix}X-train{file_postfix}' - files['file-X-train'] = gen_args.filex - if gen_args.type not in ['blobs']: - gen_args.filey = f'{file_prefix}y-train{file_postfix}' - files['file-y-train'] = gen_args.filey - - if 'testing' in dataset: - gen_args.test_samples = dataset['testing']['n_samples'] - gen_args.filextest = f'{file_prefix}X-test{file_postfix}' - files['file-X-test'] = gen_args.filextest - if gen_args.type not in ['blobs']: - gen_args.fileytest = f'{file_prefix}y-test{file_postfix}' - files['file-y-test'] = gen_args.fileytest - else: - gen_args.test_samples = 0 - gen_args.filextest = gen_args.filex - files['file-X-test'] = gen_args.filextest - if gen_args.type not in ['blobs']: - gen_args.fileytest = gen_args.filey - files['file-y-test'] = gen_args.filey - - dataset_name = f'synthetic_{gen_args.type}' - - dataset_path = utils.find_or_gen_dataset(gen_args, - datasets_root, files.values()) - if dataset_path is None: - logging.warning( - f'Dataset {dataset_name} could not be generated. \n' - ) - continue - - paths = '' - for data_path, data_file in files.items(): - paths += f'--{data_path} {os.path.join(dataset_path, data_file)} ' - else: - logging.warning('Unknown dataset source. 
Only synthetics datasets ' - 'and csv/npy files are supported now') - - no_intel_optimize = \ - '--no-intel-optimized ' if args.no_intel_optimized else '' - for lib in libs: - for i, case in enumerate(cases): - command = f'python {lib}_bench/{algorithm}.py ' \ - + no_intel_optimize \ - + f'--arch {hostname} {case} {paths} ' \ - + f'--dataset-name {dataset_name}' - command = ' '.join(command.split()) - logging.info(command) - if not args.dummy_run: - case = f'{lib},{algorithm} ' + case - stdout, stderr = utils.read_output_from_command( - command, env=os.environ.copy()) - stdout, extra_stdout = utils.filter_stdout(stdout) - stderr = utils.filter_stderr(stderr) - - print(stdout, end='\n') - - if extra_stdout != '': - stderr += f'CASE {case} EXTRA OUTPUT:\n' \ - + f'{extra_stdout}\n' - try: - if isinstance(json_result['results'], list): - json_result['results'].extend( - json.loads(stdout)) - except json.JSONDecodeError as decoding_exception: - stderr += f'CASE {case} JSON DECODING ERROR:\n' \ - + f'{decoding_exception}\n{stdout}\n' - - if stderr != '': - if 'daal4py' not in stderr: - is_successful = False - logging.warning( - 'Error in benchmark: \n' + stderr) - - json.dump(json_result, args.output_file, indent=4) - name_result_file = args.output_file.name - args.output_file.close() - - if args.report: - command = 'python report_generator/report_generator.py ' \ - + f'--result-files {name_result_file} ' \ - + f'--report-file {name_result_file}.xlsx ' \ - + '--generation-config ' + args.report - logging.info(command) - stdout, stderr = utils.read_output_from_command(command) - if stderr != '': - logging.warning('Error in report generator: \n' + stderr) - is_successful = False - - if not is_successful: - logging.warning('benchmark running had runtime errors') - sys.exit(1) diff --git a/cuml_bench/__init__.py b/sklbench/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from cuml_bench/__init__.py rename to sklbench/__init__.py diff --git a/sklbench/__main__.py b/sklbench/__main__.py new file mode 100644 index 000000000..0dce22b42 --- /dev/null +++ b/sklbench/__main__.py @@ -0,0 +1,33 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import sys + +from sklbench.runner import get_parser_description, get_runner_parser, run_benchmarks + + +def main(): + parser = get_runner_parser() + args = parser.parse_args() + if args.describe_parser: + print(get_parser_description(parser)) + return 0 + else: + return run_benchmarks(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sklbench/benchmarks/README.md b/sklbench/benchmarks/README.md new file mode 100644 index 000000000..7dcb13aa2 --- /dev/null +++ b/sklbench/benchmarks/README.md @@ -0,0 +1,41 @@ +# Benchmarks + +```mermaid +flowchart LR + A["Benchmarking case parameters\n[JSON-formatted string]"] --> C[Individual benchmark] + B["Benchmarking case filters\n[JSON-formatted string]"] --> C + C --> D["Raw results with parameters and metrics\n[JSON-formatted string]"] + + classDef inputOutputStyle fill:#44b,color:white,stroke-width:2px,stroke:white; + classDef benchStyle font-size:x-large + class A inputOutputStyle + class B inputOutputStyle + class D inputOutputStyle + class C benchStyle +``` + +## `Scikit-learn Estimator` + +Benchmark workflow: + - Load estimator from the specified library by recursive module search + - Load data with a common loader function + - Assign special values that require estimator/data to be loaded + - Get sklearn/sklearnex context, estimator parameters, running parameters + - Measure required estimator methods + - Combine metrics and parameters into the output + +See [benchmark-specific config parameters](../../configs/README.md#benchmark-specific-parameters). + +## `Function` + +Benchmark workflow: + - Load function from the specified library by recursive module search + - Load data with a common loader function + - Construct data arguments in specified order + - Assign special values that require estimator/data to be loaded + - Measure function performance metrics + +See [benchmark-specific config parameters](../../configs/README.md#benchmark-specific-parameters). + +--- +[Documentation tree](../../README.md#-documentation) diff --git a/daal4py_bench/__init__.py b/sklbench/benchmarks/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from daal4py_bench/__init__.py rename to sklbench/benchmarks/__init__.py diff --git a/sklbench/benchmarks/common.py b/sklbench/benchmarks/common.py new file mode 100644 index 000000000..7f81386ee --- /dev/null +++ b/sklbench/benchmarks/common.py @@ -0,0 +1,95 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse +import json +from typing import Dict + +from ..utils.bench_case import get_bench_case_value, get_data_name +from ..utils.custom_types import BenchCase +from ..utils.logger import logger + + +def enrich_result(result: Dict, bench_case: BenchCase) -> Dict: + """Common function for all benchmarks to update + the result with additional information""" + result.update( + { + "dataset": get_data_name(bench_case, shortened=True), + "library": get_bench_case_value(bench_case, "algorithm:library").replace( + "sklbench.emulators.", "" + ), + "device": get_bench_case_value(bench_case, "algorithm:device"), + } + ) + enable_modelbuilders = get_bench_case_value( + bench_case, "algorithm:enable_modelbuilders", False + ) + if enable_modelbuilders and result["library"] in ["xgboost", "lightgbm", "catboost"]: + # NOTE: while modelbuilders are stored in `daal4py.mb` namespace + # their results are saved as `sklearnex` for better report readability + logger.debug( + "Modelbuilders are enabled, changing library " + f"`{result['library']}` to `sklearnex` in benchmark output." + ) + result["library"] = "sklearnex" + taskset = get_bench_case_value(bench_case, "bench:taskset", None) + if taskset is not None: + result.update({"taskset": taskset}) + distributor = get_bench_case_value(bench_case, "bench:distributor") + if distributor is not None: + result.update({"distributor": distributor}) + mpi_params = get_bench_case_value(bench_case, "bench:mpi_params", dict()) + for mpi_key, mpi_value in mpi_params.items(): + result[f"mpi_{mpi_key}"] = mpi_value + return result + + +def check_to_print_result(bench_case: BenchCase) -> bool: + """Check if the benchmark should print the result""" + distribution = get_bench_case_value(bench_case, "bench:distributor") + if distribution == "mpi": + from mpi4py import MPI + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + if rank != 0: + return False + return True + + +def main_template(main_method): + parser = argparse.ArgumentParser() + parser.add_argument("--bench-case", required=True, type=str) + parser.add_argument("--filters", required=True, type=str) + parser.add_argument( + "--log-level", + default="WARNING", + type=str, + choices=("ERROR", "WARNING", "INFO", "DEBUG"), + help="Logging level for benchmark", + ) + args = parser.parse_args() + + logger.setLevel(args.log_level) + + bench_case = json.loads(args.bench_case) + filters = json.loads(args.filters)["filters"] + + results = main_method(bench_case, filters) + + if check_to_print_result(bench_case): + print(json.dumps(results, indent=4)) diff --git a/sklbench/benchmarks/custom_function.py b/sklbench/benchmarks/custom_function.py new file mode 100644 index 000000000..25abb900e --- /dev/null +++ b/sklbench/benchmarks/custom_function.py @@ -0,0 +1,114 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
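As a rough usage sketch, the runner is expected to launch each benchmark in a separate process, passing the case and filters as JSON strings to the argparse flags defined in `main_template` above. Two assumptions here: colon-separated config paths (such as `algorithm:library`) map to nested JSON keys, and an empty `filters` list lets the case pass `bench_case_filter`.

```python
# Sketch: invoking the estimator benchmark (defined later in this patch) directly,
# the way main_template() expects its --bench-case / --filters / --log-level arguments.
import json
import subprocess

bench_case = {
    "algorithm": {"library": "sklearnex", "estimator": "KMeans"},
    "data": {
        "source": "make_blobs",
        "generation_kwargs": {"n_samples": 20000, "n_features": 20, "centers": 10},
    },
}
subprocess.run(
    [
        "python", "-m", "sklbench.benchmarks.sklearn_estimator",
        "--bench-case", json.dumps(bench_case),
        "--filters", json.dumps({"filters": []}),  # assumed: empty list filters nothing out
        "--log-level", "INFO",
    ],
    check=True,
)
```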
+# =============================================================================== + +from typing import Dict, List, Tuple + +from ..datasets import load_data +from ..datasets.transformer import split_and_transform_data +from ..utils.bench_case import get_bench_case_value +from ..utils.common import get_module_members +from ..utils.config import bench_case_filter +from ..utils.custom_types import BenchCase +from ..utils.logger import logger +from ..utils.measurement import measure_case +from ..utils.special_params import assign_case_special_values_on_run +from .common import enrich_result, main_template + + +def get_function_instance(library_name: str, function_name: str): + _, functions_map = get_module_members(library_name.split(".")) + if function_name not in functions_map: + raise ValueError( + f"Unable to find {function_name} function in {library_name} module." + ) + if len(functions_map[function_name]) != 1: + logger.debug( + f'List of estimator with name "{function_name}": ' + f"{functions_map[function_name]}" + ) + logger.warning( + f"Found {len(functions_map[function_name])} classes for " + f'"{function_name}" estimator name. ' + f"Using first {functions_map[function_name][0]}." + ) + return functions_map[function_name][0] + + +def get_function_args(bench_case: BenchCase, x_train, y_train, x_test, y_test) -> Tuple: + args_map = { + "x_train": x_train, + "y_train": y_train, + "x_test": x_test, + "y_test": y_test, + } + # order format: "arg1|arg2|...|argN" + args_order = get_bench_case_value( + bench_case, "algorithm:args_order", "x_train|y_train" + ) + args = (args_map[arg] for arg in args_order.split("|")) + return args + + +def measure_function_instance(bench_case, function_instance, args: Tuple, kwargs: Dict): + metrics = dict() + metrics["time[ms]"], metrics["time std[ms]"], _ = measure_case( + bench_case, function_instance, *args, **kwargs + ) + return metrics + + +def main(bench_case: BenchCase, filters: List[BenchCase]): + library_name = get_bench_case_value(bench_case, "algorithm:library") + function_name = get_bench_case_value(bench_case, "algorithm:function") + + function_instance = get_function_instance(library_name, function_name) + + # load and transform data + data, data_description = load_data(bench_case) + (x_train, x_test, y_train, y_test), data_description = split_and_transform_data( + bench_case, data, data_description + ) + + # assign special values + assign_case_special_values_on_run( + bench_case, (x_train, y_train, x_test, y_test), data_description + ) + + function_args = get_function_args(bench_case, x_train, y_train, x_test, y_test) + + if not bench_case_filter(bench_case, filters): + logger.warning("Benchmarking case was filtered.") + return list() + + metrics = measure_function_instance( + bench_case, + function_instance, + function_args, + get_bench_case_value(bench_case, "algorithm:kwargs", dict()), + ) + result = { + "task": "utility", + "function": function_name, + } + result = enrich_result(result, bench_case) + # TODO: replace `x_train` data_desc with more informative values + result.update(data_description["x_train"]) + result.update(metrics) + return [result] + + +if __name__ == "__main__": + main_template(main) diff --git a/sklbench/benchmarks/estimator_task_map.json b/sklbench/benchmarks/estimator_task_map.json new file mode 100644 index 000000000..09eb59b55 --- /dev/null +++ b/sklbench/benchmarks/estimator_task_map.json @@ -0,0 +1,20 @@ +{ + "classification": [ + "Classifier", + "LogisticRegression", + "SVC" + ], + "regression": [ + 
"Regressor", + "LinearRegression", + "Ridge", + "Lasso", + "ElasticNet", + "SVR" + ], + "clustering": ["DBSCAN", "KMeans"], + "decomposition": ["PCA"], + "manifold": ["TSNE"], + "search": ["NearestNeighbors"], + "utility": ["BasicStatistics", "Covariance"] +} diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py new file mode 100644 index 000000000..f9c0a75ed --- /dev/null +++ b/sklbench/benchmarks/sklearn_estimator.py @@ -0,0 +1,561 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import inspect +import io +import json +import logging +import os +from importlib.metadata import PackageNotFoundError, version +from typing import Dict, List, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator +from sklearn.metrics import ( + accuracy_score, + balanced_accuracy_score, + completeness_score, + davies_bouldin_score, + homogeneity_score, + log_loss, + mean_squared_error, + r2_score, + roc_auc_score, +) + +from ..datasets import load_data +from ..datasets.transformer import split_and_transform_data +from ..utils.bench_case import get_bench_case_value +from ..utils.common import convert_to_numpy, custom_format, get_module_members +from ..utils.config import bench_case_filter +from ..utils.custom_types import BenchCase, Numeric, NumpyNumeric +from ..utils.logger import logger +from ..utils.measurement import measure_case +from ..utils.special_params import assign_case_special_values_on_run +from .common import enrich_result, main_template + + +def get_estimator(library_name: str, estimator_name: str): + classes_map, _ = get_module_members(library_name.split(".")) + if estimator_name not in classes_map: + raise ValueError( + f"Unable to find {estimator_name} estimator in {library_name} module." + ) + if len(classes_map[estimator_name]) != 1: + logger.debug( + f'List of estimator with name "{estimator_name}": ' + f"{classes_map[estimator_name]}" + ) + logger.warning( + f"Found {len(classes_map[estimator_name])} classes for " + f'"{estimator_name}" estimator name. ' + f"Using first {classes_map[estimator_name][0]}." 
+ ) + estimator = classes_map[estimator_name][0] + if not issubclass(estimator, BaseEstimator): + logger.info(f"{estimator} estimator is not derived from sklearn's BaseEstimator") + return estimator + + +def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]: + # default estimator methods + estimator_methods = { + "training": ["fit"], + "inference": ["predict", "predict_proba", "transform"], + } + for stage in estimator_methods.keys(): + methods = get_bench_case_value( + bench_case, f"algorithm:estimator_methods:{stage}", None + ) + if methods is not None: + estimator_methods[stage] = methods.split("|") + return estimator_methods + + +def estimator_to_task(estimator_name: str) -> str: + """Maps estimator name to machine learning task based on listed estimator postfixes""" + with open( + os.path.join( + os.path.abspath(os.path.dirname(__file__)), "estimator_task_map.json" + ) + ) as map_file: + estimator_to_task_map = json.load(map_file) + for task, postfixes_list in estimator_to_task_map.items(): + if any(map(lambda postfix: estimator_name.endswith(postfix), postfixes_list)): + return task + return "unknown" + + +def get_number_of_classes(estimator_instance, y): + classes = getattr(estimator_instance, "classes_", None) + class_weight = getattr(estimator_instance, "_class_weight", None) + if classes is not None and hasattr(classes, "__len__"): + return len(classes) + elif class_weight is not None and hasattr(class_weight, "__len__"): + return len(class_weight) + else: + return len(np.unique(y)) + + +def get_subset_metrics_of_estimator( + task, stage, estimator_instance, data +) -> Dict[str, float]: + # brute kNN with transfer between training and inference stages + # is required for recall metric calculation of search task + global _brute_knn + + metrics = dict() + # Note: use `x` and `y` when calling estimator methods, + # and `x_compat` and `y_compat` for compatibility with sklearn metrics + x, y = data + x_compat, y_compat = list(map(lambda i: convert_to_numpy(i), data)) + if stage == "training": + if hasattr(estimator_instance, "n_iter_"): + iterations = estimator_instance.n_iter_ + if isinstance(iterations, Union[Numeric, NumpyNumeric].__args__): + metrics.update({"iterations": int(iterations)}) + elif ( + hasattr(iterations, "__len__") + and len(iterations) == 1 + and isinstance(iterations[0], Union[Numeric, NumpyNumeric].__args__) + ): + metrics.update({"iterations": int(iterations[0])}) + if task == "classification": + y_pred = convert_to_numpy(estimator_instance.predict(x)) + metrics.update( + { + "accuracy": float(accuracy_score(y_compat, y_pred)), + "balanced accuracy": float(balanced_accuracy_score(y_compat, y_pred)), + } + ) + if hasattr(estimator_instance, "predict_proba") and not ( + hasattr(estimator_instance, "probability") + and getattr(estimator_instance, "probability") == False + ): + y_pred_proba = convert_to_numpy(estimator_instance.predict_proba(x)) + metrics.update( + { + "ROC AUC": float( + roc_auc_score( + y_compat, + ( + y_pred_proba + if y_pred_proba.shape[1] > 2 + else y_pred_proba[:, 1] + ), + multi_class="ovr", + ) + ), + "logloss": float(log_loss(y_compat, y_pred_proba)), + } + ) + elif task == "regression": + y_pred = convert_to_numpy(estimator_instance.predict(x)) + metrics.update( + { + "RMSE": float(mean_squared_error(y_compat, y_pred) ** 0.5), + "R2": float(r2_score(y_compat, y_pred)), + } + ) + elif task == "decomposition": + if "PCA" in str(estimator_instance): + if hasattr(estimator_instance, "score"): + metrics.update( + {"average 
log-likelihood": float(estimator_instance.score(x))} + ) + if stage == "training" and hasattr( + estimator_instance, "explained_variance_ratio_" + ): + metrics.update( + { + "1st component variance ratio": float( + estimator_instance.explained_variance_ratio_[0] + ) + } + ) + elif task == "clustering": + if hasattr(estimator_instance, "inertia_"): + # compute inertia manually using distances to cluster centers + # provided by KMeans.transform + metrics.update( + { + "inertia": float( + np.power( + convert_to_numpy(estimator_instance.transform(x)).min(axis=1), + 2, + ).sum() + ) + } + ) + if hasattr(estimator_instance, "predict"): + y_pred = convert_to_numpy(estimator_instance.predict(x)) + metrics.update( + { + "Davies-Bouldin score": float(davies_bouldin_score(x_compat, y_pred)), + "homogeneity": float(homogeneity_score(y_compat, y_pred)), + "completeness": float(completeness_score(y_compat, y_pred)), + } + ) + if "DBSCAN" in str(estimator_instance) and stage == "training": + labels = convert_to_numpy(estimator_instance.labels_) + clusters = len(np.unique(labels[labels != -1])) + metrics.update({"clusters": clusters}) + if clusters > 1: + metrics.update( + { + "Davies-Bouldin score": float( + davies_bouldin_score(x_compat, labels) + ) + } + ) + if len(np.unique(y)) < 128: + metrics.update( + { + "homogeneity": ( + float(homogeneity_score(y_compat, labels)) + if clusters > 1 + else 0 + ), + "completeness": ( + float(completeness_score(y_compat, labels)) + if clusters > 1 + else 0 + ), + } + ) + elif task == "manifold": + if hasattr(estimator_instance, "kl_divergence_") and stage == "training": + metrics.update( + {"Kullback-Leibler divergence": float(estimator_instance.kl_divergence_)} + ) + elif task == "search": + if stage == "training": + from sklearn.neighbors import NearestNeighbors + + _brute_knn = NearestNeighbors(algorithm="brute").fit(x_compat) + else: + recall_degree = 10 + ground_truth_neighbors = _brute_knn.kneighbors( + x_compat, recall_degree, return_distance=False + ) + predicted_neighbors = convert_to_numpy( + estimator_instance.kneighbors(x, recall_degree, return_distance=False) + ) + n_relevant = 0 + for i in range(ground_truth_neighbors.shape[0]): + n_relevant += len( + np.intersect1d(ground_truth_neighbors[i], predicted_neighbors[i]) + ) + recall = ( + n_relevant + / ground_truth_neighbors.shape[0] + / ground_truth_neighbors.shape[1] + ) + metrics.update({f"recall@{recall_degree}": recall}) + if ( + hasattr(estimator_instance, "support_vectors_") + and estimator_instance.support_vectors_ is not None + ): + metrics.update({"support vectors": len(estimator_instance.support_vectors_)}) + return metrics + + +def get_context(bench_case: BenchCase): + sklearn_context, sklearnex_context = [ + get_bench_case_value(bench_case, f"algorithm:{library}_context", None) + for library in ["sklearn", "sklearnex"] + ] + if sklearnex_context is not None: + from sklearnex import config_context + + if sklearn_context is not None: + logger.info( + f"Updating sklearnex context {sklearnex_context} " + f"with sklearn context {sklearn_context}" + ) + sklearnex_context.update(sklearn_context) + return config_context, sklearnex_context + elif sklearn_context is not None: + from sklearn import config_context + + return config_context, sklearn_context + else: + from contextlib import nullcontext + + return nullcontext, dict() + + +def sklearnex_logger_is_available() -> bool: + try: + sklex_version = tuple(map(int, version("scikit-learn-intelex").split("."))) + # scikit-learn-intelex packages is 
still signed with build date + return sklex_version > (20230510, 0) + except PackageNotFoundError: + return False + + +def get_sklearnex_logging_stream() -> io.StringIO: + sklex_logger = logging.getLogger("sklearnex") + sklex_logger.setLevel(logging.INFO) + for handler in sklex_logger.handlers.copy(): + sklex_logger.removeHandler(handler) + stream = io.StringIO() + channel = logging.StreamHandler(stream) + formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") + channel.setFormatter(formatter) + sklex_logger.addHandler(channel) + return stream + + +def verify_patching(stream: io.StringIO, function_name) -> bool: + acceleration_lines = 0 + fallback_lines = 0 + logs = stream.getvalue().split("\n")[:-1] + for line in logs: + if function_name in line: + if "running accelerated version on" in line: + acceleration_lines += 1 + if "fallback to original Scikit-learn" in line: + fallback_lines += 1 + return acceleration_lines > 0 and fallback_lines == 0 + + +def create_online_function(method_instance, data_args, batch_size): + n_batches = data_args[0].shape[0] // batch_size + + if "y" in list(inspect.signature(method_instance).parameters): + + def ndarray_function(x, y): + for i in range(n_batches): + method_instance( + x[i * batch_size : (i + 1) * batch_size], + y[i * batch_size : (i + 1) * batch_size], + ) + + def dataframe_function(x, y): + for i in range(n_batches): + method_instance( + x.iloc[i * batch_size : (i + 1) * batch_size], + y.iloc[i * batch_size : (i + 1) * batch_size], + ) + + else: + + def ndarray_function(x): + for i in range(n_batches): + method_instance(x[i * batch_size : (i + 1) * batch_size]) + + def dataframe_function(x): + for i in range(n_batches): + method_instance(x.iloc[i * batch_size : (i + 1) * batch_size]) + + if "ndarray" in str(type(data_args[0])): + return ndarray_function + elif "DataFrame" in str(type(data_args[0])): + return dataframe_function + else: + return f"Unknown {type(data_args[0])} input type for online execution mode" + + +def measure_sklearn_estimator( + bench_case, + task, + estimator_class, + estimator_methods, + estimator_params, + x_train, + x_test, + y_train, + y_test, +): + enable_modelbuilders = get_bench_case_value( + bench_case, "algorithm:enable_modelbuilders", False + ) + ensure_sklearnex_patching = get_bench_case_value( + bench_case, "bench:ensure_sklearnex_patching", True + ) + ensure_sklearnex_patching = ( + ensure_sklearnex_patching + and sklearnex_logger_is_available() + and ( + estimator_class.__module__.startswith("daal4py") + or estimator_class.__module__.startswith("sklearnex") + ) + ) + sklearnex_logging_stream = get_sklearnex_logging_stream() + + metrics = dict() + estimator_instance = estimator_class(**estimator_params) + for stage in estimator_methods.keys(): + for method in estimator_methods[stage]: + if hasattr(estimator_instance, method): + method_instance = getattr(estimator_instance, method) + if "y" in list(inspect.signature(method_instance).parameters): + if stage == "training": + data_args = (x_train, y_train) + else: + data_args = (x_test, y_test) + else: + if stage == "training": + data_args = (x_train,) + else: + data_args = (x_test,) + batch_size = get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) + if batch_size is not None: + method_instance = create_online_function( + method_instance, data_args, batch_size + ) + # daal4py model builders enabling branch + if enable_modelbuilders and stage == "inference": + import daal4py + + daal_model = daal4py.mb.convert_model( + 
estimator_instance.get_booster() + ) + method_instance = getattr(daal_model, method) + + metrics[method] = dict() + ( + metrics[method]["time[ms]"], + metrics[method]["time std[ms]"], + _, + ) = measure_case(bench_case, method_instance, *data_args) + if batch_size is not None: + metrics[method]["throughput[samples/ms]"] = ( + (data_args[0].shape[0] // batch_size) * batch_size + ) / metrics[method]["time[ms]"] + if ensure_sklearnex_patching: + full_method_name = f"{estimator_class.__name__}.{method}" + sklearnex_logging_stream.seek(0) + method_is_patched = verify_patching( + sklearnex_logging_stream, full_method_name + ) + if not method_is_patched: + logger.warning( + f"{full_method_name} was not patched by sklearnex." + ) + + quality_metrics = { + "training": get_subset_metrics_of_estimator( + task, "training", estimator_instance, (x_train, y_train) + ), + "inference": get_subset_metrics_of_estimator( + task, "inference", estimator_instance, (x_test, y_test) + ), + } + for method in metrics.keys(): + for stage in estimator_methods.keys(): + if method in estimator_methods[stage]: + metrics[method].update(quality_metrics[stage]) + + return metrics, estimator_instance + + +def main(bench_case: BenchCase, filters: List[BenchCase]): + # get estimator class and ML task + library_name = get_bench_case_value(bench_case, "algorithm:library") + estimator_name = get_bench_case_value(bench_case, "algorithm:estimator") + + estimator_class = get_estimator(library_name, estimator_name) + task = estimator_to_task(estimator_name) + + # load and transform data + data, data_description = load_data(bench_case) + (x_train, x_test, y_train, y_test), data_description = split_and_transform_data( + bench_case, data, data_description + ) + + # assign special values + assign_case_special_values_on_run( + bench_case, (x_train, y_train, x_test, y_test), data_description + ) + + # get estimator parameters + estimator_params = get_bench_case_value( + bench_case, "algorithm:estimator_params", dict() + ) + + # get estimator methods for measurement + estimator_methods = get_estimator_methods(bench_case) + + # benchmark case filtering + if not bench_case_filter(bench_case, filters): + logger.warning("Benchmarking case was filtered.") + return list() + + # run estimator methods + context_class, context_params = get_context(bench_case) + with context_class(**context_params): + metrics, estimator_instance = measure_sklearn_estimator( + bench_case, + task, + estimator_class, + estimator_methods, + estimator_params, + x_train, + x_test, + y_train, + y_test, + ) + + result_template = { + "task": task, + "estimator": estimator_name, + } + result_template = enrich_result(result_template, bench_case) + if "assume_finite" in context_params: + result_template["assume_finite"] = context_params["assume_finite"] + if hasattr(estimator_instance, "get_params"): + estimator_params = estimator_instance.get_params() + # note: "handle" is not JSON-serializable + if "handle" in estimator_params: + del estimator_params["handle"] + logger.debug(f"Estimator parameters:\n{custom_format(estimator_params)}") + result_template.update(estimator_params) + + data_descs = { + "training": data_description["x_train"], + "inference": data_description["x_test"], + } + for stage in estimator_methods.keys(): + data_descs[stage].update( + { + "batch_size": get_bench_case_value( + bench_case, f"algorithm:batch_size:{stage}" + ) + } + ) + if "n_classes" in data_description: + data_descs[stage].update({"n_classes": data_description["n_classes"]}) + + results = 
list() + for method in metrics.keys(): + result = result_template.copy() + for stage in estimator_methods.keys(): + if method in estimator_methods[stage]: + result.update({"stage": stage, "method": method}) + result.update(data_descs[stage]) + result.update(metrics[method]) + results.append(result) + + return results + + +if __name__ == "__main__": + main_template(main) diff --git a/sklbench/datasets/README.md b/sklbench/datasets/README.md new file mode 100644 index 000000000..7f7cf9c28 --- /dev/null +++ b/sklbench/datasets/README.md @@ -0,0 +1,36 @@ +# Data Handling in Benchmarks + +Data handling steps: +1. Load data: + - If not cached: download/generate dataset and put it in raw and/or usual cache + - If cached: load from cached files +2. Split data into subsets if requested +3. Convert to requested form (data type, format, order, etc.) + +There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files applicable for fast-loading in benchmarks. + +Each dataset has few associated files in usual `cache`: data component files (`x`, `y`, `weights`, etc.) and JSON file with dataset properties (number of classes, clusters, default split arguments). +For example: +``` +data_cache/ +... +β”œβ”€β”€ mnist.json +β”œβ”€β”€ mnist_x.parq +β”œβ”€β”€ mnist_y.npz +... +``` + +Cached file formats: +| Format | File extension | Associated Python types | +| --- | --- | --- | +| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame | +| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series | +| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix | + +Existing data sources: + - Synthetic data from sklearn + - OpenML datasets + - Custom loaders for named datasets + +--- +[Documentation tree](../../README.md#-documentation) diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py new file mode 100644 index 000000000..be20420ec --- /dev/null +++ b/sklbench/datasets/__init__.py @@ -0,0 +1,77 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
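To make the cache layout described above concrete, the cached components can be read back directly with pandas and numpy. This is a minimal sketch assuming the default `data_cache` directory and the `mnist` example files listed in the tree above; the `arr_0` key matches how the loader saves uncompressed `.npz` files.

```python
# Minimal sketch: reading cached dataset components directly
# (file formats per the table above; paths assume the default cache directory).
import numpy as np
import pandas as pd

x = pd.read_parquet("data_cache/mnist_x.parq")   # Parquet -> pandas.DataFrame
y = np.load("data_cache/mnist_y.npz")["arr_0"]   # uncompressed .npz -> numpy.ndarray
print(x.shape, y.shape)
```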
+# =============================================================================== + +import os +from typing import Dict, Tuple + +from ..utils.bench_case import get_bench_case_value, get_data_name +from ..utils.common import custom_format +from ..utils.custom_types import BenchCase +from .loaders import ( + dataset_loading_functions, + load_openml_data, + load_sklearn_synthetic_data, +) + + +def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]: + # get data name and cache dirs + data_name = get_data_name(bench_case, shortened=False) + data_cache = get_bench_case_value(bench_case, "data:cache_directory", "data_cache") + raw_data_cache = get_bench_case_value( + bench_case, "data:raw_cache_directory", os.path.join(data_cache, "raw") + ) + common_kwargs = { + "data_name": data_name, + "data_cache": data_cache, + "raw_data_cache": raw_data_cache, + } + preproc_kwargs = get_bench_case_value(bench_case, "data:preprocessing_kwargs", dict()) + # make cache directories + os.makedirs(data_cache, exist_ok=True) + os.makedirs(raw_data_cache, exist_ok=True) + # load by dataset name + dataset = get_bench_case_value(bench_case, "data:dataset") + if dataset is not None: + dataset_params = get_bench_case_value(bench_case, "data:dataset_kwargs", dict()) + return dataset_loading_functions[dataset]( + **common_kwargs, preproc_kwargs=preproc_kwargs, dataset_params=dataset_params + ) + # load by source + source = get_bench_case_value(bench_case, "data:source") + if source is not None: + # sklearn.datasets functions + if source.startswith("make_"): + generation_kwargs = get_bench_case_value( + bench_case, "data:generation_kwargs", dict() + ) + return load_sklearn_synthetic_data( + function_name=source, + input_kwargs=generation_kwargs, + preproc_kwargs=preproc_kwargs, + **common_kwargs, + ) + # openml dataset + elif source == "fetch_openml": + openml_id = get_bench_case_value(bench_case, "data:id") + return load_openml_data( + openml_id=openml_id, preproc_kwargs=preproc_kwargs, **common_kwargs + ) + + raise ValueError( + "Unable to get data from bench_case:\n" + f'{custom_format(get_bench_case_value(bench_case, "data"))}' + ) diff --git a/sklbench/datasets/common.py b/sklbench/datasets/common.py new file mode 100644 index 000000000..e7ed01602 --- /dev/null +++ b/sklbench/datasets/common.py @@ -0,0 +1,239 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
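The three branches of `load_data` above correspond to three ways of describing data in a benchmark case. The snippets below are illustrative `data` sections only: the key names come from the `get_bench_case_value` calls, while the nesting of colon-separated paths and the concrete values are assumptions.

```python
# Illustrative "data" sections for the three load_data branches (values are examples).
named_dataset = {"dataset": "mnist"}  # dispatched through dataset_loading_functions
synthetic = {
    "source": "make_classification",  # any of the registered sklearn make_* generators
    "generation_kwargs": {"n_samples": 100000, "n_features": 50, "n_classes": 2},
}
openml = {"source": "fetch_openml", "id": 1430}  # fetched via load_openml_data
```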
+# =============================================================================== + +import json +import os +import re +from typing import Dict, List, Union + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder + +from ..utils.custom_types import Array +from ..utils.logger import logger + +# NB: non-registered data components and extensions will not be found by loader +KNOWN_DATA_COMPONENTS = ["x", "y"] +KNOWN_DATA_EXTENSIONS = ["parq", "npz", "csr.npz"] + + +def get_expr_by_prefix(prefix: str) -> str: + def get_or_expr_from_list(a: List[str]) -> str: + # transforms list to OR expression: "['x', 'y']" -> "x|y" + return str(a)[1:-1].replace("'", "").replace(", ", "|") + + data_comp_expr = get_or_expr_from_list(KNOWN_DATA_COMPONENTS) + data_ext_expr = get_or_expr_from_list(KNOWN_DATA_EXTENSIONS) + + return f"{prefix}_({data_comp_expr}).({data_ext_expr})" + + +def get_filenames_by_prefix(directory: str, prefix: str) -> List[str]: + assert os.path.isdir(directory) + prefix_expr = get_expr_by_prefix(prefix) + return list( + filter(lambda x: re.search(prefix_expr, x) is not None, os.listdir(directory)) + ) + + +def load_data_file(filepath, extension): + if extension == "parq": + data = pd.read_parquet(filepath) + elif extension.endswith("npz"): + npz_content = np.load(filepath) + if extension == "npz": + data = npz_content["arr_0"] + elif extension == "csr.npz": + data = csr_matrix( + tuple(npz_content[attr] for attr in ["data", "indices", "indptr"]) + ) + else: + raise ValueError(f'Unknown npz subextension "{extension}"') + npz_content.close() + else: + raise ValueError(f'Unknown extension "{extension}"') + return data + + +def load_data_from_cache(data_cache: str, data_name: str) -> Dict: + # data filename format: + # {data_name}_{data_component}.{file_ext} + data_filenames = get_filenames_by_prefix(data_cache, data_name) + data = dict() + for data_filename in data_filenames: + if data_filename.endswith(".json"): + continue + postfix = data_filename.replace(data_name, "")[1:] + component, file_ext = postfix.split(".", 1) + data[component] = load_data_file( + os.path.join(data_cache, data_filename), file_ext + ) + return data + + +def save_data_to_cache(data: Dict, data_cache: str, data_name: str): + for component_name, data_compoment in data.items(): + component_filepath = os.path.join(data_cache, f"{data_name}_{component_name}") + # convert 2d numpy array to pandas DataFrame for better caching + if isinstance(data_compoment, np.ndarray) and data_compoment.ndim == 2: + data_compoment = pd.DataFrame(data_compoment) + # branching by data type for saving to cache + if isinstance(data_compoment, pd.DataFrame): + component_filepath += ".parq" + data_compoment.columns = [ + column if isinstance(column, str) else str(column) + for column in list(data_compoment.columns) + ] + data_compoment.to_parquet( + component_filepath, engine="fastparquet", compression="snappy" + ) + elif isinstance(data_compoment, csr_matrix): + component_filepath += ".csr.npz" + np.savez( + component_filepath, + **{ + attr: getattr(data_compoment, attr) + for attr in ["data", "indices", "indptr"] + }, + ) + elif isinstance(data_compoment, pd.Series): + component_filepath += ".npz" + np.savez(component_filepath, data_compoment.to_numpy()) + elif isinstance(data_compoment, np.ndarray): + component_filepath += ".npz" + np.savez(component_filepath, data_compoment) + + +def 
load_data_description(data_cache: str, data_name: str) -> Dict: + with open(os.path.join(data_cache, f"{data_name}.json"), "r") as desc_file: + data_desc = json.load(desc_file) + return data_desc + + +def save_data_description(data_desc: Dict, data_cache: str, data_name: str): + with open(os.path.join(data_cache, f"{data_name}.json"), "w") as desc_file: + json.dump(data_desc, desc_file) + + +def cache(function): + def cache_wrapper(**kwargs): + data_name = kwargs["data_name"] + data_cache = kwargs["data_cache"] + if len(get_filenames_by_prefix(data_cache, data_name)) > 0: + logger.info(f'Loading "{data_name}" dataset from cache files') + data = load_data_from_cache(data_cache, data_name) + data_desc = load_data_description(data_cache, data_name) + else: + logger.info(f'Loading "{data_name}" dataset from scratch') + data, data_desc = function(**kwargs) + save_data_to_cache(data, data_cache, data_name) + save_data_description(data_desc, data_cache, data_name) + return data, data_desc + + return cache_wrapper + + +def preprocess_data( + data_dict: List[Dict[str, Array]], + subsample: Union[float, int, None] = None, + **kwargs, +) -> List[Dict[str, Array]]: + """Preprocessing function applied for all data arguments.""" + if subsample is not None: + for data_name, data in data_dict.items(): + data_dict[data_name] = train_test_split( + data, train_size=subsample, random_state=42, shuffle=True + )[0] + return data_dict + + +def preprocess_x( + x: Array, + replace_nan="auto", + category_encoding="ordinal", + normalize=False, + force_for_sparse=True, + **kwargs, +) -> Array: + """Preprocessing function applied only for `x` data argument.""" + return_type = type(x) + if force_for_sparse and isinstance(x, csr_matrix): + x = x.toarray() + if isinstance(x, np.ndarray): + x = pd.DataFrame(x) + if not isinstance(x, pd.DataFrame): + logger.warning( + "Preprocessing is supported only for pandas DataFrames " + f"and numpy ndarray. Got {type(x)} instead." + ) + return x + # NaN values replacement + if x.isna().any().any(): + nan_columns = x.columns[x.isna().any(axis=0)] + nan_df = x[nan_columns] + if replace_nan == "auto": + replace_nan = "median" + logger.debug(f'Changing "replace_nan" from "auto" to "{replace_nan}".') + if replace_nan == "median": + nan_df = nan_df.fillna(nan_df.median()) + elif replace_nan == "mean": + nan_df = nan_df.fillna(nan_df.mean()) + elif replace_nan == "ignore": + pass + else: + logger.warning(f'Unknown "{replace_nan}" replace nan type.') + x[nan_columns] = nan_df + # Categorical features transformation + categ_columns = x.columns[(x.dtypes == "category") + (x.dtypes == "object")] + if len(categ_columns) > 0: + if category_encoding == "onehot": + prev_n_columns = x.shape[1] + x = pd.get_dummies(x, columns=list(categ_columns)) + logger.debug( + f"OneHotEncoder extended {prev_n_columns} columns to {x.shape[1]}." 
+ ) + elif category_encoding == "ordinal": + encoder = OrdinalEncoder() + encoder.set_output(transform="pandas") + ordinal_df = encoder.fit_transform(x[categ_columns]) + x = x.drop(columns=categ_columns).join(ordinal_df) + elif category_encoding == "drop": + x = x.drop(columns=categ_columns) + elif category_encoding == "ignore": + pass + else: + logger.warning(f'Unknown "{category_encoding}" category encoding type.') + # Mean-Standard normalization + if normalize: + x = (x - x.mean()) / x.std() + if return_type == np.ndarray: + return x.values + else: + return x + + +def preprocess(function): + def preprocess_wrapper(**kwargs): + preproc_kwargs = kwargs.pop("preproc_kwargs", dict()) + data, data_desc = function(**kwargs) + data = preprocess_data(data, **preproc_kwargs) + data["x"] = preprocess_x(data["x"], **preproc_kwargs) + return data, data_desc + + return preprocess_wrapper diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py new file mode 100644 index 000000000..fc1fa5e63 --- /dev/null +++ b/sklbench/datasets/downloaders.py @@ -0,0 +1,121 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import os +from typing import Callable, List, Union + +import numpy as np +import pandas as pd +import requests +from scipy.sparse import csr_matrix +from sklearn.datasets import fetch_openml + +try: + import kaggle + + kaggle_is_imported = True +except (ImportError, OSError, ValueError): + kaggle_is_imported = False + + +def retrieve(url: str, filename: str) -> None: + if os.path.isfile(filename): + return + elif url.startswith("http"): + response = requests.get(url, stream=True) + if response.status_code != 200: + raise AssertionError( + f"Failed to download from {url}.\n" + f"Response returned status code {response.status_code}" + ) + total_size = int(response.headers.get("content-length", 0)) + block_size = 8192 + n = 0 + with open(filename, "wb+") as datafile: + for data in response.iter_content(block_size): + n += len(data) / 1024 + datafile.write(data) + if total_size != 0 and n != total_size / 1024: + raise AssertionError("Some content was present but not downloaded/written") + + +def fetch_and_correct_openml( + data_id: int, raw_data_cache_dir: str, as_frame: str = "auto" +): + x, y = fetch_openml( + data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir + ) + if ( + isinstance(x, csr_matrix) + or isinstance(x, pd.DataFrame) + or isinstance(x, np.ndarray) + ): + pass + else: + raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml') + if isinstance(y, pd.Series): + # label transforms to cat.codes if it is passed as categorical series + if isinstance(y.dtype, pd.CategoricalDtype): + y = y.cat.codes + y = y.values + elif isinstance(y, np.ndarray): + pass + else: + raise ValueError(f'Unknown "{type(y)}" y type was 
returned from fetch_openml') + return x, y + + +def load_openml( + data_id: int, + raw_data_cache_dir: str, + transform_x_y_func: Union[Callable, None] = None, + as_frame: str = "auto", +): + x, y = fetch_and_correct_openml(data_id, raw_data_cache_dir, as_frame) + if transform_x_y_func is not None: + x, y = transform_x_y_func(x, y) + return x, y + + +def download_and_read_csv(url: str, raw_data_cache_dir: str, **reading_kwargs): + local_path = os.path.join(raw_data_cache_dir, os.path.basename(url)) + retrieve(url, local_path) + data = pd.read_csv(local_path, **reading_kwargs) + return data + + +def download_kaggle_files( + kaggle_type: str, kaggle_name: str, filenames: List[str], raw_data_cache_dir: str +): + if not kaggle_is_imported: + raise ValueError("Kaggle API is not available.") + api = kaggle.KaggleApi() + api.authenticate() + + if kaggle_type == "competition": + download_method = api.competition_download_file + elif kaggle_type == "dataset": + download_method = api.dataset_download_file + else: + raise ValueError( + f"Unknown {kaggle_type} type for " '"download_kaggle_files" function.' + ) + + output_file_paths = {} + for filename in filenames: + download_method(kaggle_name, filename, raw_data_cache_dir) + output_file_paths[filename] = os.path.join(raw_data_cache_dir, filename) + return output_file_paths diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py new file mode 100644 index 000000000..0cc915f05 --- /dev/null +++ b/sklbench/datasets/loaders.py @@ -0,0 +1,854 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
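A small usage sketch for the OpenML helper above, mirroring how the named-dataset loaders later in this patch pass a label-fixing callback; the data id and cache path are only examples.

```python
# Sketch: fetching an OpenML dataset through load_openml() and remapping -1 labels
# to 0, the same pattern the a9a/ijcnn loaders use further below.
from sklbench.datasets.downloaders import load_openml


def transform_x_y(x, y):
    y[y == -1] = 0
    return x, y


x, y = load_openml(1430, "data_cache/raw", transform_x_y_func=transform_x_y)
print(type(x), y.shape)
```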
+# =============================================================================== +import os +from typing import Dict, Tuple + +import numpy as np +import pandas as pd +from scipy import sparse +from sklearn.datasets import ( + fetch_california_housing, + fetch_covtype, + load_digits, + load_svmlight_file, + make_blobs, + make_classification, + make_regression, +) + +from .common import cache, preprocess +from .downloaders import ( + download_and_read_csv, + download_kaggle_files, + load_openml, + retrieve, +) + + +@preprocess +@cache +def load_openml_data( + openml_id: int, data_name: str, data_cache: str, raw_data_cache: str +) -> Tuple[Dict, Dict]: + x, y = load_openml(openml_id, raw_data_cache) + data_desc = dict() + unique_labels = dict(pd.value_counts(y)) + if len(unique_labels) < 32 and all(map(lambda x: x > 4, unique_labels.values())): + data_desc["n_classes"] = len(unique_labels) + return {"x": x, "y": y}, data_desc + + +@preprocess +@cache +def load_sklearn_synthetic_data( + function_name: str, + input_kwargs: Dict, + data_name: str, + data_cache: str, + raw_data_cache: str, +) -> Tuple[Dict, Dict]: + functions_map = { + "make_classification": make_classification, + "make_regression": make_regression, + "make_blobs": make_blobs, + } + generation_kwargs = {"random_state": 42} + generation_kwargs.update(input_kwargs) + + if function_name not in functions_map: + raise ValueError( + f"Unknown {function_name} function " "for synthetic data generation" + ) + x, y = functions_map[function_name](**generation_kwargs) + data_desc = dict() + if function_name == "make_classification": + data_desc["n_classes"] = generation_kwargs["n_classes"] + data_desc["n_clusters_per_class"] = generation_kwargs.get( + "n_clusters_per_class", 2 + ) + if function_name == "make_blobs": + data_desc["n_clusters"] = generation_kwargs["centers"] + return {"x": x, "y": y}, data_desc + + +""" +Classification datasets +""" + + +@cache +def load_airline_depdelay( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Airline dataset + http://kt.ijs.si/elena_ikonomovska/data.html + + Classification task. n_classes = 2. 
+ """ + url = "http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2" + + ordered_columns = [ + "Year", + "Month", + "DayofMonth", + "DayofWeek", + "CRSDepTime", + "CRSArrTime", + "UniqueCarrier", + "FlightNum", + "ActualElapsedTime", + "Origin", + "Dest", + "Distance", + "Diverted", + "ArrDelay", + ] + categorical_int_columns = ["Year", "Month", "DayofMonth", "DayofWeek"] + continuous_int_columns = [ + "CRSDepTime", + "CRSArrTime", + "FlightNum", + "ActualElapsedTime", + "Distance", + "Diverted", + "ArrDelay", + ] + column_dtypes = { + col: np.int16 for col in categorical_int_columns + continuous_int_columns + } + + df = download_and_read_csv( + url, raw_data_cache, names=ordered_columns, dtype=column_dtypes + ) + + for col in df.select_dtypes(["object"]).columns: + df[col] = df[col].astype("category") + + task = dataset_params.get("task", "classification") + if task == "classification": + df["ArrDelay"] = (df["ArrDelay"] > 0).astype(int) + elif task == "regression": + pass + else: + raise ValueError(f'Unknown "{task}" task type for airline dataset.') + + y = df["ArrDelay"].to_numpy(dtype=np.float32) + x = df.drop(columns=["ArrDelay"]) + + data_description = { + "n_classes": 2, + "default_split": {"test_size": 0.2, "random_state": 42}, + } + return {"x": x, "y": y}, data_description + + +@cache +def load_bosch( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + data_filename = "train_numeric.csv.zip" + + data_path = download_kaggle_files( + "competition", + "bosch-production-line-performance", + [data_filename], + raw_data_cache, + )[data_filename] + + data = pd.read_csv(data_path, index_col=0, compression="zip", dtype=np.float32) + y = data.iloc[:, -1].to_numpy(dtype=np.float32) + x = data.drop(labels=[data.columns[-1]], axis=1) + + data_desc = {"default_split": {"test_size": 0.2, "random_state": 77}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_hepmass( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + HEPMASS dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HEPMASS. + + Classification task. n_classes = 2. + """ + url_train = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz" + ) + url_test = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz" + ) + + dtype = np.float32 + train_data = download_and_read_csv( + url_train, raw_data_cache, delimiter=",", compression="gzip", dtype=dtype + ) + test_data = download_and_read_csv( + url_test, raw_data_cache, delimiter=",", compression="gzip", dtype=dtype + ) + + data = pd.concat([train_data, test_data]) + label = data.columns[0] + y = data[label] + x = data.drop(columns=[label]) + + data_desc = { + "n_classes": 2, + "default_split": { + "train_size": train_data.shape[0], + "test_size": test_data.shape[0], + "shuffle": False, + }, + } + return {"x": x, "y": y}, data_desc + + +def load_higgs_susy_subsample( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + if data_name == "susy": + """ + SUSY dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/SUSY + + Classification task. n_classes = 2. 
+ """ + url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz" + ) + train_size, test_size = 4500000, 500000 + elif data_name == "higgs": + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Classification task. n_classes = 2. + """ + url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + ) + train_size, test_size = 10000000, 1000000 + else: + raise ValueError( + f"Unknown dataset name {data_name} " + 'for "load_higgs_susy_subsample" function' + ) + + data = download_and_read_csv( + url, raw_data_cache, delimiter=",", header=None, compression="gzip" + ) + assert data.shape[0] == train_size + test_size, "Wrong number of samples was loaded" + x, y = data[data.columns[1:]], data[data.columns[0]] + + data_desc = { + "n_classes": 2, + "default_split": { + "train_size": train_size, + "test_size": test_size, + "shuffle": False, + }, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_higgs(**kwargs) -> Tuple[Dict, Dict]: + return load_higgs_susy_subsample(**kwargs) + + +@cache +def load_susy(**kwargs) -> Tuple[Dict, Dict]: + return load_higgs_susy_subsample(**kwargs) + + +@cache +def load_letters( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Letter Recognition dataset from UCI machine learning repository + http://archive.ics.uci.edu/ml/datasets/Letter+Recognition + + Classification task. n_classes = 26. + """ + url = ( + "http://archive.ics.uci.edu/ml/machine-learning-databases/" + "letter-recognition/letter-recognition.data" + ) + data = download_and_read_csv(url, raw_data_cache, header=None, dtype=None) + x, y = data.iloc[:, 1:], data.iloc[:, 0].astype("category").cat.codes.values + + data_desc = {"n_classes": 26, "default_split": {"test_size": 0.2, "random_state": 0}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_sklearn_digits( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_digits(return_X_y=True) + data_desc = { + "n_classes": 10, + "default_split": {"train_size": 0.2, "random_state": 42}, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_covtype( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Cover type dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/covertype + + y contains 7 unique class labels from 1 to 7 inclusive. + Classification task. n_classes = 7. + """ + x, y = fetch_covtype(return_X_y=True, data_home=raw_data_cache) + y = y.astype(int) - 1 + binary = dataset_params.get("binary", False) + if binary: + y = (y > 2).astype(int) + + data_desc = { + "n_classes": 2 if binary else 7, + "default_split": {"test_size": 0.2, "random_state": 77}, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_epsilon( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
+ """ + url_train = ( + "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary" + "/epsilon_normalized.bz2" + ) + url_test = ( + "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary" + "/epsilon_normalized.t.bz2" + ) + local_url_train = os.path.join(raw_data_cache, os.path.basename(url_train)) + local_url_test = os.path.join(raw_data_cache, os.path.basename(url_test)) + + retrieve(url_train, local_url_train) + retrieve(url_test, local_url_test) + + x_train, y_train = load_svmlight_file(local_url_train, dtype=np.float32) + x_test, y_test = load_svmlight_file(local_url_test, dtype=np.float32) + + x = sparse.vstack([x_train, x_test]) + y = np.hstack([y_train, y_test]) + y[y <= 0] = 0 + + data_desc = { + "n_classes": 2, + "default_split": { + "train_size": y_train.shape[0], + "test_size": y_test.shape[0], + "shuffle": False, + }, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_gisette( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + GISETTE is a handwritten digit recognition problem. + The problem is to separate the highly confusable digits '4' and '9'. + This dataset is one of five datasets of the NIPS 2003 feature selection challenge. + + Classification task. n_classes = 2. + """ + + def convert_x(x, n_samples, n_features): + x_out = x.iloc[:n_samples].values + x_out = pd.DataFrame( + np.array( + [ + np.fromstring(elem[0], dtype=int, count=n_features, sep=" ") + for elem in x_out + ] + ) + ) + return x_out.values + + def convert_y(y, n_samples): + y_out = y.iloc[:n_samples].values.astype(int) + y_out = pd.DataFrame((y_out > 0).astype(int)) + return y_out.values.reshape(-1) + + url_prefix = "http://archive.ics.uci.edu/ml/machine-learning-databases" + data_urls = { + "x_train": f"{url_prefix}/gisette/GISETTE/gisette_train.data", + "x_test": f"{url_prefix}/gisette/GISETTE/gisette_valid.data", + "y_train": f"{url_prefix}/gisette/GISETTE/gisette_train.labels", + "y_test": f"{url_prefix}/gisette/gisette_valid.labels", + } + data = {} + for subset_name, subset_url in data_urls.items(): + data[subset_name] = download_and_read_csv(subset_url, raw_data_cache, header=None) + + n_columns, train_size, test_size = 5000, 6000, 1000 + + x_train = convert_x(data["x_train"], train_size, n_columns) + x_test = convert_x(data["x_test"], test_size, n_columns) + y_train = convert_y(data["y_train"], train_size) + y_test = convert_y(data["y_test"], test_size) + + x = np.vstack([x_train, x_test]) + y = np.hstack([y_train, y_test]) + + data_desc = { + "n_classes": 2, + "default_split": { + "train_size": y_train.shape[0], + "test_size": y_test.shape[0], + "shuffle": False, + }, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_a9a( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + def transform_x_y(x, y): + y[y == -1] = 0 + return x, y + + x, y = load_openml(1430, raw_data_cache, transform_x_y) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_codrnanorm( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + def transform_x_y(x, y): + x = pd.DataFrame(x.todense()) + y = y.astype("int") + y[y == -1] = 0 + return x, y + + x, y = load_openml(1241, raw_data_cache, transform_x_y_func=transform_x_y) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, 
data_desc + + +@cache +def load_creditcard( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(1597, raw_data_cache) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.1, "random_state": 777}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_fraud( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(42175, raw_data_cache) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 77}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_ijcnn( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Author: Danil Prokhorov. + libSVM,AAD group + Cite: Danil Prokhorov. IJCNN 2001 neural network competition. + Slide presentation in IJCNN'01, + Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. + + Classification task. n_classes = 2. + """ + + def transform_x_y(x, y): + y[y == -1] = 0 + return x, y + + x, y = load_openml(1575, raw_data_cache, transform_x_y) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_klaverjas( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Abstract: + Klaverjas is an example of the Jack-Nine card games, + which are characterized as trick-taking games where the the Jack + and nine of the trump suit are the highest-ranking trumps, and + the tens and aces of other suits are the most valuable cards + of these suits. It is played by four players in two teams. + + Task Information: + Classification task. n_classes = 2. + """ + x, y = load_openml(41228, raw_data_cache) + data_desc = {"n_classes": 2, "default_split": {"train_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_skin_segmentation( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Abstract: + The Skin Segmentation dataset is constructed over B, G, R color space. + Skin and Nonskin dataset is generated using skin textures from + face images of diversity of age, gender, and race people. + Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. + + Classification task. n_classes = 2. + """ + + def transform_x_y(x, y): + y = y.astype(int) + y[y == 2] = 0 + return x, y + + x, y = load_openml(1502, raw_data_cache, transform_x_y) + data_desc = {"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_cifar( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Source: + University of Toronto + Collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton + https://www.cs.toronto.edu/~kriz/cifar.html + + Classification task. n_classes = 10. 
+ """ + x, y = load_openml(40927, raw_data_cache) + binary = dataset_params.get("binary", False) + if binary: + y = (y > 0).astype(int) + data_desc = { + "n_classes": 2 if binary else 10, + "default_split": {"test_size": 1 / 6, "random_state": 42}, + } + return {"x": x, "y": y}, data_desc + + +@cache +def load_connect( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Source: + UC Irvine Machine Learning Repository + http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm + + Classification task. n_classes = 3. + """ + x, y = load_openml(1591, raw_data_cache) + y = (y + 1).astype("int") + data_desc = {"n_classes": 3, "default_split": {"test_size": 0.1, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_covertype( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Abstract: This is the original version of the famous + covertype dataset in ARFF format. + Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson + Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) + + Classification task. n_classes = 7. + """ + x, y = load_openml(1596, raw_data_cache) + data_desc = {"n_classes": 7, "default_split": {"test_size": 0.4, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +def load_mnist_template( + openml_id: int, + raw_data_cache: str, +) -> Tuple[Dict, Dict]: + def transform_x_y(x, y): + return x.astype("uint8"), y.astype("uint8") + + x, y = load_openml(openml_id, raw_data_cache, transform_x_y) + data_desc = {"n_classes": 10, "default_split": {"test_size": 10000, "shuffle": False}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_mnist( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Abstract: + The MNIST database of handwritten digits with 784 features. + It can be split in a training set of the first 60,000 examples, + and a test set of 10,000 examples + Source: + Yann LeCun, Corinna Cortes, Christopher J.C. Burges + http://yann.lecun.com/exdb/mnist/ + + Classification task. n_classes = 10. + """ + return load_mnist_template(554, raw_data_cache) + + +@cache +def load_fashion_mnist( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + return load_mnist_template(40996, raw_data_cache) + + +@cache +def load_svhn( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + return load_mnist_template(41081, raw_data_cache) + + +@cache +def load_sensit( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + Abstract: Vehicle classification in distributed sensor networks. + Author: M. Duarte, Y. H. Hu + Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) + + Classification task. n_classes = 3. 
+ """ + x, y = load_openml(1593, raw_data_cache) + data_desc = {"n_classes": 3, "default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +""" +Regression datasets +""" + + +@cache +def load_abalone( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + """ + https://archive.ics.uci.edu/ml/machine-learning-databases/abalone + + """ + url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" + data = download_and_read_csv(url, raw_data_cache, header=None) + data[0] = data[0].astype("category").cat.codes + x, y = data.iloc[:, :-1], data.iloc[:, -1].values + + data_desc = {"default_split": {"test_size": 0.2, "random_state": 0}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_california_housing( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = fetch_california_housing( + return_X_y=True, as_frame=False, data_home=raw_data_cache + ) + data_desc = {"default_split": {"test_size": 0.1, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_fried( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(564, raw_data_cache) + data_desc = {"default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_medical_charges_nominal( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(42559, raw_data_cache) + + data_desc = {"default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_twodplanes( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(1197, raw_data_cache) + data_desc = {"default_split": {"test_size": 0.4, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_year_prediction_msd( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00203/" + "YearPredictionMSD.txt.zip" + ) + data = download_and_read_csv(url, raw_data_cache, header=None) + x, y = data.iloc[:, 1:], data.iloc[:, 0] + data_desc = {"default_split": {"test_size": 0.1, "shuffle": False}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_yolanda( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + x, y = load_openml(42705, raw_data_cache) + data_desc = {"default_split": {"test_size": 0.2, "random_state": 42}} + return {"x": x, "y": y}, data_desc + + +@cache +def load_road_network( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt" + n_samples, dtype = 20000, np.float32 + data = download_and_read_csv(url, raw_data_cache, dtype=dtype) + x, y = data.values[:, 1:], data.values[:, 0] + data_desc = { + "default_split": { + "train_size": n_samples, + "test_size": n_samples, + "shuffle": False, + } + } + return {"x": x, "y": y}, data_desc + + +""" +Index/neighbors search datasets +""" + + +def load_ann_dataset_template(url, raw_data_cache): + import h5py + + local_path = os.path.join(raw_data_cache, os.path.basename(url)) + retrieve(url, local_path) + with 
h5py.File(local_path, "r") as f: + x_train = np.asarray(f["train"]) + x_test = np.asarray(f["test"]) + x = np.concatenate([x_train, x_test], axis=0) + data_desc = { + "default_split": { + "train_size": x_train.shape[0], + "test_size": x_test.shape[0], + } + } + del x_train, x_test + # TODO: remove placeholding zeroed y + y = np.zeros((x.shape[0],)) + return {"x": x, "y": y}, data_desc + + +@cache +def load_sift( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + url = "http://ann-benchmarks.com/sift-128-euclidean.hdf5" + return load_ann_dataset_template(url, raw_data_cache) + + +@cache +def load_gist( + data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict +) -> Tuple[Dict, Dict]: + url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5" + return load_ann_dataset_template(url, raw_data_cache) + + +dataset_loading_functions = { + # classification + "airline_depdelay": load_airline_depdelay, + "a9a": load_a9a, + "bosch": load_bosch, + "codrnanorm": load_codrnanorm, + "covtype": load_covtype, + "creditcard": load_creditcard, + "digits": load_sklearn_digits, + "epsilon": load_epsilon, + "fraud": load_fraud, + "gisette": load_gisette, + "hepmass": load_hepmass, + "higgs": load_higgs, + "susy": load_susy, + "ijcnn": load_ijcnn, + "klaverjas": load_klaverjas, + "cifar": load_cifar, + "connect": load_connect, + "covertype": load_covertype, + "skin_segmentation": load_skin_segmentation, + "mnist": load_mnist, + "fashion_mnist": load_fashion_mnist, + "svhn": load_svhn, + "sensit": load_sensit, + "letters": load_letters, + # regression + "abalone": load_abalone, + "california_housing": load_california_housing, + "fried": load_fried, + "medical_charges_nominal": load_medical_charges_nominal, + "twodplanes": load_twodplanes, + "year_prediction_msd": load_year_prediction_msd, + "yolanda": load_yolanda, + "road_network": load_road_network, + # index search + "sift": load_sift, + "gist": load_gist, +} +dataset_loading_functions = { + key: preprocess(value) for key, value in dataset_loading_functions.items() +} diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py new file mode 100644 index 000000000..d2e63e9e0 --- /dev/null +++ b/sklbench/datasets/transformer.py @@ -0,0 +1,189 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import os + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from sklearn.model_selection import train_test_split + +from ..utils.bench_case import get_bench_case_value +from ..utils.logger import logger + + +def convert_data(data, dformat: str, order: str, dtype: str, device: str = None): + if isinstance(data, csr_matrix) and dformat != "csr_matrix": + data = data.toarray() + if dtype == "preserve": + dtype = None + if order == "F": + data = np.asfortranarray(data, dtype=dtype) + elif order == "C": + data = np.ascontiguousarray(data, dtype=dtype) + else: + raise ValueError(f"Unknown data order {order}") + if dformat == "numpy": + return data + elif dformat == "pandas": + if data.ndim == 1: + return pd.Series(data) + return pd.DataFrame(data) + elif dformat == "dpnp": + import dpnp + + return dpnp.array(data, dtype=dtype, order=order, device=device) + elif dformat == "dpctl": + import dpctl.tensor + + return dpctl.tensor.asarray(data, dtype=dtype, order=order, device=device) + elif dformat.startswith("modin"): + if dformat.endswith("ray"): + os.environ["MODIN_ENGINE"] = "ray" + elif dformat.endswith("dask"): + os.environ["MODIN_ENGINE"] = "dask" + elif dformat.endswith("unidist"): + os.environ["MODIN_ENGINE"] = "unidist" + os.environ["UNIDIST_BACKEND"] = "mpi" + else: + logger.info( + "Modin engine is unknown or not specified. Default engine will be used." + ) + + import modin.pandas as modin_pd + + if data.ndim == 1: + return modin_pd.Series(data) + return modin_pd.DataFrame(data) + elif dformat == "cudf": + import cudf + + if data.ndim == 1: + return cudf.Series(data) + if order == "C": + logger.warning("cudf.DataFrame is not compatible with C data order") + return cudf.DataFrame(data) + elif dformat == "cupy": + import cupy + + return cupy.array(data) + else: + raise ValueError(f"Unknown data format {dformat}") + + +def train_test_split_wrapper(*args, **kwargs): + if "ignore" in kwargs: + result = [] + for arg in args: + result += [arg, arg] + return result + else: + return train_test_split(*args, **kwargs) + + +def split_and_transform_data(bench_case, data, data_description): + if "default_split" in data_description: + split_kwargs = data_description["default_split"].copy() + else: + split_kwargs = {"random_state": 42} + split_kwargs.update(get_bench_case_value(bench_case, "data:split_kwargs", dict())) + x = data["x"] + if "y" in data: + y = data["y"] + x_train, x_test, y_train, y_test = train_test_split_wrapper(x, y, **split_kwargs) + else: + x_train, x_test = train_test_split_wrapper(x, **split_kwargs) + y_train, y_test = None, None + + distributed_split = get_bench_case_value(bench_case, "data:distributed_split", None) + if distributed_split == "rank_based": + from mpi4py import MPI + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + n_train = len(x_train) + n_test = len(x_test) + + train_start = rank * n_train // size + train_end = (1 + rank) * n_train // size + test_start = rank * n_test // size + test_end = (1 + rank) * n_test // size + + if "y" in data: + x_train, y_train = ( + x_train[train_start:train_end], + y_train[train_start:train_end], + ) + x_test, y_test = x_test[test_start:test_end], y_test[test_start:test_end] + else: + x_train = x_train[train_start:train_end] + x_test = x_test[test_start:test_end] + + device = get_bench_case_value(bench_case, "algorithm:device", None) + common_data_format = get_bench_case_value(bench_case, 
"data:format", "pandas") + common_data_order = get_bench_case_value(bench_case, "data:order", "F") + common_data_dtype = get_bench_case_value(bench_case, "data:dtype", "float64") + + data_dict = { + "x_train": x_train, + "x_test": x_test, + "y_train": y_train, + "y_test": y_test, + } + + if "n_classes" in data_description: + required_label_dtype = "int" + else: + required_label_dtype = None + + for subset_name, subset_content in data_dict.items(): + if subset_content is None: + continue + is_label = subset_name.startswith("y") + + data_format = get_bench_case_value( + bench_case, f"data:{subset_name}:format", common_data_format + ) + data_order = get_bench_case_value( + bench_case, f"data:{subset_name}:order", common_data_order + ) + data_dtype = get_bench_case_value( + bench_case, f"data:{subset_name}:dtype", common_data_dtype + ) + + if is_label and required_label_dtype is not None: + data_dtype = required_label_dtype + + converted_data = convert_data( + subset_content, data_format, data_order, data_dtype, device + ) + data_dict[subset_name] = converted_data + if not is_label: + data_description[subset_name] = { + "format": data_format, + "order": data_order, + "dtype": data_dtype, + "samples": converted_data.shape[0], + } + if len(converted_data.shape) == 2 and converted_data.shape[1] > 1: + data_description[subset_name]["features"] = converted_data.shape[1] + + return ( + (data_dict[name] for name in ["x_train", "x_test", "y_train", "y_test"]), + data_description, + ) diff --git a/sklbench/emulators/README.md b/sklbench/emulators/README.md new file mode 100644 index 000000000..faa11d79c --- /dev/null +++ b/sklbench/emulators/README.md @@ -0,0 +1,19 @@ +# Emulators + +This part of **scikit-learn_bench** contains emulators - sklearn-like estimators wrapping other non-compliant frameworks' APIs. + +Emulators are specified in configs using full module path and emulator name, for example: +```json +{ "library": "sklbench.emulators.svs", "estimator": "NearestNeighbors" } +``` + +## Emulators list + +| Library | Emulator name | Supported methods | Wrapped entity | +| --- | --- | --- | --- | +| Faiss | NearestNeighbors | `fit`, `kneighbors` | `FlatL2`, `IVFFlat` and `IVFPQ` index search. Supports both `cpu` and `gpu` devices. | +| RAFT | NearestNeighbors | `fit`, `kneighbors` | `FlatL2`, `IVFFlat`, `IVFPQ` and `CAGRA` index search. | +| SVS | NearestNeighbors | `fit`, `kneighbors` | `Vamana` index search. | + +--- +[Documentation tree](../../README.md#-documentation) diff --git a/sklbench/emulators/__init__.py b/sklbench/emulators/__init__.py new file mode 100644 index 000000000..2499f7728 --- /dev/null +++ b/sklbench/emulators/__init__.py @@ -0,0 +1,17 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +__all__ = ["common", "faiss", "raft", "svs"] diff --git a/sklbench/emulators/common/__init__.py b/sklbench/emulators/common/__init__.py new file mode 100644 index 000000000..2a763f461 --- /dev/null +++ b/sklbench/emulators/common/__init__.py @@ -0,0 +1,19 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .neighbors import NearestNeighborsBase + +__all__ = ["NearestNeighborsBase"] diff --git a/sklbench/emulators/common/neighbors.py b/sklbench/emulators/common/neighbors.py new file mode 100644 index 000000000..6c4540182 --- /dev/null +++ b/sklbench/emulators/common/neighbors.py @@ -0,0 +1,62 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + + +from warnings import warn + +import numpy as np + + +class NearestNeighborsBase: + def get_params(self): + result = { + "n_neighbors": self.n_neighbors, + "algorithm": self.algorithm, + "metric": self.metric, + "metric_params": None, + "p": 2 if "euclidean" in self.metric else None, + } + optional_keys = [ + "n_lists", + "n_probes", + "m_subvectors", + "n_bits", + "intermediate_graph_degree", + "graph_degree", + ] + for optional_key in optional_keys: + if hasattr(self, optional_key): + result[optional_key] = getattr(self, optional_key) + return result + + def get_m_subvectors(self, percentile, d): + """Method to get `m_subvectors` closest to specific percentile and + compatible with RAFT and FAISS""" + raft_comp = np.arange(1, d // 16) * 16 + faiss_comp = np.array([1, 2, 3, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48]) + faiss_comp = faiss_comp[d % faiss_comp == 0] + intersection = np.intersect1d(raft_comp, faiss_comp) + if len(intersection) == 0: + m_subvectors = 16 + warn( + f"Unable to calculate compatible m_subvectors from {d} features. " + "Defaulting to 16 subvectors." 
+ ) + else: + m_subvectors = int( + intersection[np.argmin(np.abs(intersection - d * percentile))] + ) + return m_subvectors diff --git a/sklbench/emulators/faiss/__init__.py b/sklbench/emulators/faiss/__init__.py new file mode 100644 index 000000000..fd76252a2 --- /dev/null +++ b/sklbench/emulators/faiss/__init__.py @@ -0,0 +1,19 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .neighbors import NearestNeighbors + +__all__ = ["NearestNeighbors"] diff --git a/sklbench/emulators/faiss/neighbors.py b/sklbench/emulators/faiss/neighbors.py new file mode 100644 index 000000000..882b4761e --- /dev/null +++ b/sklbench/emulators/faiss/neighbors.py @@ -0,0 +1,85 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + + +import faiss + +from ..common import NearestNeighborsBase + + +class NearestNeighbors(NearestNeighborsBase): + """ + Minimal class emulating `sklearn.neighbors.NearestNeighbors` estimator + """ + + def __init__( + self, + n_neighbors=5, + algorithm="brute", + metric="euclidean", + n_lists=1024, + n_probes=64, + m_subvectors=16, + n_bits=8, + device="cpu", + ): + self.n_neighbors = n_neighbors + self.algorithm = algorithm + self.metric = metric + self.n_lists = n_lists + self.n_probes = n_probes + self.m_subvectors = m_subvectors + self.n_bits = n_bits + self.device = device + if self.device == "gpu": + self._gpu_resources = faiss.StandardGpuResources() + + def fit(self, X, y=None): + d = X.shape[1] + if isinstance(self.m_subvectors, float): + self.m_subvectors = self.get_m_subvectors(self.m_subvectors, d) + self._base_index = faiss.IndexFlatL2(d) + if self.algorithm == "brute": + self._index = self._base_index + elif self.algorithm == "ivf_flat": + self._index = faiss.IndexIVFFlat( + self._base_index, d, self.n_lists, faiss.METRIC_L2 + ) + elif self.algorithm == "ivf_pq": + self._index = faiss.IndexIVFPQ( + self._base_index, + d, + self.n_lists, + self.m_subvectors, + self.n_bits, + faiss.METRIC_L2, + ) + else: + raise ValueError(f"Unknown algorithm {self.algorithm}") + if self.device == "gpu": + self._index = faiss.index_cpu_to_gpu(self._gpu_resources, 0, self._index) + self._index.nprobe = self.n_probes + self._index.train(X) + self._index.add(X) + return self + + def kneighbors(self, X, n_neighbors=None, return_distance=True): + k = self.n_neighbors if n_neighbors is None else n_neighbors + distances, indices = self._index.search(X, k) + if return_distance: + return distances, indices + else: + return indices diff --git a/sklbench/emulators/raft/__init__.py b/sklbench/emulators/raft/__init__.py new file mode 100644 index 000000000..fd76252a2 --- /dev/null +++ b/sklbench/emulators/raft/__init__.py @@ -0,0 +1,19 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .neighbors import NearestNeighbors + +__all__ = ["NearestNeighbors"] diff --git a/sklbench/emulators/raft/neighbors.py b/sklbench/emulators/raft/neighbors.py new file mode 100644 index 000000000..5fa39fb58 --- /dev/null +++ b/sklbench/emulators/raft/neighbors.py @@ -0,0 +1,123 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import cupy as cp +from pylibraft.common import DeviceResources +from pylibraft.neighbors import brute_force, cagra, ivf_flat, ivf_pq + +from ..common import NearestNeighborsBase + + +class NearestNeighbors(NearestNeighborsBase): + """ + Minimal class emulating `sklearn.neighbors.NearestNeighbors` estimator + """ + + def __init__( + self, + n_neighbors=5, + algorithm="brute", + metric="euclidean", + n_lists=1024, + n_probes=64, + m_subvectors=16, + n_bits=8, + intermediate_graph_degree=128, + graph_degree=64, + ): + self.n_neighbors = n_neighbors + self.algorithm = algorithm + self.metric = metric + self.n_lists = n_lists + self.n_probes = n_probes + self.m_subvectors = m_subvectors + self.n_bits = n_bits + self.intermediate_graph_degree = intermediate_graph_degree + self.graph_degree = graph_degree + self._handle = DeviceResources() + + def fit(self, X, y=None): + d = X.shape[1] + if isinstance(self.m_subvectors, float): + self.m_subvectors = self.get_m_subvectors(self.m_subvectors, d) + if self.algorithm == "brute": + self._X_fit = X + elif self.algorithm == "ivf_flat": + index_params = ivf_flat.IndexParams(n_lists=self.n_lists, metric=self.metric) + self._index = ivf_flat.build(index_params, X, handle=self._handle) + elif self.algorithm == "ivf_pq": + index_params = ivf_pq.IndexParams( + n_lists=self.n_lists, + metric=self.metric, + pq_dim=self.m_subvectors, + pq_bits=self.n_bits, + ) + self._index = ivf_pq.build(index_params, X, handle=self._handle) + elif self.algorithm == "cagra": + index_params = cagra.IndexParams( + metric="sqeuclidean", + intermediate_graph_degree=self.intermediate_graph_degree, + graph_degree=self.graph_degree, + ) + self._index = cagra.build(index_params, X, handle=self._handle) + else: + raise ValueError(f"Unknown algorithm {self.algorithm}") + self._handle.sync() + return self + + def kneighbors(self, X, n_neighbors=None, return_distance=True): + k = self.n_neighbors if n_neighbors is None else n_neighbors + if self.algorithm == "brute": + distances, indices = brute_force.knn( + self._X_fit, X, k, metric=self.metric, handle=self._handle + ) + elif self.algorithm == "ivf_flat": + distances, indices = ivf_flat.search( + ivf_flat.SearchParams(n_probes=self.n_probes), + self._index, + X, + k + 1, + handle=self._handle, + ) + elif self.algorithm == "ivf_pq": + distances, indices = ivf_pq.search( + ivf_pq.SearchParams(n_probes=self.n_probes), + self._index, + X, + k, + handle=self._handle, + ) + elif self.algorithm == "cagra": + distances, indices = cagra.search( + cagra.SearchParams(itopk_size=int(2 * k)), + self._index, + X, + k, + handle=self._handle, + ) + else: + raise ValueError(f"Unknown algorithm {self.algorithm}") + self._handle.sync() + if not isinstance(distances, cp.ndarray): + distances = cp.asarray(distances) + if not isinstance(indices, cp.ndarray): + indices = cp.asarray(indices) + if self.algorithm == "ivf_flat": + distances, indices = distances[:, :-1], indices[:, :-1] + if return_distance: + return distances, indices + else: + return indices diff --git 
a/sklbench/emulators/svs/__init__.py b/sklbench/emulators/svs/__init__.py new file mode 100644 index 000000000..fd76252a2 --- /dev/null +++ b/sklbench/emulators/svs/__init__.py @@ -0,0 +1,19 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .neighbors import NearestNeighbors + +__all__ = ["NearestNeighbors"] diff --git a/sklbench/emulators/svs/neighbors.py b/sklbench/emulators/svs/neighbors.py new file mode 100644 index 000000000..958438ead --- /dev/null +++ b/sklbench/emulators/svs/neighbors.py @@ -0,0 +1,64 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import pysvs +from psutil import cpu_count + +from ..common.neighbors import NearestNeighborsBase + + +class NearestNeighbors(NearestNeighborsBase): + """ + Minimal class emulating `sklearn.neighbors.NearestNeighbors` estimator + """ + + def __init__( + self, + n_neighbors=5, + algorithm="vamana", + metric="euclidean", + graph_max_degree=64, + window_size=128, + n_jobs=cpu_count(logical=False), + ): + self.n_neighbors = n_neighbors + self.algorithm = algorithm + self.metric = metric + self.graph_max_degree = graph_max_degree + self.window_size = window_size + self.n_jobs = n_jobs + + def fit(self, X, y=None): + build_params = pysvs.VamanaBuildParameters( + graph_max_degree=self.graph_max_degree, + window_size=self.window_size, + num_threads=self.n_jobs, + ) + self._index = pysvs.Vamana.build( + build_params, + X, + pysvs.DistanceType.L2, + num_threads=self.n_jobs, + ) + return self + + def kneighbors(self, X, n_neighbors=None, return_distance=True): + k = self.n_neighbors if n_neighbors is None else n_neighbors + indices, distances = self._index.search(X, k) + if return_distance: + return distances, indices + else: + return indices diff --git a/sklbench/report/README.md b/sklbench/report/README.md new file mode 100644 index 000000000..a3b5584c5 --- /dev/null +++ b/sklbench/report/README.md @@ -0,0 +1,29 @@ +# Report Generator + +**Scikit-learn_bench** report generator creates a high-level report with aggregated stats from provided benchmark results. 
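+
+A minimal, illustrative sketch of driving the generator programmatically through its public entry points (the result and report file names below are only examples; the result file is assumed to exist from a previous benchmark run):
+
+```python
+# Build the report CLI parser with its default options and generate a report
+# from an existing results file.
+from sklbench.report import generate_report, get_report_parser
+
+parser = get_report_parser()
+args = parser.parse_args(
+    ["--result-files", "result.json", "--report-file", "report.xlsx"]
+)
+generate_report(args)
+```
+
+The same can be done from the command line with `python -m sklbench.report --result-files result.json --report-file report.xlsx`.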
+
+The generator will eventually support different report types, but currently only one is supported:
+
+ - `Separate tables`: writes aggregated metrics on a summary page and detailed results on separate pages
+
+Raw results are converted into a pandas DataFrame, and the final report is produced by splitting this DataFrame into separate tables written to an Excel workbook.
+
+## Arguments
+
+| Name | Type | Default value | Choices | Description |
+|:---|:---|:---|:---|:---|
+| `--report-log-level` | str | WARNING | ('ERROR', 'WARNING', 'INFO', 'DEBUG') | Logging level for report generator. |
+| `--result-files` | str | ['result.json'] | | Result file path[s] from scikit-learn_bench runs for report generation. |
+| `--report-file` | str | report.xlsx | | Report file path. |
+| `--report-type` | str | separate-tables | ('separate-tables',) | Report type ("separate-tables" is the only supported now). |
+| `--compatibility-mode` | | False | | [EXPERIMENTAL] Compatibility mode drops and modifies results to make them comparable (for example, sklearn and cuML parameters). |
+| `--drop-columns`
`--drop-cols` | str | [] | | Columns to drop from report. | +| `--diff-columns`
`--diff-cols` | str | ['environment_name', 'library', 'format', 'device'] | | Columns to show difference between. | +| `--split-columns` | str | ['estimator', 'method', 'function'] | | Splitting columns for subreports/sheets. | +| `--diffs-selection` | str | upper_triangle | ['upper_triangle', 'lower_triangle', 'matrix'] | Selects which part of one-vs-one difference to show (all matrix or one of triangles). | +| `--perf-color-scale` | float | [0.8, 1.0, 10.0] | | Color scale for performance metric improvement in report. | +| `--quality-color-scale` | float | [0.99, 0.995, 1.01] | | Color scale for quality metric improvement in report. | + +--- +[Documentation tree](../../README.md#-documentation) diff --git a/sklbench/report/__init__.py b/sklbench/report/__init__.py new file mode 100644 index 000000000..88fb31cde --- /dev/null +++ b/sklbench/report/__init__.py @@ -0,0 +1,25 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .arguments import add_report_generator_arguments, get_report_parser +from .implementation import generate_report, get_result_tables_as_df + +__all__ = [ + "add_report_generator_arguments", + "get_result_tables_as_df", + "get_report_parser", + "generate_report", +] diff --git a/sklbench/report/__main__.py b/sklbench/report/__main__.py new file mode 100644 index 000000000..a76a70184 --- /dev/null +++ b/sklbench/report/__main__.py @@ -0,0 +1,29 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import sys + +from . import generate_report, get_report_parser + + +def main(): + parser = get_report_parser() + args = parser.parse_args() + return generate_report(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sklbench/report/arguments.py b/sklbench/report/arguments.py new file mode 100644 index 000000000..166661f16 --- /dev/null +++ b/sklbench/report/arguments.py @@ -0,0 +1,114 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import argparse + +from .implementation import DIFFBY_COLUMNS + + +def add_report_generator_arguments( + parser: argparse.ArgumentParser, +) -> argparse.ArgumentParser: + parser.add_argument( + "--report-log-level", + default="WARNING", + type=str, + choices=("ERROR", "WARNING", "INFO", "DEBUG"), + help="Logging level for report generator.", + ) + parser.add_argument( + "--result-files", + type=str, + nargs="+", + default=list(), + help="Result file path[s] from scikit-learn_bench runs for report generation.", + ) + parser.add_argument( + "--report-file", type=str, default="report.xlsx", help="Report file path." + ) + parser.add_argument( + "--report-type", + type=str, + default="separate-tables", + choices=("separate-tables",), + help='Report type ("separate-tables" is the only supported now).', + ) + parser.add_argument( + "--compatibility-mode", + default=False, + action="store_true", + help="[EXPERIMENTAL] Compatibility mode drops and modifies results " + "to make them comparable (for example, sklearn and cuML parameters).", + ) + # 'separate-table' report type arguments + parser.add_argument( + "--drop-columns", + "--drop-cols", + type=str, + nargs="+", + default=list(), + help="Columns to drop from report.", + ) + parser.add_argument( + "--diff-columns", + "--diff-cols", + type=str, + nargs="+", + default=DIFFBY_COLUMNS, + help="Columns to show difference between.", + ) + parser.add_argument( + "--split-columns", + type=str, + nargs="+", + default=["estimator", "method", "function"], + help="Splitting columns for subreports/sheets.", + ) + parser.add_argument( + "--diffs-selection", + type=str, + choices=["upper_triangle", "lower_triangle", "matrix"], + default="upper_triangle", + help="Selects which part of one-vs-one difference to show " + "(all matrix or one of triangles).", + ) + # color scale settings + parser.add_argument( + "--perf-color-scale", + type=float, + nargs="+", + default=[0.8, 1.0, 10.0], + help="Color scale for performance metric improvement in report.", + ) + parser.add_argument( + "--quality-color-scale", + type=float, + nargs="+", + default=[0.99, 0.995, 1.01], + help="Color scale for quality metric improvement in report.", + ) + return parser + + +def get_report_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python -m sklbench.report", + description=""" + Scikit-learn_bench report generator + """, + ) + add_report_generator_arguments(parser) + return parser diff --git a/sklbench/report/compatibility.py b/sklbench/report/compatibility.py new file mode 100644 index 000000000..d297b52c4 --- /dev/null +++ b/sklbench/report/compatibility.py @@ -0,0 +1,199 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import math + +import numpy as np +import pandas as pd + +from ..utils.logger import logger + + +def transform_results_to_compatible(results: pd.DataFrame): + # sklearn and sklearnex compatibility + if (results["library"] == "sklearnex").any(): + # delete extra columns related to sklearnex only + results.drop( + inplace=True, + errors="ignore", + columns=[ + "max_bins", + "min_bin_size", + ], + ) + # cuML compatibility + if ( + (results["library"] == "cuml") + | (results["library"] == "raft") + | (results["library"] == "faiss") + ).any(): + logger.info( + "Found cuML, RAFT or FAISS entries in provided results. They will be " + "filtered and transformed to make all entries compatible " + "assuming config entries are aligned between cuML and other frameworks." + ) + # delete extra columns related to cuML only or sklearn only + results.drop( + inplace=True, + errors="ignore", + columns=[ + # sklearn common + "n_jobs", + # cuML common + "output_type", + # cuML OR sklearn (dependent on algorithm) + "random_state", + "verbose", + "normalize", + "copy_x", + "copy_X", + "warm_start", + # sklearn DBSCAN + "leaf_size", + # cuML DBSCAN + "max_mbytes_per_batch", + "calc_core_sample_indices", + # cuML KMeans + "oversampling_factor", + "max_samples_per_batch", + # sklearn kNN + "leaf_size", + "radius", + # sklearn LinearRegression + "positive", + "precompute", + # sklearn LogisticRegression + "dual", + "intercept_scaling", + "multi_class", + # cuML LogisticRegression + "linesearch_max_iter", + # sklearn PCA + "n_oversamples", + "power_iteration_normalizer", + # cuML TSNE + "late_exaggeration", + "learning_rate_method", + "perplexity_max_iter", + "exaggeration_iter", + "pre_momentum", + "post_momentum", + "square_distances", + # sklearn SVM + "break_ties", + "shrinking", + # cuML SVM + "nochange_steps", + # sklearn[ex] Ensemble + "ccp_alpha", + "max_bins", + "min_bin_size", + "min_weight_fraction_leaf", + "oob_score", + # cuml Ensemble + "n_bins", + "accuracy_metric", + "max_batch_size", + "n_streams", + # NearestNeighbors emulators + "n_lists", + "n_probes", + "m_subvectors", + "n_bits", + "intermediate_graph_degree", + "graph_degree", + ], + ) + # DBSCAN parameters renaming + cuml_dbscan_index = (results["estimator"] == "DBSCAN") & ( + results["library"] == "cuml" + ) + if cuml_dbscan_index.any(): + results.loc[cuml_dbscan_index, "algorithm"] = "brute" + # KMeans parameters renaming + cuml_kmeans_index = (results["estimator"] == "KMeans") & ( + results["library"] == "cuml" + ) + if cuml_kmeans_index.any(): + results.loc[cuml_kmeans_index, "algorithm"] = "lloyd" + results.loc[ + cuml_kmeans_index & (results["init"] == "scalable-k-means++"), "init" + ] = "k-means++" + # Linear models parameters renaming + linear_index = ( + (results["estimator"] == "LinearRegression") + | (results["estimator"] == "Ridge") + | (results["estimator"] == "Lasso") + | (results["estimator"] == "ElasticNet") + ) & ( + (results["library"] == "cuml") + | (results["library"] == "sklearn") + | (results["library"] == "sklearnex") + ) + if 
linear_index.any(): + results.loc[linear_index, "algorithm"] = np.nan + results.loc[linear_index, "solver"] = np.nan + + sklearn_ridge_index = (results["estimator"] == "Ridge") & ( + (results["library"] == "sklearn") | (results["library"] == "sklearnex") + ) + if sklearn_ridge_index.any(): + results.loc[sklearn_ridge_index, "tol"] = np.nan + + cuml_logreg_index = (results["estimator"] == "LogisticRegression") & ( + results["library"] == "cuml" + ) + if cuml_logreg_index.any(): + lbfgs_solver_index = ( + cuml_logreg_index + & (results["solver"] == "qn") + & ((results["penalty"] == "none") | (results["penalty"] == "l2")) + ) + if lbfgs_solver_index.any(): + results.loc[lbfgs_solver_index, "solver"] = "lbfgs" + # TSNE parameters renaming + cuml_tsne_index = (results["estimator"] == "TSNE") & ( + results["library"] == "cuml" + ) + if cuml_tsne_index.any(): + results.loc[cuml_tsne_index, "n_neighbors"] = np.nan + # SVC parameters renaming + cuml_svc_index = (results["estimator"] == "SVC") & (results["library"] == "cuml") + if cuml_svc_index.any(): + results.loc[cuml_svc_index, "decision_function_shape"] = results.loc[ + cuml_svc_index, "multiclass_strategy" + ] + results.loc[cuml_svc_index, "multiclass_strategy"] = np.nan + # Ensemble parameters renaming + cuml_rf_index = ( + (results["estimator"] == "RandomForestClassifier") + | (results["estimator"] == "RandomForestRegressor") + ) & (results["library"] == "cuml") + if cuml_rf_index.any(): + gini_index = cuml_rf_index & (results["split_criterion"] == 0) + if gini_index.any(): + results.loc[gini_index, "criterion"] = "gini" + results.loc[gini_index, "split_criterion"] = np.nan + mse_index = cuml_rf_index & (results["split_criterion"] == 2) + if mse_index.any(): + results.loc[mse_index, "criterion"] = "squared_error" + results.loc[mse_index, "split_criterion"] = np.nan + inf_leaves_index = cuml_rf_index & (results["max_leaves"] == -1) + if inf_leaves_index.any(): + results.loc[inf_leaves_index, "max_leaf_nodes"] = None + results.loc[inf_leaves_index, "max_leaves"] = np.nan + + return results diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py new file mode 100644 index 000000000..b577ab551 --- /dev/null +++ b/sklbench/report/implementation.py @@ -0,0 +1,364 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse +import json +from typing import Dict, List + +import openpyxl as xl +import pandas as pd +from openpyxl.formatting.rule import ColorScaleRule +from openpyxl.utils import get_column_letter +from openpyxl.utils.dataframe import dataframe_to_rows +from scipy.stats import gmean + +from ..utils.common import custom_format, flatten_dict, flatten_list +from ..utils.logger import logger +from .compatibility import transform_results_to_compatible + +METRICS = { + "lower is better": [ + "time[ms]", + "iterations", + # classification + "logloss", + # regression + "RMSE", + # clustering + "inertia", + "Davies-Bouldin score", + # manifold + # - TSNE + "Kullback-Leibler divergence", + ], + "higher is better": [ + "throughput[samples/ms]", + # classification + "accuracy", + "balanced accuracy", + "ROC AUC", + # regression + "R2", + # clustering + "homogeneity", + "completeness", + # search + "recall@10", + ], + "indifferent": [ + # SVM + "support vectors", + # PCA + "average log-likelihood", + "1st component variance ratio", + # DBSCAN + # NB: 'n_clusters' is parameter of KMeans while + # 'clusters' is number of computer clusters by DBSCAN + "clusters", + ], + "incomparable": ["time std[ms]"], +} +METRIC_NAMES = flatten_list([list(METRICS[key]) for key in METRICS]) +PERF_METRICS = ["time[ms]", "throughput[samples/ms]"] + +COLUMNS_ORDER = [ + # algorithm + "stage", + "task", + "library", + "estimator", + "method", + "function", + "online_inference_mode", + "device", + "environment_name", + # data + "dataset", + "samples", + "features", + "format", + "dtype", + "order", + "n_classes", + "n_clusters", + "batch_size", +] + +DIFFBY_COLUMNS = ["environment_name", "library", "format", "device"] + + +def geomean_wrapper(a): + return gmean(a, nan_policy="omit") + + +def reorder_columns(input_columns: List, columns_order: List = COLUMNS_ORDER) -> List: + output_columns = list() + # 1st step: select existing columns from known ordered columns + for ordered_column in columns_order: + if ordered_column in input_columns: + output_columns.append(ordered_column) + input_columns.remove(ordered_column) + # 2nd step: add left input columns + output_columns += input_columns + return output_columns + + +def filter_nan_columns(input_df: pd.DataFrame): + output_df = input_df.copy() + non_nan_columns = output_df.columns[output_df.isna().mean(axis=0) < 1] + output_df = output_df[non_nan_columns] + return output_df + + +def split_df_by_columns( + input_df: pd.DataFrame, columns: List, remove_column: bool = True +) -> Dict[str, pd.DataFrame]: + split_columns = list(set(columns) & set(input_df.columns)) + split_columns = reorder_columns(split_columns, columns) + value_counts = input_df.value_counts(split_columns, dropna=False, sort=False) + output_dfs = {} + for unique_values in value_counts.index: + index_mask = [ + input_df[column] == unique_value + for column, unique_value in zip(value_counts.index.names, unique_values) + if not pd.isna(unique_value) + ] + index_mask = pd.DataFrame(index_mask).all(axis=0) + subset_name = str(unique_values)[1:-1] + subset_name = subset_name.replace(", ", "|").replace(",", "").replace("'", "") + subset_name = subset_name.replace("nan|", "").replace("|nan", "") + output_dfs[subset_name] = filter_nan_columns(input_df.loc[index_mask]) + if remove_column: + output_dfs[subset_name] = output_dfs[subset_name].drop( + columns=set(split_columns) & set(output_dfs[subset_name].columns) + ) + 
output_dfs[subset_name] = output_dfs[subset_name][ + reorder_columns(list(output_dfs[subset_name].columns)) + ] + return output_dfs + + +def compare_df(input_df, diff_columns, diffs_selection, compared_columns=METRIC_NAMES): + def select_comparison(i, j, diffs_selection): + if diffs_selection == "upper_triangle": + return j > i + elif diffs_selection == "lower_triangle": + return i > j + return i != j + + index_columns = list( + (set(input_df.columns) - set(diff_columns)) - set(compared_columns) + ) + df = input_df.set_index(index_columns) + unique_indices = df.index.unique() + splitted_dfs = split_df_by_columns(input_df, diff_columns) + splitted_dfs = {key: df.set_index(index_columns) for key, df in splitted_dfs.items()} + + # drop results with duplicated indices (keep first entry only) + for key, splitted_df in splitted_dfs.items(): + splitted_dfs[key] = splitted_df[~splitted_df.index.duplicated(keep="first")] + + df = pd.DataFrame(index=unique_indices) + # original values + for key, splitted_df in splitted_dfs.items(): + if len(set(splitted_df.columns) - set(compared_columns)) > 0: + raise ValueError + for column in splitted_df.columns: + df[f"{key}\n{column}"] = splitted_df[column] + # compared values + for i, (key_ith, df_ith) in enumerate(splitted_dfs.items()): + for j, (key_jth, df_jth) in enumerate(splitted_dfs.items()): + if select_comparison(i, j, diffs_selection): + comparison_name = f"{key_jth} vs {key_ith}" + for column in df_ith.columns: + if column in METRICS["higher is better"]: + df[f"{comparison_name}\n{column} relative improvement"] = ( + df_jth[column] / df_ith[column] + ) + elif column in METRICS["lower is better"]: + df[f"{comparison_name}\n{column} relative improvement"] = ( + df_ith[column] / df_jth[column] + ) + elif column in METRICS["indifferent"]: + df[f"{comparison_name}\n{column} is equal"] = ( + df_ith[column] == df_jth[column] + ) + df = df.reset_index() + # move to multi-index + df = df[reorder_columns(list(df.columns))] + df.columns = [ + column if "\n" in column else f"parameter\n{column}" for column in df.columns + ] + df.columns = pd.MultiIndex.from_tuples( + [tuple(column.split("\n")) for column in df.columns] + ) + return df + + +def write_df_to_sheet(df, sheet, index=True, header=True): + for row in dataframe_to_rows(df, index=index, header=header): + if any(map(lambda x: x is not None, row)): + sheet.append(row) + + +def merge_result_files(filenames): + results = dict() + for result_name in filenames: + with open(result_name, "r") as fp: + result = json.load(fp) + for key, value in result.items(): + if key in results: + if isinstance(value, list): + results[key] += value + elif isinstance(value, dict): + results[key].update(value) + else: + results[key] = value + return results + + +def get_result_tables_as_df( + results, + diffby_columns=DIFFBY_COLUMNS, + splitby_columns=["estimator", "method", "function"], + compatibility_mode=False, +): + bench_cases = pd.DataFrame( + [flatten_dict(bench_case) for bench_case in results["bench_cases"]] + ) + + if compatibility_mode: + bench_cases = transform_results_to_compatible(bench_cases) + + for column in diffby_columns.copy(): + if bench_cases[column].nunique() == 1: + bench_cases.drop(columns=[column], inplace=True) + diffby_columns.remove(column) + + return split_df_by_columns(bench_cases, splitby_columns) + + +def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame: + metric_columns = list() + for column in list(df.columns): + for metric_name in METRIC_NAMES: + # only relative improvements 
are included in summary currently + if len(column) > 1 and column[1] == f"{metric_name} relative improvement": + metric_columns.append(column) + summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T + summary.index = pd.Index([df_name]) + return summary + + +def get_color_rule(scale): + red, yellow, green = "F85D5E", "FAF52E", "58C144" + start_value, mid_value, end_value = scale + return ColorScaleRule( + start_type="num", + start_value=start_value, + start_color=red, + mid_type="num", + mid_value=mid_value, + mid_color=yellow, + end_type="num", + end_value=end_value, + end_color=green, + ) + + +def apply_rules_for_sheet(sheet, perf_color_scale, quality_color_scale): + for column in sheet.iter_cols(): + column_idx = get_column_letter(column[0].column) + is_rel_impr = any( + [ + isinstance(cell.value, str) and "relative improvement" in cell.value + for cell in column + ] + ) + is_time = any( + [ + isinstance(cell.value, str) + and (any(map(lambda x: x in cell.value, PERF_METRICS))) + for cell in column + ] + ) + if is_rel_impr: + cell_range = f"${column_idx}1:${column_idx}{len(column)}" + sheet.conditional_formatting.add( + cell_range, + get_color_rule(perf_color_scale if is_time else quality_color_scale), + ) + + +def write_environment_info(results, workbook): + env_infos = results["environment"] + for env_name, env_info in env_infos.items(): + for info_type, info_subclass in env_info.items(): + new_ws = workbook.create_sheet(title=f"{info_type}|{env_name}"[:31]) + for sub_key, sub_info in info_subclass.items(): + if isinstance(sub_info, dict): + if all( + map( + lambda x: not (isinstance(x, list) or isinstance(x, dict)), + sub_info.values(), + ) + ): + info_df = pd.Series(sub_info).to_frame() + else: + info_df = pd.DataFrame(sub_info).T + elif isinstance(sub_info, list): + info_df = pd.DataFrame(sub_info) + else: + continue + write_df_to_sheet(info_df, new_ws) + new_ws.append([None]) + + +def generate_report(args: argparse.Namespace): + logger.setLevel(args.report_log_level) + results = merge_result_files(args.result_files) + + diffby, splitby = args.diff_columns, args.split_columns + dfs = get_result_tables_as_df(results, diffby, splitby, args.compatibility_mode) + + wb = xl.Workbook() + summary_dfs = list() + for df_name, df in dfs.items(): + drop_columns = list(set(df.columns) & set(args.drop_columns)) + df = df.drop(columns=drop_columns) + + ws = wb.create_sheet(title=df_name[:30]) + if len(diffby) > 0: + current_df = compare_df(df, diffby, args.diffs_selection) + else: + current_df = df + write_df_to_sheet(current_df, ws, index=False) + apply_rules_for_sheet(ws, args.perf_color_scale, args.quality_color_scale) + summary_dfs.append(get_summary_from_df(current_df, df_name)) + # write summary to corresponding sheet + summary_df = pd.concat(summary_dfs, axis=0, join="outer") + summary_df = summary_df[summary_df.columns.sortlevel(level=0, ascending=False)[0]] + logger.info(f"{custom_format('Report summary', bcolor='HEADER')}\n{summary_df}") + if summary_df.size > 0: + summary_ws = wb.create_sheet("Summary") + write_df_to_sheet(summary_df, summary_ws) + apply_rules_for_sheet(summary_ws, args.perf_color_scale, args.quality_color_scale) + # write environment info + write_environment_info(results, wb) + # remove default sheet + wb.remove(wb["Sheet"]) + wb.save(args.report_file) + return 0 diff --git a/sklbench/runner/README.md b/sklbench/runner/README.md new file mode 100644 index 000000000..14ead8d3c --- /dev/null +++ b/sklbench/runner/README.md @@ -0,0 +1,58 @@ +# 
Benchmarks Runner + +**Scikit-learn_bench** runner orchestrates running of the individual benchmarks based on provided config files, parameters, filters, and other arguments. + +Runner consumes the following types of arguments: + - Settings defining benchmarking cases (config location\[s\], global parameters, and filters) + - Verbosity levels for different scikit-learn_bench stages (runner, benchmarks, report generator) + - Settings for aggregated benchmarks output + - Scikit-learn_bench workflow parameters + +And follows the next steps: + +1. Generate benchmarking cases +2. Filter them if possible to compare parameters and filters (early filtering) +3. Prefetch datasets in parallel if explicitly requested with a special argument +4. Sequentially call individual benchmarks as subprocesses +5. Combine raw results and output them as a JSON file +6. Call report generator in-place if requested + +See [benchmarking config specification](../../docs/README.md) for explanation of config files formatting. + +```mermaid +flowchart LR + A["Configs reading"] --> B + B["Filtering of bench. cases"] --> P + P["Datasets prefetching\n[optional]"] --> C + B --> C + C["Benchmarks calling"] --> D + D["Raw results output"] --> E + E["Report generation\n[optional]"] + + classDef optional stroke-dasharray: 8 8 + class P optional + class E optional +``` + +## Arguments + + +| Name | Type | Default value | Choices | Description | +|:-----------------------------------------------|:-------|:----------------------------------------------------|:-----------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------| +| `--runner-log-level` | str | WARNING | ('ERROR', 'WARNING', 'INFO', 'DEBUG') | Logging level for benchmarks runner. | +| `--bench-log-level` | str | WARNING | ('ERROR', 'WARNING', 'INFO', 'DEBUG') | Logging level for each running benchmark. | +| `--log-level`
`-l` | str | | ('ERROR', 'WARNING', 'INFO', 'DEBUG') | Global logging level for benchmarks: overwrites the runner, benchmark, and report logging levels. | +| `--config`
`--configs`
`-c` | str | | | Paths to configuration files and/or directories that contain configuration files. | +| `--parameters`
`--params`
`-p` | str | | | Globally defines or overwrites config parameters. For example: `-p data:dtype=float32 data:order=F`. | +| `--parameter-filters`
`--filters`
`-f` | str | | | Filters benchmarking cases by parameter values. For example: `-f data:dtype=float32 data:order=F`. | +| `--result-file`
`-r` | str | result.json | | File path to store the results of scikit-learn_bench's run cases. | +| `--environment-name`
`--env-name`
`-e` | str | | | Environment name to use instead of it's configuration hash. | +| `--prefetch-datasets` | | False | | Load all requested datasets in parallel before running benchmarks. | +| `--exit-on-error` | | False | | Interrupt runner and exit if last benchmark failed with error. | +| `--describe-parser` | | False | | Print parser description in Markdown table format and exit. | +| `--report` | | False | | Enables generation of report. | + +Also, benchmarks runner includes and takes into the account [arguments of report generator](../report/README.md#arguments) if `--report` flag is passed. + +--- +[Documentation tree](../../README.md#-documentation) diff --git a/sklbench/runner/__init__.py b/sklbench/runner/__init__.py new file mode 100644 index 000000000..ff76d5383 --- /dev/null +++ b/sklbench/runner/__init__.py @@ -0,0 +1,20 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .arguments import get_parser_description, get_runner_parser +from .implementation import run_benchmarks + +__all__ = ["get_runner_parser", "get_parser_description", "run_benchmarks"] diff --git a/sklbench/runner/arguments.py b/sklbench/runner/arguments.py new file mode 100644 index 000000000..1ba47daaa --- /dev/null +++ b/sklbench/runner/arguments.py @@ -0,0 +1,166 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import argparse +from typing import Dict, List + +import pandas as pd + +from ..report import add_report_generator_arguments + + +def get_parser_description(parser: argparse.ArgumentParser) -> pd.DataFrame: + """Convert parser description to Markdown-style table.""" + + def get_argument_actions(parser: argparse.ArgumentParser) -> List: + arg_actions = [] + + for action in parser._actions: + if isinstance(action, argparse._ArgumentGroup): + for subaction in action._group_actions: + arg_actions.append(subaction) + else: + arg_actions.append(action) + return arg_actions + + def parse_action(action: argparse.Action) -> Dict: + return { + "Name": "
".join(map(lambda x: f"`{x}`", action.option_strings)), + "Type": action.type.__name__ if action.type is not None else None, + "Default value": ( + action.default if action.default is not argparse.SUPPRESS else None + ), + "Choices": action.choices, + "Description": action.help, + } + + return pd.DataFrame(map(parse_action, get_argument_actions(parser))).to_markdown( + index=False + ) + + +def add_runner_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + # verbosity levels + parser.add_argument( + "--runner-log-level", + default="WARNING", + type=str, + choices=("ERROR", "WARNING", "INFO", "DEBUG"), + help="Logging level for benchmarks runner.", + ) + parser.add_argument( + "--bench-log-level", + default="WARNING", + type=str, + choices=("ERROR", "WARNING", "INFO", "DEBUG"), + help="Logging level for each running benchmark.", + ) + parser.add_argument( + "--log-level", + "-l", + default=None, + type=str, + choices=("ERROR", "WARNING", "INFO", "DEBUG"), + help="Global logging level for benchmarks: " + "overwrites runner, benchmarks and report logging levels.", + ) + # benchmarking cases finding, overwriting and filtering + parser.add_argument( + "--config", + "--configs", + "-c", + type=str, + nargs="+", + default=None, + help="Paths to a configuration files or/and " + "directories that contain configuration files.", + ) + parser.add_argument( + "--parameters", + "--params", + "-p", + default="", + type=str, + nargs="+", + help="Globally defines or overwrites config parameters. " + "For example: `-p data:dtype=float32 data:order=F`.", + ) + parser.add_argument( + "--parameter-filters", + "--filters", + "-f", + default="", + type=str, + nargs="+", + help="Filters benchmarking cases by parameter values. " + "For example: `-f data:dtype=float32 data:order=F`.", + ) + + parser.add_argument( + "--result-file", + "-r", + type=str, + default="result.json", + help="File path to store scikit-learn_bench's runned cases results.", + ) + parser.add_argument( + "--environment-name", + "--env-name", + "-e", + type=str, + default=None, + help="Environment name to use instead of it's configuration hash.", + ) + parser.add_argument( + "--prefetch-datasets", + default=False, + action="store_true", + help="Load all requested datasets in parallel before running benchmarks.", + ) + # workflow control + parser.add_argument( + "--exit-on-error", + default=False, + action="store_true", + help="Interrupt runner and exit if last benchmark failed with error.", + ) + # option to get parser description in Markdown table format for READMEs + parser.add_argument( + "--describe-parser", + default=False, + action="store_true", + help="Print parser description in Markdown table format and exit.", + ) + # report generator arguments for optional usage + parser.add_argument( + "--report", + default=False, + action="store_true", + help="Enables generation of report.", + ) + add_report_generator_arguments(parser) + return parser + + +def get_runner_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python -m sklbench", + description=""" + Scikit-learn_bench runner + """, + ) + add_runner_arguments(parser) + return parser diff --git a/sklbench/runner/commands_helper.py b/sklbench/runner/commands_helper.py new file mode 100644 index 000000000..b66da0111 --- /dev/null +++ b/sklbench/runner/commands_helper.py @@ -0,0 +1,128 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import json +import os +import sys +from time import time +from typing import Dict, List, Tuple + +from ..utils.bench_case import get_bench_case_name, get_bench_case_value +from ..utils.common import custom_format, hash_from_json_repr, read_output_from_command +from ..utils.custom_types import BenchCase +from ..utils.logger import logger + + +def generate_benchmark_command( + bench_case: BenchCase, filters: List[BenchCase], log_level: str +) -> str: + # generate parameter and filter arguments for benchmark cli wrapper + bench_case_str = json.dumps(bench_case).replace(" ", "") + filters_str = json.dumps({"filters": filters}).replace(" ", "") + # get command prefix if set + command_prefix = "" + # 1. taskset (cpu affinity) command prefix + taskset = get_bench_case_value(bench_case, "bench:taskset") + if taskset is not None: + command_prefix = f"taskset -c {taskset} {command_prefix}" + # 2. distributed workflow (MPI, etc.) command prefix + distribution = get_bench_case_value(bench_case, "bench:distributor") + if distribution == "mpi": + mpi_params = get_bench_case_value(bench_case, "bench:mpi_params", dict()) + mpi_prefix = "mpirun" + for mpi_param_name, mpi_param_value in mpi_params.items(): + mpi_prefix += f" -{mpi_param_name} {mpi_param_value}" + command_prefix = f"{mpi_prefix} {command_prefix}" + # 3. Intel(R) VTune* profiling command prefix + vtune_profiling = get_bench_case_value(bench_case, "bench:vtune_profiling") + if vtune_profiling is not None: + if sys.platform == "linux": + vtune_result_dir = get_bench_case_value( + bench_case, "bench:vtune_results_directory", "vtune_results" + ) + os.makedirs(vtune_result_dir, exist_ok=True) + vtune_result_path = os.path.join( + vtune_result_dir, + "_".join( + [ + get_bench_case_name(bench_case, shortened=True, separator="_"), + hash_from_json_repr(bench_case), + # TODO: replace unix time in ms with datetime + str(int(time() * 1000)), + ] + ), + ) + command_prefix = ( + f"vtune -collect {vtune_profiling} -r {vtune_result_path} " + f"-start-paused -q -no-summary {command_prefix}" + ) + # vtune CLI requires modification of quotes bench args: `"` -> `\"` + bench_case_str = bench_case_str.replace('"', '\\"') + filters_str = filters_str.replace('"', '\\"') + else: + logger.warning( + "Intel(R) VTune(TM) profiling in scikit-learn_bench " + "is supported only on Linux." 
+ ) + # benchmark selection + if get_bench_case_value(bench_case, "algorithm:estimator") is not None: + benchmark_name = "sklearn_estimator" + elif get_bench_case_value(bench_case, "algorithm:function") is not None: + benchmark_name = "custom_function" + else: + raise ValueError("Unknown benchmark type") + return ( + f"{command_prefix}python " + f"-m sklbench.benchmarks.{benchmark_name} " + f"--bench-case {bench_case_str} " + f"--filters {filters_str} " + f"--log-level {log_level}" + ) + + +def run_benchmark_from_case( + bench_case: BenchCase, filters: List[BenchCase], log_level: str +) -> Tuple[int, List[Dict]]: + command = generate_benchmark_command(bench_case, filters, log_level) + logger.debug(f"Benchmark wrapper call command:\n{command}") + return_code, stdout, stderr = read_output_from_command(command) + + # filter stdout warnings + prefixes_to_skip = ["[W]", "[I]"] + stdout = "\n".join( + [ + line + for line in stdout.split("\n") + if not any(map(lambda x: line.startswith(x), prefixes_to_skip)) + ] + ) + + if stdout != "": + logger.debug(f'{custom_format("Benchmark stdout:", bcolor="OKBLUE")}\n{stdout}') + if return_code == 0: + if stderr != "": + logger.warning(f"Benchmark stderr:\n{stderr}") + try: + result = json.loads(stdout) + except json.JSONDecodeError: + logger.warning("Unable to read benchmark output in json format.") + return_code = -1 + result = list() + else: + logger.warning(f"Benchmark returned non-zero code={return_code}.") + logger.warning(f"Benchmark stderr:\n{stderr}") + result = list() + return return_code, result diff --git a/sklbench/runner/implementation.py b/sklbench/runner/implementation.py new file mode 100644 index 000000000..2375e4b75 --- /dev/null +++ b/sklbench/runner/implementation.py @@ -0,0 +1,133 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + + +import argparse +import json +from multiprocessing import Pool +from typing import Dict, List, Tuple, Union + +from psutil import cpu_count +from tqdm import tqdm + +from ..datasets import load_data +from ..report import generate_report, get_result_tables_as_df +from ..utils.bench_case import get_bench_case_name, get_data_name +from ..utils.common import custom_format, hash_from_json_repr +from ..utils.config import early_filtering, generate_bench_cases, generate_bench_filters +from ..utils.custom_types import BenchCase +from ..utils.env import get_environment_info +from ..utils.logger import logger +from .commands_helper import run_benchmark_from_case + + +def call_benchmarks( + bench_cases: List[BenchCase], + filters: List[BenchCase], + log_level: str = "WARNING", + environment_name: Union[str, None] = None, + early_exit: bool = False, +) -> Tuple[int, Dict[str, Union[Dict, List]]]: + """Iterates over benchmarking cases with progress bar and combines their results""" + env_info = get_environment_info() + if environment_name is None: + environment_name = hash_from_json_repr(env_info) + results = list() + return_code = 0 + bench_cases_with_pbar = tqdm(bench_cases) + for bench_case in bench_cases_with_pbar: + bench_cases_with_pbar.set_description( + custom_format( + get_bench_case_name(bench_case, shortened=True), bcolor="HEADER" + ) + ) + try: + bench_return_code, bench_entries = run_benchmark_from_case( + bench_case, filters, log_level + ) + if bench_return_code != 0: + return_code = bench_return_code + if early_exit: + break + for entry in bench_entries: + entry["environment_name"] = environment_name + results.append(entry) + except KeyboardInterrupt: + return_code = -1 + break + full_result = { + "bench_cases": results, + "environment": {environment_name: env_info}, + } + return return_code, full_result + + +def run_benchmarks(args: argparse.Namespace) -> int: + # overwrite all logging levels if requested + if args.log_level is not None: + for log_type in ["runner", "bench", "report"]: + setattr(args, f"{log_type}_log_level", args.log_level) + # set logging level + logger.setLevel(args.runner_log_level) + + # find and parse configs + bench_cases = generate_bench_cases(args) + + # get parameter filters + param_filters = generate_bench_filters(args.parameter_filters) + + # perform early filtering based on 'data' parameters and + # some of 'algorithm' parameters assuming they were already assigned + bench_cases = early_filtering(bench_cases, param_filters) + + # prefetch datasets + if args.prefetch_datasets: + # trick: get unique dataset names only to avoid loading of same dataset + # by different cases/processes + dataset_cases = {get_data_name(case): case for case in bench_cases} + logger.debug(f"Unique dataset names to load:\n{list(dataset_cases.keys())}") + n_proc = min([16, cpu_count(), len(dataset_cases)]) + logger.info(f"Prefetching datasets with {n_proc} processes") + with Pool(n_proc) as pool: + pool.map(load_data, dataset_cases.values()) + + # run bench_cases + return_code, result = call_benchmarks( + bench_cases, + param_filters, + args.bench_log_level, + args.environment_name, + args.exit_on_error, + ) + + # output as pandas dataframe + if len(result["bench_cases"]) != 0: + for key, df in get_result_tables_as_df(result).items(): + logger.info(f'{custom_format(key, bcolor="HEADER")}\n{df}') + + # output raw result + logger.debug(custom_format(result)) + + with open(args.result_file, "w") as 
fp: + json.dump(result, fp, indent=4) + + # generate report + if args.report: + if args.result_file not in args.result_files: + args.result_files += [args.result_file] + generate_report(args) + + return return_code diff --git a/datasets/__init__.py b/sklbench/utils/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from datasets/__init__.py rename to sklbench/utils/__init__.py diff --git a/sklbench/utils/bench_case.py b/sklbench/utils/bench_case.py new file mode 100644 index 000000000..b63f36bb4 --- /dev/null +++ b/sklbench/utils/bench_case.py @@ -0,0 +1,136 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from copy import deepcopy +from typing import Any, List, Union + +from .custom_types import BenchCase, JsonTypesUnion + + +def set_bench_case_value( + bench_case: BenchCase, param_name: Union[List[str], str], value: JsonTypesUnion +): + if isinstance(param_name, str): + keys_chain = param_name.split(":") + else: + keys_chain = param_name + # deep dive into bench case + local_value = bench_case + for prev_key in keys_chain[:-1]: + if prev_key not in local_value: + local_value[prev_key] = dict() + local_value = local_value[prev_key] + local_value[keys_chain[-1]] = value # type: ignore + + +def get_bench_case_value( + bench_case: BenchCase, + param_name: Union[List[str], str], + default_value: JsonTypesUnion = None, +) -> Any: + if isinstance(param_name, str): + keys_chain = param_name.split(":") + else: + keys_chain = param_name + # deep dive into bench case + local_value = bench_case + for prev_key in keys_chain: + if prev_key not in local_value: + return default_value + local_value = local_value[prev_key] + return deepcopy(local_value) + + +def get_bench_case_values( + bench_case: BenchCase, + param_names: Union[List[List[str]], List[str]], + default_value: JsonTypesUnion = None, +) -> List[Any]: + return list( + get_bench_case_value(bench_case, param_name, default_value) + for param_name in param_names + ) + + +def get_first_of_bench_case_values( + bench_case: BenchCase, param_names: Union[List[List[str]], List[str]] +) -> JsonTypesUnion: + values = get_bench_case_values(bench_case, param_names, None) + values = list(filter(lambda x: x is not None, values)) + if len(values) == 0: + raise ValueError(f"Unable to find any of values: {param_names}.") + else: + return values[0] + + +def apply_func_to_bench_case_values( + bench_case: BenchCase, func, copy: bool = False +) -> BenchCase: + if copy: + result = deepcopy(bench_case) + else: + result = bench_case + for key, value in result.items(): + if isinstance(value, dict): + apply_func_to_bench_case_values(value, func) + else: + result[key] = func(value) + return result + + +def get_data_name(bench_case: BenchCase, shortened: bool = False) -> str: + # check if unique dataset name is specified directly + dataset = 
get_bench_case_value(bench_case, "data:dataset") + if dataset is not None: + return dataset + # check source of data + source = get_bench_case_value(bench_case, "data:source") + # generate kwargs postfixes for data filename + postfixes = dict() + for kwargs_type in ["generation", "dataset"]: + postfix = "" + for key, value in get_bench_case_value( + bench_case, f"data:{kwargs_type}_kwargs", dict() + ).items(): + postfix += f"_{key}_{value}" + postfixes[kwargs_type] = postfix + # fetch_openml + if source == "fetch_openml": + openml_id = get_bench_case_value(bench_case, "data:id") + return f"openml_{openml_id}" + # make_* + if source in ["make_classification", "make_regression", "make_blobs"]: + name = source + if shortened: + return name.replace("classification", "clsf").replace("regression", "regr") + else: + return f'{name}{postfixes["generation"]}{postfixes["dataset"]}' + raise ValueError("Unable to get data name") + + +def get_bench_case_name( + bench_case: BenchCase, shortened: bool = False, separator: str = " " +) -> str: + library_name = get_bench_case_value(bench_case, "algorithm:library") + alg_name = get_first_of_bench_case_values( + bench_case, ["algorithm:estimator", "algorithm:function"] + ) + data_name = get_data_name(bench_case, shortened) + name_args = [library_name, alg_name, data_name] + device = get_bench_case_value(bench_case, "algorithm:device", None) + if device is not None: + name_args.append(device) + return separator.join(name_args) diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py new file mode 100755 index 000000000..064864286 --- /dev/null +++ b/sklbench/utils/common.py @@ -0,0 +1,219 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import hashlib +import importlib +import inspect +import json +import re +import subprocess as sp +from pprint import pformat +from shutil import get_terminal_size +from typing import Any, Dict, List, Tuple, Union + +import numpy as np + +from .custom_types import JsonTypesUnion, ModuleContentMap, Numeric + +# ANSI escape codes for in-terminal formatting +BCOLORS = { + "FAIL": "\033[91m", + "OKGREEN": "\033[92m", + "WARNING": "\033[93m", + "OKBLUE": "\033[94m", + "HEADER": "\033[95m", + "OKCYAN": "\033[96m", + "ENDC": "\033[0m", + "BOLD": "\033[1m", + "UNDERLINE": "\033[4m", +} + + +def custom_format( + input_obj: Any, + bcolor: Union[str, None] = None, + prettify: bool = True, + width: int = get_terminal_size().columns, + indent: int = 4, +) -> str: + """Pretty format with terminal highlighting""" + output = input_obj.copy() if hasattr(input_obj, "copy") else input_obj + if prettify: + output = pformat(input_obj, width=width, indent=indent) + if bcolor is not None: + output = BCOLORS[bcolor] + str(input_obj) + BCOLORS["ENDC"] + return output + + +def read_output_from_command(command: str) -> Tuple[int, str, str]: + """Executes command and returns code, stdout and stderr""" + res = sp.run( + command.split(" "), + stdout=sp.PIPE, + stderr=sp.PIPE, + encoding="utf-8", + ) + return res.returncode, res.stdout[:-1], res.stderr[:-1] + + +def hash_from_json_repr(x: JsonTypesUnion, hash_limit: int = 5) -> str: + h = hashlib.sha256() + h.update(bytes(json.dumps(x), encoding="utf-8")) + return h.hexdigest()[:hash_limit] + + +def ensure_list_types_homogeneity(input_list: List): + list_types = set([type(el) for el in input_list]) + if len(list_types) != 1: + raise ValueError("List is not type homogeneous. " f"Existing types: {list_types}") + + +def flatten_dict( + input_dict: Dict[str, JsonTypesUnion], + key_separator: str = " ", + keys_to_remove: List = ["metrics"], +) -> Dict: + output_dict = dict() + # iteration with inner recursion + for key, value in input_dict.items(): + if isinstance(value, dict): + flat_inner_dict = flatten_dict(value) + for inner_key, inner_value in flat_inner_dict.items(): + new_key = ( + key + key_separator + inner_key + if key not in keys_to_remove + else inner_key + ) + output_dict[new_key] = inner_value + else: + # keys to remove are not applied for lowest level keys + output_dict[key] = value + return output_dict + + +def flatten_list(input_list: List, ensure_type_homogeneity: bool = False) -> List: + output_list = list() + # iteration with inner recursion + for value in input_list: + if isinstance(value, list): + inner_flat_list = flatten_list(value) + for inner_value in inner_flat_list: + output_list.append(inner_value) + else: + output_list.append(value) + if ensure_type_homogeneity: + ensure_list_types_homogeneity(output_list) + return output_list + + +def get_module_members( + module_names_chain: Union[List, str] +) -> Tuple[ModuleContentMap, ModuleContentMap]: + def get_module_name(module_names_chain: List[str]) -> str: + name = module_names_chain[0] + for subname in module_names_chain[1:]: + name += "." 
+ subname + return name + + def merge_maps( + first_map: ModuleContentMap, second_map: ModuleContentMap + ) -> ModuleContentMap: + output = dict() + all_keys = set(first_map.keys()) | set(second_map.keys()) + for key in all_keys: + if key in first_map and key in second_map: + output[key] = first_map[key] + second_map[key] + elif key in first_map: + output[key] = first_map[key] + elif key in second_map: + output[key] = second_map[key] + return output + + if isinstance(module_names_chain, str): + module_names_chain = [module_names_chain] + module_name = get_module_name(module_names_chain) + classes_map: ModuleContentMap = dict() + functions_map: ModuleContentMap = dict() + + try: + module = importlib.__import__(module_name, globals(), locals(), [], 0) + for subname in module_names_chain[1:]: + module = getattr(module, subname) + except ModuleNotFoundError: + return dict(), dict() + + for name, obj in inspect.getmembers(module): + if inspect.isclass(obj): + if name in classes_map and obj not in classes_map[name]: + classes_map[name].append(obj) + else: + classes_map[name] = [obj] + elif inspect.isfunction(obj): + if name in functions_map and obj not in functions_map[name]: + functions_map[name].append(obj) + else: + functions_map[name] = [obj] + + if hasattr(module, "__all__"): + for name in module.__all__: + sub_classes_map, sub_functions_map = get_module_members( + module_names_chain + [name] + ) + classes_map = merge_maps(classes_map, sub_classes_map) + functions_map = merge_maps(functions_map, sub_functions_map) + + return classes_map, functions_map + + +def is_float(value: str) -> bool: + return ( + re.match( + r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$", value + ) + is not None + ) + + +def convert_to_numeric_if_possible(value: str) -> Union[Numeric, str]: + if value.isdigit(): + return int(value) + elif is_float(value): + return float(value) + else: + return value + + +def convert_to_numpy(a, dp_compat=False) -> np.ndarray: + if dp_compat and ("dpctl" in str(type(a)) or "dpnp" in str(type(a))): + return a + if isinstance(a, np.ndarray): + return a + elif hasattr(a, "to_numpy"): + return a.to_numpy() + elif hasattr(a, "asnumpy"): + return a.asnumpy() + elif "dpnp" in str(type(a)): + import dpnp + + return dpnp.asnumpy(a) + elif "dpctl" in str(type(a)): + import dpctl.tensor + + return dpctl.tensor.to_numpy(a) + elif "cupy.ndarray" in str(type(a)): + return a.get() + else: + raise ValueError("Unable to convert data to numpy.ndarray") diff --git a/sklbench/utils/config.py b/sklbench/utils/config.py new file mode 100644 index 000000000..11de647d5 --- /dev/null +++ b/sklbench/utils/config.py @@ -0,0 +1,325 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse +import json +import os +from copy import deepcopy +from typing import Dict, List, Union + +from .bench_case import get_bench_case_value, set_bench_case_value +from .common import ( + convert_to_numeric_if_possible, + custom_format, + flatten_list, + hash_from_json_repr, +) +from .custom_types import BenchCase, BenchTemplate +from .logger import logger +from .special_params import ( + assign_case_special_values_on_generation, + assign_template_special_values, + explain_range, +) + + +def find_configs(paths: Union[List[str], str, None]) -> List[str]: + result = list() + if paths is None: + return result + # iterate over list of paths + elif isinstance(paths, list): + for path in paths: + result += find_configs(path) + # check if path is *.json file + elif os.path.isfile(paths) and paths.endswith(".json"): + result.append(paths) + # iterate over directory content with recursion + elif os.path.isdir(paths): + for path in os.listdir(paths): + result += find_configs(os.path.join(paths, path)) + else: + logger.debug(f'Config path "{paths}" wasn\'t added') + return result + + +def merge_dicts(first: Dict, second: Dict) -> Dict: + # Function overwrites deep copy of first with second + # `deepcopy` is used to avoid accidental changes + # through reference to list or dict + result = deepcopy(first) + # iteration over items of second dict with inner recursion + for key, value in second.items(): + if key not in result: + result[key] = deepcopy(value) + else: + # `dict | dict` case - simple merge + if isinstance(result[key], dict) and isinstance(value, dict): + result[key] = merge_dicts(result[key], value) + elif isinstance(result[key], list) and isinstance(value, dict): + result[key] = [merge_dicts(el, deepcopy(value)) for el in result[key]] + elif isinstance(result[key], dict) and isinstance(value, list): + result[key] = [merge_dicts(result[key], deepcopy(el)) for el in value] + elif isinstance(result[key], list) and isinstance(value, list): + local_result = [] + for element_in_first in result[key]: + for element_in_second in value: + local_result.append( + merge_dicts(element_in_first, element_in_second) + ) + result[key] = local_result + else: + result[key] = deepcopy(value) + return result + + +def parse_config_file(config_path: str) -> List[Dict]: + with open(config_path, "r") as config_file: + config_content = json.load(config_file) + templates = list() + if "TEMPLATES" not in config_content: + raise ValueError(f"{config_path} doesn't contain templates") + if "INCLUDE" in config_content: + config_dir = os.path.dirname(config_path) + include_content = dict() + for include_config in config_content["INCLUDE"]: + include_path = os.path.join(config_dir, include_config) + if os.path.isfile(include_path): + with open(include_path, "r") as include_file: + include_content.update(json.load(include_file)["PARAMETERS_SETS"]) + else: + logger.warning(f"Include file '{include_path}' not found.") + include_content.update(config_content["PARAMETERS_SETS"]) + config_content["PARAMETERS_SETS"] = include_content + for template_name, template_content in config_content["TEMPLATES"].items(): + new_templates = [{}] + # 1st step: pop list of included param sets and add them to template + if "SETS" in template_content: + for param_set_name in template_content.pop("SETS"): + param_set = config_content["PARAMETERS_SETS"][param_set_name] + if isinstance(param_set, dict): + new_templates = [ + merge_dicts(tmpl, param_set) 
for tmpl in new_templates + ] + elif isinstance(param_set, list): + new_templates = flatten_list( + [ + [merge_dicts(tmpl, set_element) for set_element in param_set] + for tmpl in new_templates + ] + ) + # 2nd step: add other params for specific template + new_templates = [merge_dicts(tmpl, template_content) for tmpl in new_templates] + templates += new_templates + return templates + + +def parse_cli_parameters(params: List) -> BenchTemplate: + result = dict() + for param in params: + # parameter format: "key1:key2:key3=value1[,value2]" + param = param.split("=") + param_path, param_values = param[0].split(":"), param[1] + param_values = param_values.split(",") + # int/float/bool/None values are initially read as str + for i, value in enumerate(param_values): + if param_values[i] == "null": + param_values[i] = None + elif param_values[i] == "true": + param_values[i] = True + elif param_values[i] == "false": + param_values[i] = False + else: + param_values[i] = convert_to_numeric_if_possible(value) + if len(param_values) == 1: + param_values = param_values[0] + # deduce chain of param keys + param_dict = dict() + local_dict = param_dict + for key in param_path[:-1]: + local_dict[key] = dict() + local_dict = local_dict[key] + local_dict[param_path[-1]] = param_values + result = merge_dicts(result, param_dict) + + return result + + +def expand_ranges_in_template(template: BenchTemplate): + for key, value in template.items(): + # recursion for inner dict + if isinstance(value, dict): + expand_ranges_in_template(value) + # iteration over list values + elif isinstance(value, list): + for i, el in enumerate(value): + # list of dicts + if isinstance(el, dict): + expand_ranges_in_template(el) + # list of strs + elif isinstance(el, str) and el.startswith("[RANGE]"): + value[i] = explain_range(el) + # avoidance of nested lists + # (in bench_case where ranges and strs are mixed) + template[key] = flatten_list(value) + elif isinstance(value, str) and value.startswith("[RANGE]"): + template[key] = explain_range(value) + if len(template[key]) == 0: + raise ValueError("Range specification resulted in zero-length list") + + +def expand_template( + template: BenchTemplate, bench_cases: List[Dict], keys_chain: List[str] +) -> List[Dict]: + # deep copy to prevent modifying by reference + bench_cases = deepcopy(bench_cases) + # iterate over dict + if isinstance(template, dict): + for key, value in template.items(): + bench_cases = expand_template(value, bench_cases, keys_chain + [key]) + # iterate over list + elif isinstance(template, list): + new_bench_cases = list() + for i, value in enumerate(template): + new_bench_cases += expand_template(value, bench_cases, keys_chain) + bench_cases = new_bench_cases + # assign scalar value + else: + for bench_case in bench_cases: + set_bench_case_value(bench_case, keys_chain, template) + return bench_cases + + +def remove_duplicated_bench_cases(bench_cases: List[BenchCase]) -> List[BenchCase]: + hash_map = dict() + for bench_case in bench_cases: + hash_map[hash_from_json_repr(bench_case)] = bench_case + return list(hash_map.values()) + + +def bench_case_filter(bench_case: BenchCase, filters: List[BenchCase]): + # filtering is implemented by comparison of + # benchmark case and case merged with filters: + # filtering is passed if one of merged cases has same hash as original + original_hash = hash_from_json_repr(bench_case) + filtered_hashes = [ + hash_from_json_repr(merge_dicts(bench_case, bench_filter)) + for bench_filter in filters + ] + return original_hash in 
filtered_hashes or len(filtered_hashes) == 0 + + +def early_filtering( + bench_cases: List[BenchCase], filters: List[BenchCase] +) -> List[BenchCase]: + def get_early_filter(original_filter): + static_params = [ + "data", + "algorithm:library", + "algorithm:estimator", + "algorithm:function", + "algorithm:device", + ] + early_filter = dict() + for static_param in static_params: + early_value = get_bench_case_value(original_filter, static_param) + if early_value is not None: + set_bench_case_value(early_filter, static_param, early_value) + return early_filter + + static_param_filters = list(map(get_early_filter, filters)) + filtered_bench_cases = list( + filter(lambda x: bench_case_filter(x, static_param_filters), bench_cases) + ) + if len(bench_cases) != len(filtered_bench_cases): + logger.info( + "Early filtering reduced number of cases from " + f"{len(bench_cases)} to {len(filtered_bench_cases)}." + ) + return filtered_bench_cases + + +def generate_bench_filters(raw_filters: List) -> List[BenchCase]: + # filters are implemented as benchmark cases + # containing only filter values + filters_template = parse_cli_parameters(raw_filters) + filters_template = assign_template_special_values(filters_template) + expand_ranges_in_template(filters_template) + filters = expand_template(filters_template, [{}], []) + filters = remove_duplicated_bench_cases(filters) + filters = list(map(assign_case_special_values_on_generation, filters)) + logger.debug(f"Loaded filters:\n{custom_format(filters)}") + return filters + + +def generate_bench_cases(args: argparse.Namespace) -> List[BenchCase]: + # find config files from paths specified in args + config_files = find_configs(args.config) + + # config files or global parameters should be defined for run + if len(config_files) == 0: + if args.parameters == "": + raise ValueError("Unable to find any configs") + else: + logger.info("Using CLI parameters as template") + else: + logger.info(f"Number of found config files: {len(config_files)}") + logger.debug(f"Found config files:\n{custom_format(config_files)}") + + # parse config files to get bench_case templates + # (without expanded paramaters from lists and ranges) + bench_case_templates = list() + for config_file in config_files: + bench_case_templates += parse_config_file(config_file) + + # overwrite templates by globally defined parameters or use them as template + global_parameters = parse_cli_parameters(args.parameters) + if len(global_parameters) > 0: + logger.info(f"Global parameters:\n{custom_format(global_parameters)}") + else: + logger.debug("Global parameters are empty") + if len(bench_case_templates) == 0: + bench_case_templates = [global_parameters] + else: + bench_case_templates = [ + merge_dicts(tmpl, global_parameters) for tmpl in bench_case_templates + ] + logger.info(f"Number of loaded templates: {len(bench_case_templates)}") + logger.debug(f"Loaded templates:\n{custom_format(bench_case_templates)}") + + # assign special values in templates + bench_case_templates = list(map(assign_template_special_values, bench_case_templates)) + + # extract values from lists and ranges defined in templates + for tmpl in bench_case_templates: + expand_ranges_in_template(tmpl) + + all_bench_cases = list() + # find non-duplicated bench_cases from templates + for tmpl in bench_case_templates: + all_bench_cases += expand_template(tmpl, [{}], []) + logger.debug( + f"Number of loaded cases before removal of duplicates: {len(all_bench_cases)}" + ) + all_bench_cases = remove_duplicated_bench_cases(all_bench_cases) + 
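+ # Note: at this point every template has been expanded into concrete benchmark
+ # cases (one case per combination of list/range parameter values) and exact
+ # duplicates have been removed via their JSON-representation hash. Illustrative
+ # example (assumed values): a template containing
+ # {"data": {"dtype": ["float32", "float64"]}} produces two cases differing only in dtype.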
+ # assign special values in bench_cases + all_bench_cases = list(map(assign_case_special_values_on_generation, all_bench_cases)) + + logger.info(f"Number of loaded cases: {len(all_bench_cases)}") + logger.debug(f"Loaded cases:\n{custom_format(all_bench_cases)}") + + return all_bench_cases diff --git a/sklbench/utils/custom_types.py b/sklbench/utils/custom_types.py new file mode 100644 index 000000000..e30e7de73 --- /dev/null +++ b/sklbench/utils/custom_types.py @@ -0,0 +1,34 @@ +# =============================================================================== +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from typing import Any, Dict, List, Union + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix + +NumpyNumeric = Union[np.unsignedinteger, np.integer, np.floating] +Numeric = Union[int, float] +Scalar = Union[Numeric, bool, str, None] +JsonTypesUnion = Union[Scalar, List, Dict] +# TODO: replace Any with Union[Callable, ...] +ModuleContentMap = Dict[str, List[Any]] +# template may contain lists on first level +BenchTemplate = Dict[str, Any] +# case is expected to be nested dict +BenchCase = Dict[str, Dict[str, Any]] + +Array = Union[pd.DataFrame, np.ndarray, csr_matrix] diff --git a/sklbench/utils/env.py b/sklbench/utils/env.py new file mode 100644 index 000000000..73b6d45e5 --- /dev/null +++ b/sklbench/utils/env.py @@ -0,0 +1,148 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import json +from typing import Dict + +import pandas as pd + +from .common import read_output_from_command +from .logger import logger + + +def get_numa_cpus_conf() -> Dict[int, str]: + try: + _, lscpu_text, _ = read_output_from_command("lscpu") + return { + i: numa_cpus + for i, numa_cpus in enumerate( + map( + lambda x: x.split(" ")[-1], + filter( + lambda line: "NUMA" in line and "CPU(s)" in line, + lscpu_text.split("\n"), + ), + ) + ) + } + except FileNotFoundError: + logger.warning("Unable to get numa cpus configuration via lscpu") + return dict() + + +def get_software_info() -> Dict: + result = dict() + # conda list + try: + _, conda_list, _ = read_output_from_command("conda list --json") + conda_packages = json.loads(conda_list) + result["conda_packages"] = {pkg.pop("name"): pkg for pkg in conda_packages} + # pip list + except (FileNotFoundError, PermissionError, AttributeError): + logger.warning("Unable to get python packages list via conda") + try: + _, pip_list, _ = read_output_from_command("pip list --format json") + pip_packages = json.loads(pip_list) + result["pip_packages"] = {pkg.pop("name"): pkg for pkg in pip_packages} + except (FileNotFoundError, PermissionError, AttributeError): + logger.warning("Unable to get python packages list via pip") + return result + + +def get_oneapi_devices() -> pd.DataFrame: + try: + import dpctl + + devices = dpctl.get_devices() + devices = { + device.filter_string: { + "name": device.name, + "vendor": device.vendor, + "type": str(device.device_type).split(".")[1], + "driver version": device.driver_version, + "memory size[GB]": device.global_mem_size / 2**30, + } + for device in devices + } + if len(devices) > 0: + return pd.DataFrame(devices).T + else: + logger.warning("dpctl device table is empty") + except (ImportError, ModuleNotFoundError): + logger.warning("dpctl can not be imported") + # 'type' is left for device type selection only + return pd.DataFrame({"type": list()}) + + +def get_higher_isa(cpu_flags: str) -> str: + # TODO: add non-x86 sets + ordered_sets = ["avx512", "avx2", "avx", "sse4_2", "ssse3", "sse2"] + for isa in ordered_sets: + if isa in cpu_flags: + return isa + return "unknown" + + +def get_hardware_info() -> Dict: + result = dict() + oneapi_devices = get_oneapi_devices() + if len(oneapi_devices) > 0: + logger.info(f"DPCTL listed devices:\n{oneapi_devices}\n") + # CPU + try: + from cpuinfo import get_cpu_info + + cpu_info = get_cpu_info() + # remap cpu info values to better understandable names + fields_map = { + "arch": "architecture", + "brand_raw": "name", + "flags": "flags", + "count": "logical_cpus", + } + for key in list(cpu_info.keys()): + value = cpu_info.pop(key) + if key in fields_map.keys(): + cpu_info[fields_map[key]] = value + # squash CPU flags + cpu_info["flags"] = " ".join(cpu_info["flags"]) + result["CPU"] = cpu_info + logger.info(f'CPU name: {cpu_info["name"]}') + logger.info( + "Highest supported ISA: " f'{get_higher_isa(cpu_info["flags"]).upper()}' + ) + except (ImportError, ModuleNotFoundError): + logger.warning('Unable to parse CPU info with "cpuinfo" module') + # GPUs + result["GPU(s)"] = dict() + try: + oneapi_gpus = oneapi_devices[oneapi_devices["type"] == "gpu"] + result["GPU(s)"].update(oneapi_gpus.T.to_dict()) + except (ImportError, ModuleNotFoundError): + logger.warning('Unable to get devices with "dpctl" module') + # RAM size + try: + import psutil + + result["RAM size[GB]"] = 
psutil.virtual_memory().total / 2**30 + logger.info(f'RAM size[GB]: {round(result["RAM size[GB]"], 3)}') + except (ImportError, ModuleNotFoundError): + logger.warning('Unable to parse memory info with "psutil" module') + return result + + +def get_environment_info() -> Dict: + return {"hardware": get_hardware_info(), "software": get_software_info()} diff --git a/sklbench/utils/logger.py b/sklbench/utils/logger.py new file mode 100644 index 000000000..909406304 --- /dev/null +++ b/sklbench/utils/logger.py @@ -0,0 +1,25 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import logging + +logger = logging.Logger("sklbench") + +logging_channel = logging.StreamHandler() +logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") +logging_channel.setFormatter(logging_formatter) + +logger.addHandler(logging_channel) diff --git a/sklbench/utils/measurement.py b/sklbench/utils/measurement.py new file mode 100644 index 000000000..989daefd8 --- /dev/null +++ b/sklbench/utils/measurement.py @@ -0,0 +1,100 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import timeit + +import numpy as np + +from .bench_case import get_bench_case_value +from .custom_types import BenchCase +from .logger import logger + +try: + import itt + + itt_is_available = True +except (ImportError, ModuleNotFoundError): + itt_is_available = False + + +def box_filter(timing, left=0.2, right=0.8): + timing.sort() + size = len(timing) + if size == 1: + return timing[0] * 1000, 0 + lower, upper = timing[int(size * left)], timing[int(size * right)] + result = np.array([item for item in timing if lower < item < upper]) + return np.mean(result) * 1000, np.std(result) * 1000 + + +def measure_time( + func, + *args, + n_runs=20, + time_limit=60 * 60, + std_mean_ratio=0.2, + enable_itt=False, + **kwargs, +): + if enable_itt and not itt_is_available: + logger.warning( + "Intel(R) VTune(TM) profiling was requested " + 'but "itt" python module is not available.' 
+ ) + times = [] + func_return_value = None + while len(times) < n_runs: + if enable_itt and itt_is_available: + itt.resume() + t0 = timeit.default_timer() + func_return_value = func(*args, **kwargs) + t1 = timeit.default_timer() + if enable_itt and itt_is_available: + itt.pause() + times.append(t1 - t0) + if sum(times) > time_limit: + logger.warning( + f"'{func}' function measurement time " + f"({sum(times)} seconds from {len(times)} runs) " + f"exceeded time limit ({time_limit} seconds)" + ) + break + mean, std = box_filter(times) + if std / mean > std_mean_ratio: + logger.warning( + f'Measured "std / mean" time ratio of "{str(func)}" function is higher ' + f"than threshold ({round(std / mean, 3)} vs. {std_mean_ratio})" + ) + return mean, std, func_return_value + + +# wrapper to get measurement params from benchmarking case +def measure_case(case: BenchCase, func, *args, **kwargs): + distirbutor = get_bench_case_value(case, "bench:distributor") + if distirbutor == "mpi": + # sync all MPI processes + from mpi4py import MPI + + comm = MPI.COMM_WORLD + comm.Barrier() + return measure_time( + func, + *args, + **kwargs, + n_runs=get_bench_case_value(case, "bench:n_runs", 10), + time_limit=get_bench_case_value(case, "bench:time_limit", 3600), + enable_itt=get_bench_case_value(case, "bench:vtune_profiling") is not None, + ) diff --git a/sklbench/utils/special_params.py b/sklbench/utils/special_params.py new file mode 100644 index 000000000..491910234 --- /dev/null +++ b/sklbench/utils/special_params.py @@ -0,0 +1,284 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +from copy import deepcopy +from math import ceil +from typing import Dict, List + +import numpy as np +import pandas as pd +from psutil import cpu_count +from sklearn.metrics import euclidean_distances + +from ..datasets import dataset_loading_functions +from .bench_case import get_bench_case_value, set_bench_case_value +from .common import convert_to_numpy, flatten_list +from .custom_types import BenchCase, BenchTemplate +from .env import get_numa_cpus_conf +from .logger import logger + +SP_VALUE_STR = "[SPECIAL_VALUE]" + + +def is_special_value(value) -> bool: + return isinstance(value, str) and value.startswith(SP_VALUE_STR) + + +def explain_range(range_str: str) -> List: + def check_range_values_size(range_values: List[int], size: int): + if len(range_values) != size: + raise ValueError( + f"Range contains {len(range_values)} " f"numeric values instead of {size}" + ) + + range_values = range_str.replace("[RANGE]", "").split(":") + # TODO: add float values + range_type = range_values[0] + range_values = list(map(int, range_values[1:])) + # - add:start{int}:end{int}:step{int} - Arithmetic progression + # Sequence: start + step * i <= end + if range_type == "add": + check_range_values_size(range_values, 3) + start, end, step = range_values + return list(range(start, end + step, step)) + # - mul:current{int}:end{int}:step{int} - Geometric progression + # Sequence: current * step <= end + elif range_type == "mul": + check_range_values_size(range_values, 3) + current, end, step = range_values + result = list() + while current <= end: + result.append(current) + current *= step + return result + # - pow:base{int}:start{int}:end{int}[:step{int}] - Powers of base number + elif range_type == "pow": + # add default step = 1 if not defined + if len(range_values) < 4: + range_values.append(1) + check_range_values_size(range_values, 4) + base, start, end, step = range_values + return [base**i for i in range(start, end + step, step)] + else: + raise ValueError(f'Unknown "{range_type}" range type') + + +def assign_template_special_values(template: BenchTemplate) -> BenchTemplate: + # data:dataset special values + datasets = deepcopy(get_bench_case_value(template, "data:dataset")) + if datasets is not None: + if not isinstance(datasets, list): + datasets = [datasets] + # `all_named` is equal to all datasets known by data loaders + all_named_datasets = list(dataset_loading_functions.keys()) + for i, dataset in enumerate(datasets): + if is_special_value(dataset): + dataset = dataset.replace(SP_VALUE_STR, "") + if dataset == "all_named": + datasets[i] = all_named_datasets + datasets = flatten_list(datasets, ensure_type_homogeneity=True) + set_bench_case_value(template, "data:dataset", datasets) + + return template + + +def assign_case_special_values_on_generation(bench_case: BenchCase) -> BenchCase: + # sklearn.datasets.make_classification: n_informative as ratio of n_features + n_informative = get_bench_case_value( + bench_case, "data:generation_kwargs:n_informative" + ) + if is_special_value(n_informative): + n_informative = float(n_informative.replace(SP_VALUE_STR, "")) + if n_informative <= 0.0 or n_informative > 1.0: + raise ValueError(f'Wrong special value "{n_informative}" for n_informative') + n_features = get_bench_case_value(bench_case, "data:generation_kwargs:n_features") + if n_features is None: + raise ValueError( + '"n_features" is not specified for special value of "n_informative"' + ) + 
set_bench_case_value( + bench_case, + "data:generation_kwargs:n_informative", + ceil(n_informative * n_features), + ) + # taskset + taskset = get_bench_case_value(bench_case, "bench:taskset") + if is_special_value(taskset): + taskset = taskset.replace(SP_VALUE_STR, "") + # special value format for numa nodes: "numa:{numa_node_0}[|{numa_node_1}...]" + if taskset.startswith("numa"): + numa_nodes = list(map(int, taskset.split(":")[1].split("|"))) + numa_cpus_conf = get_numa_cpus_conf() + taskset = ",".join([numa_cpus_conf[numa_node] for numa_node in numa_nodes]) + set_bench_case_value(bench_case, "bench:taskset", taskset) + + # remove requested parameters from the case + def traverse_with_removal(case: BenchCase): + for key, value in list(case.items()): + if isinstance(value, dict): + traverse_with_removal(value) + elif isinstance(value, str) and value == "[REMOVE]": + del case[key] + + traverse_with_removal(bench_case) + + return bench_case + + +def get_ratio_from_n_jobs(n_jobs: str) -> float: + args = n_jobs.split(":") + if len(args) == 1: + return 1.0 + elif len(args) == 2: + return float(args[1]) + else: + raise ValueError(f'Wrong arguments {args} in "n_jobs" special value') + + +def assign_case_special_values_on_run( + bench_case: BenchCase, data, data_description: Dict +): + # Note: data = (x_train, y_train, x_test, y_test) + library = get_bench_case_value(bench_case, "algorithm:library", None) + estimator = get_bench_case_value(bench_case, "algorithm:estimator", None) + # device-related parameters assignment + device = get_bench_case_value(bench_case, "algorithm:device", "default") + if device != "default": + # xgboost tree method assignment branch + if library == "xgboost" and estimator in ["XGBRegressor", "XGBClassifier"]: + if device == "cpu" or any(map(device.startswith, ["gpu", "cuda"])): + logger.debug( + f"Forwarding device '{device}' to XGBoost estimator parameters" + ) + set_bench_case_value( + bench_case, "algorithm:estimator_params:device", device + ) + else: + raise ValueError(f"Unknown device '{device}' for xgboost {estimator}") + # set target offload for execution context + elif library.startswith("sklearnex") or library.startswith("daal4py"): + if device == "cpu": + logger.debug( + "Skipping setting of 'target_offload' for CPU device " + "to avoid extra overheads" + ) + else: + set_bench_case_value( + bench_case, "algorithm:sklearnex_context:target_offload", device + ) + # faiss GPU algorithm selection + elif library == "sklbench.emulators.faiss" and estimator == "NearestNeighbors": + set_bench_case_value(bench_case, "algorithm:estimator_params:device", device) + else: + logger.warning(f'Device specification "{device}" is not used for this case') + # assign "default" or changed device for output + tree_method = get_bench_case_value( + bench_case, "algorithm:estimator_params:tree_method", None + ) + if tree_method == "gpu_hist": + device = "gpu" + set_bench_case_value(bench_case, "algorithm:device", device) + # n_jobs + n_jobs = get_bench_case_value(bench_case, "algorithm:estimator_params:n_jobs", None) + if is_special_value(n_jobs): + n_jobs = n_jobs.replace(SP_VALUE_STR, "") + if n_jobs.startswith("physical_cpus"): + n_cpus = cpu_count(logical=False) + elif n_jobs.startswith("logical_cpus"): + n_cpus = cpu_count(logical=True) + else: + raise ValueError(f'Unknown special value {n_jobs} for "n_jobs"') + n_jobs = int(n_cpus * get_ratio_from_n_jobs(n_jobs)) + set_bench_case_value(bench_case, "algorithm:estimator_params:n_jobs", n_jobs) + # classes balance for XGBoost + 
scale_pos_weight = get_bench_case_value( + bench_case, "algorithm:estimator_params:scale_pos_weight", None + ) + if ( + is_special_value(scale_pos_weight) + and scale_pos_weight.replace(SP_VALUE_STR, "") == "auto" + and library == "xgboost" + and estimator == "XGBClassifier" + ): + y_train = convert_to_numpy(data[1]) + value_counts = pd.value_counts(y_train).sort_index() + if len(value_counts) != 2: + logger.info( + f"Number of classes ({len(value_counts)}) != 2 " + 'while "scale_pos_weight" is set to "auto". ' + "The parameter is removed from the estimator parameters." + ) + set_bench_case_value( + bench_case, "algorithm:estimator_params:scale_pos_weight", None + ) + else: + scale_pos_weight = value_counts.iloc[0] / value_counts.iloc[1] + set_bench_case_value( + bench_case, + "algorithm:estimator_params:scale_pos_weight", + scale_pos_weight, + ) + # "n_clusters" auto assignment from data description + n_clusters = get_bench_case_value( + bench_case, "algorithm:estimator_params:n_clusters", None + ) + if is_special_value(n_clusters) and n_clusters.replace(SP_VALUE_STR, "") == "auto": + n_clusters = data_description.get("n_clusters", None) + n_classes = data_description.get("n_classes", None) + n_clusters_per_class = data_description.get("n_clusters_per_class", 1) + if n_clusters is not None: + if isinstance(n_clusters, int): + set_bench_case_value( + bench_case, "algorithm:estimator_params:n_clusters", n_clusters + ) + else: + raise ValueError( + f"n_clusters={n_clusters} of type {type(n_clusters)} " + "from data description is not an integer." + ) + elif n_classes is not None: + set_bench_case_value( + bench_case, + "algorithm:estimator_params:n_clusters", + n_classes * n_clusters_per_class, + ) + else: + raise ValueError( + "Unable to auto-assign n_clusters: " + "data description doesn't have n_clusters or n_classes" + ) + # "eps" auto assignment for DBSCAN + eps = get_bench_case_value(bench_case, "algorithm:estimator_params:eps", None) + if is_special_value(eps) and eps.replace(SP_VALUE_STR, "").startswith( + "distances_quantile" + ): + x_train = convert_to_numpy(data[0]) + quantile = float(eps.replace(SP_VALUE_STR, "").split(":")[1]) + # a subsample of x_train is used to avoid hitting the memory limit for large matrices + subsample = list(getattr(x_train, "index", np.arange(x_train.shape[0]))) + np.random.seed(42) + np.random.shuffle(subsample) + subsample = subsample[: min(x_train.shape[0], 1000)] + x_sample = ( + x_train.loc[subsample] if hasattr(x_train, "loc") else x_train[subsample] + ) + # conversion to lower precision is required + # to produce the same distances quantile for different dtypes of x + x_sample = x_sample.astype("float32") + dist = np.tril(euclidean_distances(x_sample, x_sample)).reshape(-1) + dist = dist[dist != 0] + quantile = float(np.quantile(dist, quantile)) + set_bench_case_value(bench_case, "algorithm:estimator_params:eps", quantile) diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md deleted file mode 100644 index bde10c3f9..000000000 --- a/sklearn_bench/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# How to create conda environment for benchmarking - -If you want to test scikit-learn, then use - -```bash -pip install -r sklearn_bench/requirements.txt -# or -conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm -``` - -## Algorithms parameters - -You can launch benchmarks for each algorithm separately.
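The run-time special values handled by `special_params.py` above are requested from a benchmark config; the exact config schema is only implied by the colon-separated accessor paths, so the nested layout below is a hypothetical illustration rather than a documented format.

```python
# Hypothetical benchmark-case fragment using "[SPECIAL_VALUE]" markers
# resolved by the assign_case_special_values_* helpers above.
bench_case_fragment = {
    "algorithm": {
        "estimator": "XGBClassifier",
        "estimator_params": {
            # half of the physical cores: int(cpu_count(logical=False) * 0.5)
            "n_jobs": "[SPECIAL_VALUE]physical_cpus:0.5",
            # negative-to-positive class ratio computed from y_train
            "scale_pos_weight": "[SPECIAL_VALUE]auto",
        },
    },
    "bench": {
        # pin the run to the CPUs of NUMA node 0
        "taskset": "[SPECIAL_VALUE]numa:0",
    },
}
# Other recognized forms include n_clusters="[SPECIAL_VALUE]auto" (taken from the
# data description) and eps="[SPECIAL_VALUE]distances_quantile:0.01" for DBSCAN
# (a quantile of pairwise distances over a training subsample).
```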
The tables below list all supported parameters for each algorithm: - -- [General](#general) -- [DBSCAN](#dbscan) -- [RandomForestClassifier](#randomforestclassifier) -- [RandomForestRegressor](#randomforestregressor) -- [pairwise_distances](#pairwise_distances) -- [KMeans](#kmeans) -- [KNeighborsClassifier](#kneighborsclassifier) -- [LinearRegression](#linearregression) -- [LogisticRegression](#logisticregression) -- [PCA](#pca) -- [Ridge Regression](#ridge) -- [SVC](#svc) -- [TSNE](#tsne) -- [train_test_split](#train_test_split) - -### General - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -|num-threads|int|-1| The number of threads to use| -|arch|str|?|Achine architecture, for bookkeeping| -|batch|str|?|Batch ID, for bookkeeping| -|prefix|str|sklearn|Prefix string, for bookkeeping| -|header|action|False|Output CSV header| -|verbose|action|False|Output extra debug messages| -|data-format|str|numpy|Data formats: *numpy*, *pandas* or *cudf*| -|data-order|str|C|Data order: C (row-major, default) or F (column-major)| -|dtype|np.dtype|np.float64|Data type: *float64* (default) or *float32*| -|check-finiteness|action|False|Check finiteness in sklearn input check(disabled by default)| -|output-format|str|csv|Output format: *csv* (default) or *json*'| -|time-method|str|mean_min|*box_filter* or *mean_min*. Method used for time mesurements| -|box-filter-measurements|int|100|Maximum number of measurements in box filter| -|inner-loops|int|100|Maximum inner loop iterations. (we take the mean over inner iterations)| -|outer-loops|int|100|Maximum outer loop iterations. (we take the min over outer iterations)| -|time-limit|float|10|Target time to spend to benchmark| -|goal-outer-loops|int|10|The number of outer loops to aim while automatically picking number of inner loops. If zero, do not automatically decide number of inner loops| -|seed|int|12345|Seed to pass as random_state| -|dataset-name|str|None|Dataset name| - -### DBSCAN - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| epsilon | float | 10 | Radius of neighborhood of a point| -| min_samples | int | 5 | The minimum number of samples required in a 'neighborhood to consider a point a core point | - -### RandomForestClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | store_false | True | Don't control bootstraping | - -### RandomForestRegressor - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. 
The function to measure the quality of a split | -| num-trees | int | 100 | The number of trees in the forest | -| max-features | float_or_int | None | Upper bound on features used at each split | -| max-depth | int | None | Upper bound on depth of constructed trees | -| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | -| max-leaf-nodes | int | None | Maximum leaf nodes per tree | -| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | -| no-bootstrap | action | True | Don't control bootstraping | -| use-sklearn-class | action | | Force use of sklearn.ensemble.RandomForestClassifier | - -### pairwise_distances - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| metric | str | cosine | *cosine* or *correlation* Metric to test for pairwise distances | - -### KMeans - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| init | str | | Initial clusters | -| tol | float | 0 | Absolute threshold | -| maxiter | inte | 100 | Maximum number of iterations | -| n-clusters | int | | The number of clusters | - -### KNeighborsClassifier - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-neighbors | int | 5 | The number of neighbors to use | -| weights | str | uniform | Weight function used in prediction | -| method | str | brute | Algorithm used to compute the nearest neighbors | -| metric | str | euclidean | Distance metric to use | - -### LinearRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | - -### LogisticRegression - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept| -| multiclass | str | auto | *auto*, *ovr* or *multinomial*. How to treat multi class data| -| solver | str | lbfgs | *lbfgs*, *newton-cg* or *saga*. Solver to use| -| maxiter | int | 100 | Maximum iterations for the iterative solver | -| C | float | 1.0 | Regularization parameter | -| tol | float | None | Tolerance for solver | - -### PCA - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| svd-solver | str | daal | *daal*, *full*. SVD solver to use | -| n-components | int | None | The number of components to find | -| whiten | action | False | Perform whitening | - -### Ridge - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | -| solver | str | auto | Solver used for training | -| alpha | float | 1.0 | Regularization strength | - -### SVC - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| C | float | 0.01 | SVM slack parameter | -| kernel | str | linear | *linear*, *rbf*, or *poly*. SVM kernel function | -| gamma | float | None | Parameter for kernel="rbf" | -| max-cache-size | int | 64 | Maximum cache size for SVM. | -| tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| probability | action | True | Use probability for SVC | - -### TSNE - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-components | int | 2 | Dimension of the embedded space | -| early-exaggeration | float | 12.0 | This factor increases the attractive forces between points
and allows points to move around more freely finding their nearest neighbors more easily | -| learning-rate | float | 200.0 | The learning rate for t-SNE is usually in the range [10.0, 1000.0] | -| angle | float | 0.5 | Angular size. This is the trade-off between speed and accuracy | -| min-grad-norm | float | 1e-7 | If the gradient norm is below this threshold, the optimization is stopped | -| random-state | int | 1234 | Determines the random number generator | - -### train_test_split - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| train-size | float | 0.75 | Size of training subset | -| test-size | float | 0.25 | Size of testing subset | -| do-not-shuffle | action | False | Do not perform data shuffle before splitting | -| include-y | action | False | Include label (Y) in splitting | -| rng | str | None | *MT19937*, *SFMT19937*, *MT2203*, *R250*, *WH*, *MCG31*, *MCG59*, *MRG32K3A*, *PHILOX4X32X10*, *NONDETERM* or None. Random numbers generator for shuffling.(only for IDP scikit-learn)| diff --git a/sklearn_bench/__init__.py b/sklearn_bench/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py deleted file mode 100644 index 94a55bafa..000000000 --- a/sklearn_bench/dbscan.py +++ /dev/null @@ -1,58 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.cluster import DBSCAN - - # Load generated data - X, _, _, _ = bench.load_data(params, add_dtype=True) - - # Create our clustering object - dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, - min_samples=params.min_samples, metric='euclidean', - algorithm='auto') - - # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL) - # brute force method when running daal4py-patched scikit-learn, and probably - # 'kdtree' when running unpatched scikit-learn. 
- - # Time fit - time, _ = bench.measure_function_time(dbscan.fit, X, params=params) - labels = dbscan.labels_ - - params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - acc = bench.davies_bouldin_score(X, labels) - - bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], - metrics=[acc], metric_type='davies_bouldin_score', - data=[X], alg_instance=dbscan) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') - parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') - parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py deleted file mode 100644 index 5d35ef02d..000000000 --- a/sklearn_bench/df_clsf.py +++ /dev/null @@ -1,98 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.ensemble import RandomForestClassifier - - # Load and convert data - X_train, X_test, y_train, y_test = bench.load_data(params) - - # Create our random forest classifier - clf = RandomForestClassifier(criterion=params.criterion, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaf_nodes=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, - random_state=params.seed, - n_jobs=params.n_jobs) - - params.n_classes = len(np.unique(y_train)) - - fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) - y_pred = clf.predict(X_train) - y_proba = clf.predict_proba(X_train) - train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba) - - predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) - y_proba = clf.predict_proba(X_test) - test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba) - - bench.print_output( - library='sklearn', - algorithm='df_clsf', - stages=['training', 'prediction'], - params=params, - functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], - metric_type=['accuracy', 'log_loss', 'roc_auc'], - metrics=[ - [train_acc, test_acc], - [train_log_loss, test_log_loss], - [train_roc_auc, test_roc_auc], - ], - data=[X_train, X_test], - alg_instance=clf, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn random forest ' - 'classification benchmark') - - parser.add_argument('--criterion', type=str, default='gini', - choices=('gini', 'entropy'), - help='The function to measure the quality of a split') - parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') - parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None, - help='Upper bound on features used at each split') - parser.add_argument('--max-depth', type=int, default=None, - help='Upper bound on depth of constructed trees') - parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') - parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Maximum leaf nodes per tree') - parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') - parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") - - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py deleted file mode 100644 index baa5bb475..000000000 --- a/sklearn_bench/df_regr.py +++ /dev/null @@ -1,90 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import bench - - -def main(): - from sklearn.ensemble import RandomForestRegressor - - # Load and convert data - X_train, X_test, y_train, y_test = bench.load_data(params) - y_train = y_train.values.ravel() - y_test = y_test.values.ravel() - - # Create our random forest regressor - regr = RandomForestRegressor(criterion=params.criterion, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaf_nodes=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, - random_state=params.seed, - n_jobs=params.n_jobs) - - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - - y_pred = regr.predict(X_train) - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - - predict_time, y_pred = bench.measure_function_time( - regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(y_test, y_pred) - test_r2 = bench.r2_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='df_regr', - stages=['training', 'prediction'], - params=params, - functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], - data=[X_train, X_test], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn random forest ' - 'regression benchmark') - - parser.add_argument('--criterion', type=str, default='squared_error', - choices=('squared_error', 'absolute_error'), - help='The function to measure the quality of a split') - parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') - parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None, - help='Upper bound on features used at each split') - parser.add_argument('--max-depth', type=int, default=None, - help='Upper bound on depth of constructed trees') - parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') - parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Grow trees with max_leaf_nodes in best-first fashion' - 'if it is not None') - parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') - parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") - - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py deleted file mode 100644 index c708513d1..000000000 --- a/sklearn_bench/distances.py +++ /dev/null @@ -1,44 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the 
Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.metrics.pairwise import pairwise_distances - - # Load data - X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) - - time, _ = bench.measure_function_time(pairwise_distances, X, metric=params.metric, - n_jobs=params.n_jobs, params=params) - - bench.print_output(library='sklearn', algorithm='distances', stages=['computation'], - params=params, functions=[params.metric.capitalize()], - times=[time], metric_type=None, metrics=[None], data=[X], - alg_params={'metric': params.metric}) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' - 'benchmark') - parser.add_argument('--metric', default='cosine', - choices=['cosine', 'correlation'], - help='Metric to test for pairwise distances') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py deleted file mode 100755 index 3467e0dda..000000000 --- a/sklearn_bench/elasticnet.py +++ /dev/null @@ -1,78 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.linear_model import ElasticNet - - # Load data - X_train, X_test, y_train, y_test = bench.load_data(params) - - # Create our regression object - regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, - alpha=params.alpha, tol=params.tol, - max_iter=params.maxiter) - # Time fit - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - - # Time predict - predict_time, y_pred = bench.measure_function_time(regr.predict, - X_train, params=params) - - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - y_pred = regr.predict(X_test) - test_rmse = bench.rmse_score(y_test, y_pred) - test_r2 = bench.r2_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='elasticnet', - stages=['training', 'prediction'], - params=params, - functions=['ElasticNet.fit', 'ElasticNet.predict'], - times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score', 'iter'], - metrics=[ - [train_rmse, test_rmse], - [train_r2, test_r2], - [int(regr.n_iter_), int(regr.n_iter_)], - ], - data=[X_train, X_train], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' - 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") - parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') - parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') - parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5, - help='Regularization parameter') - parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py deleted file mode 100644 index b522a0e92..000000000 --- a/sklearn_bench/kmeans.py +++ /dev/null @@ -1,104 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse -from typing import Any - -import bench -import numpy as np - - -def main(): - from sklearn.cluster import KMeans - from sklearn.metrics.cluster import davies_bouldin_score - - # Load and convert generated data - X_train, X_test, _, _ = bench.load_data(params) - - X_init: Any - if params.filei == 'k-means++': - X_init = 'k-means++' - # Load initial centroids from specified path - elif params.filei is not None: - X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} - if isinstance(X_init, np.ndarray): - params.n_clusters = X_init.shape[0] - # or choose random centroids from training data - else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(low=0, high=X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].values - else: - X_init = X_train[centroids_idx] - - def fit_kmeans(X, X_init): - alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=params.n_init, - algorithm=params.algorithm, random_state=params.random_state) - alg.fit(X) - return alg - - # Time fit - fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, - X_init, params=params) - - train_predict = kmeans.predict(X_train) - acc_train = davies_bouldin_score(X_train, train_predict) - - # Time predict - predict_time, test_predict = bench.measure_function_time( - kmeans.predict, X_test, params=params) - - acc_test = davies_bouldin_score(X_test, test_predict) - - bench.print_output( - library='sklearn', - algorithm='kmeans', - stages=['training', 'prediction'], - params=params, - functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], - metric_type=['davies_bouldin_score', 'inertia', 'iter'], - metrics=[ - [acc_train, acc_test], - [kmeans.inertia_, kmeans.inertia_], - [kmeans.n_iter_, kmeans.n_iter_] - ], - data=[X_train, X_test], - alg_instance=kmeans, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') - parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') - parser.add_argument('-t', '--tol', type=float, default=0., - help='Absolute threshold') - parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') - parser.add_argument('--n-clusters', type=int, help='Number of clusters') - parser.add_argument('--algorithm', type=str, default='full', - help='K-means algorithm to use') - parser.add_argument('--n_init', type=int, default=1, - help='Number of time the k-means algorithm ' - 'will be run with different centroid seeds') - parser.add_argument('--random_state', type=int, default=777, - help='Random state') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py deleted file mode 100755 index f58be1650..000000000 --- a/sklearn_bench/knn_clsf.py +++ /dev/null @@ -1,108 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.neighbors import KNeighborsClassifier - - # Load generated data - X_train, X_test, y_train, y_test = bench.load_data(params) - params.n_classes = len(np.unique(y_train)) - - # Create classification object - knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors, - weights=params.weights, - algorithm=params.method, - metric=params.metric, - n_jobs=params.n_jobs) - - # Measure time and accuracy on fitting - train_time, _ = bench.measure_function_time( - knn_clsf.fit, X_train, y_train, params=params) - if params.task == 'classification': - y_pred = knn_clsf.predict(X_train) - y_proba = knn_clsf.predict_proba(X_train) - train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba) - - # Measure time and accuracy on prediction - if params.task == 'classification': - predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, - params=params) - y_proba = knn_clsf.predict_proba(X_test) - test_acc = bench.accuracy_score(y_test, yp) - test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba) - else: - predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, - params=params) - - if params.task == 'classification': - bench.print_output( - library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_clsf', - stages=['training', 'prediction'], - params=params, - functions=['knn_clsf.fit', 'knn_clsf.predict'], - times=[train_time, predict_time], - metric_type=['accuracy', 'log_loss', 'roc_auc'], - metrics=[ - [train_acc, test_acc], - [train_log_loss, test_log_loss], - [train_roc_auc, test_roc_auc], - ], - data=[X_train, X_test], - alg_instance=knn_clsf, - ) - else: - bench.print_output( - library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_search', - stages=['training', 'search'], - params=params, - functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], - times=[train_time, predict_time], - metric_type=None, - metrics=[], - data=[X_train, X_test], - alg_instance=knn_clsf, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='scikit-learn kNN classifier benchmark') - - parser.add_argument('--task', default='classification', type=str, - choices=('search', 'classification'), - help='kNN task: search or classification') - parser.add_argument('--n-neighbors', default=5, type=int, - help='Number of neighbors to use') - parser.add_argument('--weights', type=str, default='uniform', - help='Weight function used in prediction') - parser.add_argument('--method', type=str, default='brute', - choices=('brute', 'kd_tree', 'ball_tree', 'auto'), - help='Algorithm used to compute the nearest neighbors') - parser.add_argument('--metric', type=str, default='euclidean', - help='Distance metric to use') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/knn_regr.py 
b/sklearn_bench/knn_regr.py deleted file mode 100644 index c2048e3f4..000000000 --- a/sklearn_bench/knn_regr.py +++ /dev/null @@ -1,100 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.neighbors import KNeighborsRegressor - - # Load generated data - X_train, X_test, y_train, y_test = bench.load_data(params) - params.n_classes = len(np.unique(y_train)) - - # Create a regression object - knn_regr = KNeighborsRegressor(n_neighbors=params.n_neighbors, - weights=params.weights, - algorithm=params.method, - metric=params.metric, - n_jobs=params.n_jobs) - - # Measure time and accuracy on fitting - train_time, _ = bench.measure_function_time( - knn_regr.fit, X_train, y_train, params=params) - if params.task == 'regression': - y_pred = knn_regr.predict(X_train) - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - - # Measure time and accuracy on prediction - if params.task == 'regression': - predict_time, yp = bench.measure_function_time(knn_regr.predict, X_test, - params=params) - test_rmse = bench.rmse_score(y_test, yp) - test_r2 = bench.r2_score(y_test, yp) - else: - predict_time, _ = bench.measure_function_time(knn_regr.kneighbors, X_test, - params=params) - - if params.task == 'regression': - bench.print_output( - library='sklearn', - algorithm=knn_regr._fit_method + '_knn_regr', - stages=['training', 'prediction'], - params=params, - functions=['knn_regr.fit', 'knn_regr.predict'], - times=[train_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], - data=[X_train, X_test], - alg_instance=knn_regr, - ) - else: - bench.print_output( - library='sklearn', - algorithm=knn_regr._fit_method + '_knn_search', - stages=['training', 'search'], - params=params, - functions=['knn_regr.fit', 'knn_regr.kneighbors'], - times=[train_time, predict_time], - metric_type=None, - metrics=[], - data=[X_train, X_test], - alg_instance=knn_regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='scikit-learn kNN classifier benchmark') - - parser.add_argument('--task', default='regression', type=str, - choices=('search', 'regression'), - help='The type of kNN task: search or regression') - parser.add_argument('--n-neighbors', default=5, type=int, - help='The number of neighbors to use') - parser.add_argument('--weights', type=str, default='uniform', - help='The weight function to be used in prediction') - parser.add_argument('--method', type=str, default='brute', - choices=('brute', 'kd_tree', 'ball_tree', 'auto'), - help='The method to find the nearest neighbors') - parser.add_argument('--metric', type=str, default='euclidean', - help='The metric to calculate distances') - params = 
bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py deleted file mode 100755 index c167bc359..000000000 --- a/sklearn_bench/lasso.py +++ /dev/null @@ -1,76 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.linear_model import Lasso - - # Load data - X_train, X_test, y_train, y_test = bench.load_data(params) - - # Create our regression object - regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, - tol=params.tol, max_iter=params.maxiter) - - # Time fit - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - - # Time predict - predict_time, yp = bench.measure_function_time( - regr.predict, X_train, params=params) - - train_rmse = bench.rmse_score(y_train, yp) - train_r2 = bench.r2_score(y_train, yp) - yp = regr.predict(X_test) - test_rmse = bench.rmse_score(y_test, yp) - test_r2 = bench.r2_score(y_test, yp) - - bench.print_output( - library='sklearn', - algorithm='lasso', - stages=['training', 'prediction'], - params=params, - functions=['Lasso.fit', 'Lasso.predict'], - times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score', 'iter'], - metrics=[ - [train_rmse, test_rmse], - [train_r2, test_r2], - [int(regr.n_iter_), int(regr.n_iter_)], - ], - data=[X_train, X_test], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' - 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") - parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') - parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') - parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py deleted file mode 100644 index 7da0dba45..000000000 --- a/sklearn_bench/linear.py +++ /dev/null @@ -1,63 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import bench - - -def main(): - from sklearn.linear_model import LinearRegression - - # Load data - X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) - - # Create our regression object - regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs) - - # Time fit - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - - # Time predict - predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - - test_rmse = bench.rmse_score(y_test, yp) - test_r2 = bench.r2_score(y_test, yp) - yp = regr.predict(X_train) - train_rmse = bench.rmse_score(y_train, yp) - train_r2 = bench.r2_score(y_train, yp) - - bench.print_output( - library='sklearn', algorithm='lin_reg', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], - data=[X_train, X_test], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py deleted file mode 100644 index 733ee5765..000000000 --- a/sklearn_bench/log_reg.py +++ /dev/null @@ -1,100 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.linear_model import LogisticRegression - - # Load generated data - X_train, X_test, y_train, y_test = bench.load_data(params) - - params.n_classes = len(np.unique(y_train)) - - if params.multiclass == 'auto': - params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' - - if not params.tol: - params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 - - # Create our classifier object - clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, - fit_intercept=params.fit_intercept, - verbose=params.verbose, - tol=params.tol, max_iter=params.maxiter, - solver=params.solver, multi_class=params.multiclass) - # Time fit and predict - fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) - - y_pred = clf.predict(X_train) - y_proba = clf.predict_proba(X_train) - train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba) - - predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) - y_proba = clf.predict_proba(X_test) - test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba) - - bench.print_output( - library='sklearn', - algorithm='log_reg', - stages=['training', 'prediction'], - params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], - metric_type=['accuracy', 'log_loss', 'roc_auc'], - metrics=[ - [train_acc, test_acc], - [train_log_loss, test_log_loss], - [train_roc_auc, test_roc_auc], - ], - data=[X_train, X_test], - alg_instance=clf, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn logistic ' - 'regression benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', - action='store_false', default=True, - help="Don't fit intercept") - parser.add_argument('--multiclass', default='auto', - choices=('auto', 'ovr', 'multinomial'), - help='How to treat multi class data. ' - '"auto" picks "ovr" for binary classification, and ' - '"multinomial" otherwise.') - parser.add_argument('--solver', default='lbfgs', - choices=('lbfgs', 'newton-cg', 'saga'), - help='Solver to use.') - parser.add_argument('--maxiter', type=int, default=100, - help='Maximum iterations for the iterative solver') - parser.add_argument('-C', dest='C', type=float, default=1.0, - help='Regularization parameter') - parser.add_argument('--tol', type=float, default=None, - help='Tolerance for solver. If solver == "newton-cg", ' - 'then the default is 1e-3. Otherwise, the default ' - 'is 1e-10.') - params = bench.parse_args(parser, loop_types=('fit', 'predict')) - bench.run_with_context(params, main) diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py deleted file mode 100644 index d3e6eeece..000000000 --- a/sklearn_bench/nusvc.py +++ /dev/null @@ -1,106 +0,0 @@ -# =============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.svm import NuSVC - - X_train, X_test, y_train, y_test = bench.load_data(params) - y_train = np.asfortranarray(y_train).ravel() - - if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - - cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) - params.cache_size_mb = cache_size_bytes / 1024**2 - params.n_classes = len(np.unique(y_train)) - - clf = NuSVC(nu=params.nu, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, probability=params.probability, - random_state=43, degree=params.degree) - - fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) - params.sv_len = clf.support_.shape[0] - - if params.probability: - state_predict = 'predict_proba' - clf_predict = clf.predict_proba - y_proba_train = clf_predict(X_train) - y_proba_test = clf_predict(X_test) - train_log_loss = bench.log_loss(y_train, y_proba_train) - test_log_loss = bench.log_loss(y_test, y_proba_test) - train_roc_auc = bench.roc_auc_score(y_train, y_proba_train) - test_roc_auc = bench.roc_auc_score(y_test, y_proba_test) - else: - state_predict = 'prediction' - clf_predict = clf.predict - train_log_loss = None - test_log_loss = None - train_roc_auc = None - test_roc_auc = None - - predict_train_time, y_pred = bench.measure_function_time( - clf_predict, X_train, params=params) - train_acc = bench.accuracy_score(y_train, y_pred) - - _, y_pred = bench.measure_function_time( - clf_predict, X_test, params=params) - test_acc = bench.accuracy_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='nuSVC', - stages=['training', state_predict], - params=params, functions=['NuSVC.fit', f'NuSVC.{state_predict}'], - times=[fit_time, predict_train_time], - metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'], - metrics=[ - [train_acc, test_acc], - [train_log_loss, test_log_loss], - [train_roc_auc, test_roc_auc], - [int(clf.n_support_.sum()), int(clf.n_support_.sum())], - ], - data=[X_train, X_train], - alg_instance=clf, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn NuSVC benchmark') - - parser.add_argument('--nu', dest='nu', type=float, default=.5, - help='Nu in the nu-SVC model (0 < nu <= 1)') - parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='NuSVC kernel function') - parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') - parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') - parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for NuSVC.') - parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.NuSVC') - parser.add_argument('--probability', action='store_true', default=False, - dest='probability', help="Use probability for 
NuSVC") - - params = bench.parse_args(parser, loop_types=('fit', 'predict')) - bench.run_with_context(params, main) diff --git a/sklearn_bench/nusvr.py b/sklearn_bench/nusvr.py deleted file mode 100644 index ccfe519ba..000000000 --- a/sklearn_bench/nusvr.py +++ /dev/null @@ -1,91 +0,0 @@ -# =============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.svm import NuSVR - - X_train, X_test, y_train, y_test = bench.load_data(params) - y_train = np.asfortranarray(y_train).ravel() - - if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - - cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) - params.cache_size_mb = cache_size_bytes / 1024**2 - params.n_classes = len(np.unique(y_train)) - - regr = NuSVR(C=params.C, nu=params.nu, kernel=params.kernel, - cache_size=params.cache_size_mb, tol=params.tol, gamma=params.gamma, - degree=params.degree) - - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - params.sv_len = regr.support_.shape[0] - - predict_train_time, y_pred = bench.measure_function_time( - regr.predict, X_train, params=params) - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - - _, y_pred = bench.measure_function_time( - regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(y_test, y_pred) - test_r2 = bench.r2_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='nuSVR', - stages=['training', 'prediction'], - params=params, - functions=['NuSVR.fit', 'NuSVR.predict'], - times=[fit_time, predict_train_time], - metric_type=['rmse', 'r2_score', 'n_sv'], - metrics=[ - [train_rmse, test_rmse], - [train_r2, test_r2], - [int(regr.n_support_.sum()), int(regr.n_support_.sum())], - ], - data=[X_train, X_train], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn NuSVR benchmark') - - parser.add_argument('-C', dest='C', type=float, default=1., - help='NuSVR regularization parameter') - parser.add_argument('--nu', dest='nu', type=float, default=.5, - help='Nu in the nu-SVC model (0 < nu <= 1)') - parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='NuSVR kernel function') - parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') - parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') - parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for NuSVR.') - parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.NuSVR') - - params = bench.parse_args(parser, 
loop_types=('fit', 'predict')) - bench.run_with_context(params, main) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py deleted file mode 100644 index 31d7bffc2..000000000 --- a/sklearn_bench/pca.py +++ /dev/null @@ -1,66 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.decomposition import PCA - - # Load random data - X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) - - if params.n_components is None: - p, n = X_train.shape - params.n_components = min((n, (2 + min((n, p))) // 3)) - - # Create our PCA object - pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten, - n_components=params.n_components) - - # Time fit - fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params) - - # Time transform - transform_time, _ = bench.measure_function_time( - pca.transform, X_train, params=params) - - bench.print_output( - library='sklearn', - algorithm='PCA', - stages=['training', 'transformation'], - params=params, - functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], - metric_type='noise_variance', - metrics=[pca.noise_variance_, pca.noise_variance_], - data=[X_train, X_test], - alg_instance=pca, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') - parser.add_argument('--svd-solver', type=str, choices=['full'], - default='full', help='SVD solver to use') - parser.add_argument('--n-components', type=int, default=None, - help='The number of components to find') - parser.add_argument('--whiten', action='store_true', default=False, - help='Perform whitening') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt deleted file mode 100755 index 6bf9a3f99..000000000 --- a/sklearn_bench/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -scikit-learn-intelex==2023.0.0 -dpcpp-cpp-rt==2023.0.0 diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py deleted file mode 100644 index 19718a4e7..000000000 --- a/sklearn_bench/ridge.py +++ /dev/null @@ -1,70 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench - - -def main(): - from sklearn.linear_model import Ridge - - # Load data - X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) - - # Create our regression object - regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha, - solver=params.solver) - - # Time fit - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - - # Time predict - predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - - test_rmse = bench.rmse_score(y_test, yp) - test_r2 = bench.r2_score(y_test, yp) - yp = regr.predict(X_train) - train_rmse = bench.rmse_score(y_train, yp) - train_r2 = bench.r2_score(y_train, yp) - - bench.print_output( - library='sklearn', - algorithm='ridge_regr', - stages=['training', 'prediction'], - params=params, - functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], - data=[X_train, X_test], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' - 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") - parser.add_argument('--solver', default='auto', - help='Solver used for training') - parser.add_argument('--alpha', type=float, default=1.0, - help='Regularization strength') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py deleted file mode 100644 index 5ac4c939c..000000000 --- a/sklearn_bench/svm.py +++ /dev/null @@ -1,113 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.svm import SVC - - X_train, X_test, y_train, y_test = bench.load_data(params) - y_train = np.asfortranarray(y_train).ravel() - - if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - - cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) - params.cache_size_mb = cache_size_bytes / 1024**2 - params.n_classes = len(np.unique(y_train)) - - clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, probability=params.probability, - random_state=43, degree=params.degree) - - fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) - params.sv_len = clf.support_.shape[0] - - if params.probability: - state_predict = 'predict_proba' - clf_predict = clf.predict_proba - train_acc = None - test_acc = None - - predict_train_time, y_pred = bench.measure_function_time( - clf_predict, X_train, params=params) - train_log_loss = bench.log_loss(y_train, y_pred) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) - - _, y_pred = bench.measure_function_time( - clf_predict, X_test, params=params) - test_log_loss = bench.log_loss(y_test, y_pred) - test_roc_auc = bench.roc_auc_score(y_test, y_pred) - else: - state_predict = 'prediction' - clf_predict = clf.predict - train_log_loss = None - test_log_loss = None - train_roc_auc = None - test_roc_auc = None - - predict_train_time, y_pred = bench.measure_function_time( - clf_predict, X_train, params=params) - train_acc = bench.accuracy_score(y_train, y_pred) - - _, y_pred = bench.measure_function_time( - clf_predict, X_test, params=params) - test_acc = bench.accuracy_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='SVC', - stages=['training', state_predict], - params=params, - functions=['SVM.fit', f'SVM.{state_predict}'], - times=[fit_time, predict_train_time], - metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'], - metrics=[ - [train_acc, test_acc], - [train_log_loss, test_log_loss], - [train_roc_auc, test_roc_auc], - [int(clf.n_support_.sum()), int(clf.n_support_.sum())], - ], - data=[X_train, X_train], - alg_instance=clf, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') - - parser.add_argument('-C', dest='C', type=float, default=1.0, - help='SVM regularization parameter') - parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='SVM kernel function') - parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') - parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') - parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for SVM.') - parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.SVC') - parser.add_argument('--probability', action='store_true', default=False, - dest='probability', help="Use probability for SVC") - - params = bench.parse_args(parser, loop_types=('fit', 'predict')) - bench.run_with_context(params, main) diff --git a/sklearn_bench/svr.py b/sklearn_bench/svr.py deleted file mode 100644 index 7e9dc2c8d..000000000 --- a/sklearn_bench/svr.py +++ /dev/null @@ -1,91 +0,0 @@ -# 
=============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.svm import SVR - - X_train, X_test, y_train, y_test = bench.load_data(params) - y_train = np.asfortranarray(y_train).ravel() - - if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] - - cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) - params.cache_size_mb = cache_size_bytes / 1024**2 - params.n_classes = len(np.unique(y_train)) - - regr = SVR(C=params.C, epsilon=params.epsilon, kernel=params.kernel, - cache_size=params.cache_size_mb, tol=params.tol, gamma=params.gamma, - degree=params.degree) - - fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - params.sv_len = regr.support_.shape[0] - - predict_train_time, y_pred = bench.measure_function_time( - regr.predict, X_train, params=params) - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - - _, y_pred = bench.measure_function_time( - regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(y_test, y_pred) - test_r2 = bench.r2_score(y_test, y_pred) - - bench.print_output( - library='sklearn', - algorithm='SVR', - stages=['training', 'prediction'], - params=params, - functions=['SVR.fit', 'SVR.predict'], - times=[fit_time, predict_train_time], - metric_type=['rmse', 'r2_score', 'n_sv'], - metrics=[ - [train_rmse, test_rmse], - [train_r2, test_r2], - [int(regr.n_support_.sum()), int(regr.n_support_.sum())], - ], - data=[X_train, X_train], - alg_instance=regr, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn SVR benchmark') - - parser.add_argument('-C', dest='C', type=float, default=1., - help='SVR regularization parameter') - parser.add_argument('--epsilon', dest='epsilon', type=float, default=.1, - help='Epsilon in the epsilon-SVR model') - parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly', 'sigmoid'), - default='linear', help='SVR kernel function') - parser.add_argument('--degree', type=int, default=3, - help='Degree of the polynomial kernel function') - parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') - parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for SVR.') - parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.SVR') - - params = bench.parse_args(parser, loop_types=('fit', 'predict')) - bench.run_with_context(params, main) diff --git a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py deleted file mode 100644 index 046719b48..000000000 --- a/sklearn_bench/train_test_split.py +++ /dev/null @@ -1,72 +0,0 @@ -# 
=============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -from typing import Iterable - -import bench - - -def main(): - from sklearn.model_selection import train_test_split - - # Load generated data - X, y, _, _ = bench.load_data(params) - - data_args: Iterable - if params.include_y: - data_args = (X, y) - else: - data_args = (X, ) - - tts_params = { - 'train_size': params.train_size, - 'test_size': params.test_size, - 'shuffle': not params.do_not_shuffle, - 'random_state': params.seed - } - - if params.rng is not None: - tts_params['rng'] = params.rng - - time, _ = bench.measure_function_time( - train_test_split, *data_args, params=params, **tts_params) - - bench.print_output(library='sklearn', algorithm='train_test_split', - stages=['training'], params=params, - functions=['train_test_split'], times=[time], metrics=[None], - metric_type=None, data=[X], alg_params=tts_params) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='scikit-learn train_test_split benchmark') - parser.add_argument('--train-size', type=float, default=0.75, - help='Size of training subset') - parser.add_argument('--test-size', type=float, default=0.25, - help='Size of testing subset') - parser.add_argument('--do-not-shuffle', default=False, action='store_true', - help='Do not perform data shuffle before splitting') - parser.add_argument('--include-y', default=False, action='store_true', - help='Include label (Y) in splitting') - parser.add_argument('--rng', default=None, - choices=('MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', - 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10', - 'NONDETERM', None), - help='Random numbers generator for shuffling ' - '(only for IDP scikit-learn)') - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py deleted file mode 100644 index 2d9f2d0aa..000000000 --- a/sklearn_bench/tsne.py +++ /dev/null @@ -1,71 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import bench -import argparse -import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - - -def main(): - from sklearn.manifold import TSNE - - # Load and convert data - X, _, _, _ = bench.load_data(params) - - # Create our TSNE model - tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration, - learning_rate=params.learning_rate, angle=params.angle, - min_grad_norm=params.min_grad_norm, random_state=params.random_state) - - fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params) - divergence = tsne.kl_divergence_ - - bench.print_output( - library='sklearn', - algorithm='TSNE', - stages=['training'], - params=params, - functions=['TSNE.fit'], - times=[fit_time], - metric_type='divergence', - metrics=[divergence], - data=[X], - alg_instance=tsne, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='scikit-learn tsne ' - 'regression benchmark') - - parser.add_argument('--n-components', type=int, default=2, - help='The dimension of the embedded space.') - parser.add_argument('--early-exaggeration', type=float, default=12.0, - help='This factor increases the attractive forces between points ' - 'and allows points to move around more freely, ' - 'finding their nearest neighbors more easily.') - parser.add_argument('--learning-rate', type=float, default=200.0, - help='The learning rate for t-SNE is usually in the range [10.0, 1000.0].') - parser.add_argument('--angle', type=float, default=0.5, - help='Angular size. This is the trade-off between speed and accuracy.') - parser.add_argument('--min-grad-norm', type=float, default=1e-7, - help='If the gradient norm is below this threshold,' - 'the optimization is stopped.') - parser.add_argument('--random-state', type=int, default=1234) - - params = bench.parse_args(parser) - bench.run_with_context(params, main) diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml new file mode 100644 index 000000000..a37769ce9 --- /dev/null +++ b/test-configuration-linux.yml @@ -0,0 +1,57 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#===============================================================================
+steps:
+  - task: UsePythonVersion@0
+    displayName: "Use Python $(PYTHON_VERSION)"
+    inputs:
+      versionSpec: "$(PYTHON_VERSION)"
+  - script: |
+      conda create -y -n bench-env -c conda-forge -c nodefaults python=$(PYTHON_VERSION)
+    displayName: Environment initialization
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      pip install -r envs/requirements-sklearn.txt
+      pip list
+    displayName: Install requirements via pip
+    condition: eq(variables['PKG_MANAGER'], 'pip')
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      conda env update -f envs/conda-env-sklearn.yml -n bench-env
+      conda list
+    displayName: Install requirements via conda
+    condition: eq(variables['PKG_MANAGER'], 'conda')
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      python -m sklbench --report -l DEBUG -p algorithm:library=sklearn,sklearnex algorithm:estimator=PCA,KMeans,ElasticNet,KNeighborsClassifier data:dataset=skin_segmentation data:split_kwargs:train_size=5000 data:split_kwargs:test_size=5000
+    displayName: CLI arguments example run
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      python -m sklbench --report -l DEBUG -c configs/sklearn_example.json
+    displayName: Sklearn example run
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      python -m sklbench --report -l DEBUG -c configs/xgboost_example.json
+    displayName: XGBoost example run
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      python -m sklbench -l INFO -c configs/testing/azure-pipelines-ci.json --prefetch-datasets --report --diff-cols library --compatibility-mode
+    displayName: CI config run
diff --git a/test-configuration-win.yml b/test-configuration-win.yml
new file mode 100644
index 000000000..a1eddaebd
--- /dev/null
+++ b/test-configuration-win.yml
@@ -0,0 +1,53 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+steps:
+  - task: UsePythonVersion@0
+    displayName: "Use Python $(PYTHON_VERSION)"
+    inputs:
+      versionSpec: "$(PYTHON_VERSION)"
+  - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
+    displayName: Add conda to Windows PATH
+  - script: |
+      conda create -y -n bench-env -c conda-forge -c nodefaults python=$(PYTHON_VERSION)
+    displayName: Environment initialization
+  - script: |
+      call activate bench-env
+      pip install -r envs/requirements-sklearn.txt
+      pip list
+    displayName: Install requirements via pip
+    condition: eq(variables['PKG_MANAGER'], 'pip')
+  - script: |
+      call activate bench-env
+      conda env update -f envs/conda-env-sklearn.yml -n bench-env
+      conda list
+    displayName: Install requirements via conda
+    condition: eq(variables['PKG_MANAGER'], 'conda')
+  - script: |
+      call activate bench-env
+      python -m sklbench --report -l DEBUG -p algorithm:library=sklearn,sklearnex algorithm:estimator=PCA,KMeans,ElasticNet,KNeighborsClassifier data:dataset=skin_segmentation data:split_kwargs:train_size=5000 data:split_kwargs:test_size=5000
+    displayName: CLI arguments example run
+  - script: |
+      call activate bench-env
+      python -m sklbench --report -l DEBUG -c configs/sklearn_example.json
+    displayName: Sklearn example run
+  - script: |
+      call activate bench-env
+      python -m sklbench --report -l DEBUG -c configs/xgboost_example.json
+    displayName: XGBoost example run
+  - script: |
+      call activate bench-env
+      python -m sklbench -l INFO -c configs/testing/azure-pipelines-ci.json --prefetch-datasets --report --diff-cols library --compatibility-mode
+    displayName: CI config run
diff --git a/utils.py b/utils.py
deleted file mode 100755
index a91f69292..000000000
--- a/utils.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# ===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================== - -import json -import os -import platform -import subprocess -import sys -from pathlib import Path -from typing import Any, Dict, Iterable, List, Tuple, Union, cast - -from datasets.make_datasets import try_gen_dataset -from datasets.load_datasets import try_load_dataset - - -def filter_stderr(text: str) -> str: - # delete 'Intel(R) Extension for Scikit-learn usage in sklearn' messages - fake_error_message = ('Intel(R) Extension for Scikit-learn* enabled ' + - '(https://github.com/intel/scikit-learn-intelex)') - - return ''.join(text.split(fake_error_message)) - - -def filter_stdout(text: str) -> Tuple[str, str]: - verbosity_letters = 'EWIDT' - filtered, extra = '', '' - for line in text.split('\n'): - if line == '': - continue - to_remove = False - for letter in verbosity_letters: - if line.startswith(f'[{letter}]'): - to_remove = True - break - if to_remove: - extra += line + '\n' - else: - filtered += line + '\n' - return filtered, extra - - -def files_in_folder(folder: str, files: Iterable[str]) -> bool: - for file in files: - if not os.path.isfile(os.path.join(folder, file)): - return False - return True - - -def find_or_gen_dataset(args: Any, folder: str, files: Iterable[str]): - if files_in_folder("", files): - return "" - if folder: - if files_in_folder(folder, files) or \ - try_gen_dataset(args, folder): - return folder - if try_gen_dataset(args, ""): - return "" - return None - - -def find_the_dataset(name: str, folder: str, files: Iterable[str]): - if files_in_folder("", files): - return "" - if folder: - if files_in_folder(folder, files) or \ - try_load_dataset(dataset_name=name, - output_directory=Path(os.path.join(folder, "data"))): - return folder - if try_load_dataset(dataset_name=name, output_directory=Path("data")): - return "" - return None - - -def read_output_from_command(command: str, - env: Dict[str, str] = os.environ.copy()) -> Tuple[str, str]: - if "PYTHONPATH" in env: - env["PYTHONPATH"] += ":" + os.path.dirname(os.path.abspath(__file__)) - else: - env["PYTHONPATH"] = os.path.dirname(os.path.abspath(__file__)) - res = subprocess.run(command.split(' '), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, encoding='utf-8', env=env) - return res.stdout[:-1], res.stderr[:-1] - - -def parse_lscpu_lscl_info(command_output: str) -> Dict[str, str]: - res: Dict[str, str] = {} - for elem in command_output.strip().split('\n'): - splt = elem.split(':') - res[splt[0]] = splt[1] - return res - - -def get_hw_parameters() -> Dict[str, Union[Dict[str, Any], float]]: - if 'Linux' not in platform.platform(): - return {} - - hw_params: Dict[str, Union[Dict[str, str], float]] = {'CPU': {}} - # get CPU information - lscpu_info, _ = read_output_from_command('lscpu') - lscpu_info = ' '.join(lscpu_info.split()) - for line in lscpu_info.split('\n'): - k, v = line.split(": ")[:2] - if k == 'CPU MHz': - continue - cast(Dict[str, str], hw_params['CPU'])[k] = v - - # get RAM size - mem_info, _ = read_output_from_command('free -b') - mem_info = mem_info.split('\n')[1] - mem_info = ' '.join(mem_info.split()) - hw_params['RAM size[GB]'] = int(mem_info.split(' ')[1]) / 2 ** 30 - - # get Intel GPU information - try: - lsgpu_info, _ = read_output_from_command( - 'lscl --device-type=gpu --platform-vendor=Intel') - device_num = 0 - start_idx = lsgpu_info.find('Device ') - while start_idx >= 0: - start_idx = lsgpu_info.find(':', start_idx) + 1 - end_idx = lsgpu_info.find('Device ', start_idx) - hw_params[f'GPU Intel 
#{device_num + 1}'] = parse_lscpu_lscl_info( - lsgpu_info[start_idx: end_idx]) - device_num += 1 - start_idx = end_idx - except (FileNotFoundError, json.JSONDecodeError): - pass - - # get Nvidia GPU information - try: - gpu_info, _ = read_output_from_command( - 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' - '--format=csv,noheader') - gpu_info_arr = gpu_info.split(', ') - if len(gpu_info_arr) == 0: - return hw_params - hw_params['GPU Nvidia'] = { - 'Name': gpu_info_arr[0], - 'Memory size': gpu_info_arr[1], - 'Performance mode': gpu_info_arr[3] - } - except (FileNotFoundError, json.JSONDecodeError, IndexError): - pass - return hw_params - - -def get_sw_parameters() -> Dict[str, Dict[str, Any]]: - sw_params = {} - try: - gpu_info, _ = read_output_from_command( - 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' - '--format=csv,noheader') - info_arr = gpu_info.split(', ') - sw_params['GPU_driver'] = {'version': info_arr[2]} - # alert if GPU is already running any processes - gpu_processes, _ = read_output_from_command( - 'nvidia-smi --query-compute-apps=name,pid,used_memory ' - '--format=csv,noheader') - if gpu_processes != '': - print(f'There are running processes on GPU:\n{gpu_processes}', - file=sys.stderr) - except (FileNotFoundError, json.JSONDecodeError, TypeError): - pass - - # get python packages info from conda - try: - conda_list, _ = read_output_from_command('conda list --json') - needed_columns = ['version', 'build_string', 'channel'] - conda_list_json: List[Dict[str, str]] = json.loads(conda_list) - for pkg in conda_list_json: - pkg_info = {} - for col in needed_columns: - if col in pkg: - pkg_info[col] = pkg[col] - sw_params[pkg['name']] = pkg_info - except (FileNotFoundError, json.JSONDecodeError, TypeError): - pass - - return sw_params - - -def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]: - ''' - Generate cases for benchmarking by iterating the parameter values - ''' - commands = [''] - for param, values in params.items(): - if isinstance(values, list): - prev_len = len(commands) - commands *= len(values) - dashes = '-' if len(param) == 1 else '--' - for command_num in range(prev_len): - for idx, val in enumerate(values): - commands[prev_len * idx + command_num] += ' ' + \ - dashes + param + ' ' + str(val) - else: - dashes = '-' if len(param) == 1 else '--' - for command_num, _ in enumerate(commands): - commands[command_num] += ' ' + \ - dashes + param + ' ' + str(values) - return commands diff --git a/xgboost_bench/README.md b/xgboost_bench/README.md deleted file mode 100644 index 45f27be87..000000000 --- a/xgboost_bench/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# How to create conda environment for benchmarking - -```bash -pip install -r xgboost_bench/requirements.txt -# or -conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm -``` - -## Algorithms parameters - -You can launch benchmarks for each algorithm separately. The table below lists all supported parameters for each algorithm. 
- -### General - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -|num-threads|int|-1| The number of threads to use| -|arch|str|?|Achine architecture, for bookkeeping| -|batch|str|?|Batch ID, for bookkeeping| -|prefix|str|sklearn|Prefix string, for bookkeeping| -|header|action|False|Output CSV header| -|verbose|action|False|Output extra debug messages| -|data-format|str|numpy|Data formats: *numpy*, *pandas* or *cudf*| -|data-order|str|C|Data order: C (row-major, default) or F (column-major)| -|dtype|np.dtype|np.float64|Data type: *float64* (default) or *float32*| -|check-finiteness|action|False|Check finiteness in sklearn input check(disabled by default)| -|output-format|str|csv|Output format: *csv* (default) or *json*'| -|time-method|str|mean_min|Method used for time mesurements| -|box-filter-measurements|int|100|Maximum number of measurements in box filter| -|inner-loops|int|100|Maximum inner loop iterations. (we take the mean over inner iterations)| -|outer-loops|int|100|Maximum outer loop iterations. (we take the min over outer iterations)| -|time-limit|float|10|Target time to spend to benchmark| -|goal-outer-loops|int|10|The number of outer loops to aim while automatically picking number of inner loops. If zero, do not automatically decide number of inner loops.| -|seed|int|12345|Seed to pass as random_state| -|dataset-name|str|None|Dataset name| - -### GradientBoostingTrees - -| parameter Name | Type | default value | description | -| ----- | ---- |---- |---- | -| n-estimators | int | 100 | The number of gradient boosted trees | -| learning-rate | float | 0.3 | Step size shrinkage used in update to prevents overfitting| -| min-split-loss | float | 0 | Minimum loss reduction required to make partition on a leaf node | -| max-depth | int | 6 | Maximum depth of a tree | -| min-child-weight | float | 1 | Minimum sum of instance weight needed in a child | -| max-delta-step | float | 0 | Maximum delta step we allow each leaf output to be | -| subsample | float | 1 | Subsample ratio of the training instances | -| colsample-bytree | float | 1 | Subsample ratio of columns when constructing each tree | -| reg-lambda | float | 1 | L2 regularization term on weights | -| reg-alpha | float | 0 | L1 regularization term on weights | -| tree-method | str | | The tree construction algorithm used in XGBoost | -| scale-pos-weight | float | 1 | Controls a balance of positive and negative weights | -| grow-policy | str | depthwise | Controls a way new nodes are added to the tree | -| max-leaves | int | 0 | Maximum number of nodes to be added | -| max-bin | int | 256 | Maximum number of discrete bins to bucket continuous features | -| objective | str | True | *reg:squarederror*, *binary:logistic*, *multi:softmax* or *multi:softprob*. Control a balance of positive and negative weights | diff --git a/xgboost_bench/__init__.py b/xgboost_bench/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py deleted file mode 100644 index eeb321b66..000000000 --- a/xgboost_bench/gbt.py +++ /dev/null @@ -1,181 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np -import xgboost as xgb - - -def convert_probs_to_classes(y_prob): - return np.array([np.argmax(y_prob[i]) for i in range(y_prob.shape[0])]) - - -def convert_xgb_predictions(y_pred, objective): - if objective == 'multi:softprob': - y_pred = convert_probs_to_classes(y_pred) - elif objective == 'binary:logistic': - y_pred = (y_pred >= 0.5).astype(np.int32) - return y_pred - - -parser = argparse.ArgumentParser(description='xgboost gradient boosted trees benchmark') - - -parser.add_argument('--colsample-bytree', type=float, default=1, - help='Subsample ratio of columns ' - 'when constructing each tree') -parser.add_argument('--count-dmatrix', default=False, action='store_true', - help='Count DMatrix creation in time measurements') -parser.add_argument('--enable-experimental-json-serialization', default=True, - choices=('True', 'False'), help='Use JSON to store memory snapshots') -parser.add_argument('--grow-policy', type=str, default='depthwise', - help='Controls a way new nodes are added to the tree') -parser.add_argument('--inplace-predict', default=False, action='store_true', - help='Perform inplace_predict instead of default') -parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, - help='Step size shrinkage used in update ' - 'to prevents overfitting') -parser.add_argument('--max-bin', type=int, default=256, - help='Maximum number of discrete bins to ' - 'bucket continuous features') -parser.add_argument('--max-delta-step', type=float, default=0, - help='Maximum delta step we allow each leaf output to be') -parser.add_argument('--max-depth', type=int, default=6, - help='Maximum depth of a tree') -parser.add_argument('--max-leaves', type=int, default=0, - help='Maximum number of nodes to be added') -parser.add_argument('--min-child-weight', type=float, default=1, - help='Minimum sum of instance weight needed in a child') -parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, - help='Minimum loss reduction required to make' - ' partition on a leaf node') -parser.add_argument('--n-estimators', type=int, default=100, - help='The number of gradient boosted trees') -parser.add_argument('--objective', type=str, required=True, - choices=('reg:squarederror', 'binary:logistic', - 'multi:softmax', 'multi:softprob'), - help='Specifies the learning task') -parser.add_argument('--reg-alpha', type=float, default=0, - help='L1 regularization term on weights') -parser.add_argument('--reg-lambda', type=float, default=1, - help='L2 regularization term on weights') -parser.add_argument('--scale-pos-weight', type=float, default=1, - help='Controls a balance of positive and negative weights') -parser.add_argument('--single-precision-histogram', default=False, action='store_true', - help='Build histograms instead of double precision') -parser.add_argument('--subsample', type=float, default=1, - help='Subsample ratio of the training instances') -parser.add_argument('--tree-method', type=str, required=True, - help='The tree 
construction algorithm used in XGBoost') - -params = bench.parse_args(parser) -# Default seed -if params.seed == 12345: - params.seed = 0 - -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params) - -xgb_params = { - 'booster': 'gbtree', - 'verbosity': 0, - 'learning_rate': params.learning_rate, - 'min_split_loss': params.min_split_loss, - 'max_depth': params.max_depth, - 'min_child_weight': params.min_child_weight, - 'max_delta_step': params.max_delta_step, - 'subsample': params.subsample, - 'sampling_method': 'uniform', - 'colsample_bytree': params.colsample_bytree, - 'colsample_bylevel': 1, - 'colsample_bynode': 1, - 'reg_lambda': params.reg_lambda, - 'reg_alpha': params.reg_alpha, - 'tree_method': params.tree_method, - 'scale_pos_weight': params.scale_pos_weight, - 'grow_policy': params.grow_policy, - 'max_leaves': params.max_leaves, - 'max_bin': params.max_bin, - 'objective': params.objective, - 'seed': params.seed, - 'single_precision_histogram': params.single_precision_histogram, - 'enable_experimental_json_serialization': - params.enable_experimental_json_serialization -} - -if params.threads != -1: - xgb_params.update({'nthread': params.threads}) - -if params.objective.startswith('reg'): - task = 'regression' - metric_name, metric_func = 'rmse', bench.rmse_score -else: - task = 'classification' - metric_name = 'accuracy' - metric_func = bench.accuracy_score - if 'cudf' in str(type(y_train)): - params.n_classes = y_train[y_train.columns[0]].nunique() - else: - params.n_classes = len(np.unique(y_train)) - - # Covtype has one class more than there is in train - if params.dataset_name == 'covtype': - params.n_classes += 1 - - if params.n_classes > 2: - xgb_params['num_class'] = params.n_classes - -dtrain = xgb.DMatrix(X_train, y_train) -dtest = xgb.DMatrix(X_test, y_test) - - -def fit(dmatrix): - if dmatrix is None: - dmatrix = xgb.DMatrix(X_train, y_train) - return xgb.train(xgb_params, dmatrix, params.n_estimators) - - -if params.inplace_predict: - def predict(*args): - return booster.inplace_predict(np.ascontiguousarray(X_test.values, - dtype=np.float32)) -else: - def predict(dmatrix): # type: ignore - if dmatrix is None: - dmatrix = xgb.DMatrix(X_test, y_test) - return booster.predict(dmatrix) - - -fit_time, booster = bench.measure_function_time( - fit, None if params.count_dmatrix else dtrain, params=params) -train_metric = metric_func( - convert_xgb_predictions( - booster.predict(dtrain), - params.objective), - y_train) - -predict_time, y_pred = bench.measure_function_time( - predict, None if params.inplace_predict or params.count_dmatrix else dtest, params=params) -test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) - -bench.print_output(library='xgboost', algorithm=f'gradient_boosted_trees_{task}', - stages=['training', 'prediction'], - params=params, functions=['gbt.fit', 'gbt.predict'], - times=[fit_time, predict_time], metric_type=metric_name, - metrics=[train_metric, test_metric], data=[X_train, X_test], - alg_instance=booster, alg_params=xgb_params) diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt deleted file mode 100755 index 7074b035b..000000000 --- a/xgboost_bench/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -xgboost==1.7.2