diff --git a/.gitignore b/.gitignore
index feb538de6b..093cbba81f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ dpctl/tensor/_usmarray.h
 
 # moved cmake scripts
 dpctl/resources/cmake
+
+# asv artifacts
+*.asv*
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000000..513ad48ec9
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,22 @@
+# dpctl benchmarks
+
+Benchmarking dpctl using Airspeed Velocity.
+Read more about ASV [here](https://asv.readthedocs.io/en/stable/index.html).
+
+## Usage
+The benchmarks are designed to run against an existing environment, so `asv.conf.json` is kept minimal and contains no environment information.
+The expectation is that users execute `asv run` with an existing environment.
+
+As such, you should have conda or mamba installed and create an environment [following these instructions](https://intelpython.github.io/dpctl/latest/beginners_guides/installation.html#dpctl-installation).
+Additionally, install `asv` and `libmambapy` into the environment.
+
+Then, activate the environment and instruct `asv run` to use it for the benchmarks by pointing it to the environment's Python binary, like so:
+```
+conda activate dpctl_env
+asv run --environment existing:/full/mamba/path/envs/dpctl_env/bin/python
+```
+
+For `level_zero` devices, you might see `USM Allocation` errors unless you use the `asv run` command with `--launch-method spawn`.
+
+## Writing new benchmarks
+Read ASV's guidelines for writing benchmarks [here](https://asv.readthedocs.io/en/stable/writing_benchmarks.html).
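For orientation, a new module under `benchmarks/benchmarks/` only needs to follow ASV's conventions (`time_*` methods, optional `setup`). The sketch below is illustrative only; the module name, class, and array size are hypothetical and not part of this patch:

```
# Hypothetical benchmarks/benchmarks/reduction.py -- illustrative sketch only
import dpctl
import dpctl.tensor as dpt


class TimeSum:
    """Times dpt.sum on a preallocated 1-D array."""

    def setup(self):
        # setup() runs before each measurement and is excluded from the timing
        self.q = dpctl.SyclQueue()
        self.x = dpt.ones(10**7, dtype=dpt.float32, sycl_queue=self.q)

    def time_sum(self):
        # asv reports the wall-clock time of this method
        dpt.sum(self.x)
        self.q.wait()
```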
+ "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html" +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/benchmarks/benchmark_utils.py b/benchmarks/benchmarks/benchmark_utils.py new file mode 100644 index 0000000000..7d493d1cd6 --- /dev/null +++ b/benchmarks/benchmarks/benchmark_utils.py @@ -0,0 +1,17 @@ +from asv_runner.benchmarks.mark import SkipNotImplemented + +import dpctl.tensor as dpt + + +def skip_unsupported_dtype(q, dtype): + """ + Skip the benchmark if the device does not support the given data type. + """ + if ( + (dtype == dpt.float64 or dtype.name == dpt.complex128) + and not q.sycl_device.has_aspect_fp64 + ) or (dtype == dpt.float16 and not q.sycl_device.has_aspect_fp16): + raise SkipNotImplemented( + f"Skipping benchmark for {dtype.name} on this device" + + " as it is not supported." + ) diff --git a/benchmarks/benchmarks/binary.py b/benchmarks/benchmarks/binary.py new file mode 100644 index 0000000000..49d08d09d6 --- /dev/null +++ b/benchmarks/benchmarks/binary.py @@ -0,0 +1,133 @@ +import dpctl +import dpctl.tensor as dpt + +from . import benchmark_utils as bench_utils + +SHARED_QUEUE = dpctl.SyclQueue(property="enable_profiling") + + +class Binary: + """Benchmark class for binary operations on SYCL devices.""" + + timeout = 300.0 + + def setup(self): + """Setup the benchmark environment.""" + self.q = SHARED_QUEUE + self.iterations = 1 + self.n_values = 10**8 + + def run_bench(self, q, reps, n_max, dtype1, dtype2, op): + """Run the benchmark for a specific function and dtype combination.""" + + def get_sizes(n): + s = [] + m = 8192 + while m < n: + s.append(m) + m *= 2 + s.append(n) + return s + + x1 = dpt.ones(n_max, dtype=dtype1, sycl_queue=q) + x2 = dpt.ones(n_max, dtype=dtype2, sycl_queue=q) + r = op(x1, x2) + + max_bytes = x1.nbytes + x2.nbytes + r.nbytes + times_res = [] + + for n in get_sizes(n_max): + x1_n = x1[:n] + x2_n = x2[:n] + r_n = r[:n] + n_bytes = x1_n.nbytes + x2_n.nbytes + r_n.nbytes + + n_iters = int((max_bytes / n_bytes) * reps) + + while True: + timer = dpctl.SyclTimer( + device_timer="order_manager", time_scale=1e9 + ) + with timer(q): + for _ in range(n_iters): + op(x1_n, x2_n, out=r_n) + + dev_dt = timer.dt.device_dt + if dev_dt > 0: + times_res.append((n, dev_dt / n_iters)) + break + + return times_res + + +binary_instance = Binary() +binary_instance.q = SHARED_QUEUE +binary_instance.iterations = 1 +binary_instance.n_values = 10**8 + +function_list = [ + dpt.add, + dpt.multiply, + dpt.divide, + dpt.subtract, + dpt.floor_divide, + dpt.remainder, + dpt.hypot, + dpt.logaddexp, + dpt.pow, + dpt.atan2, + dpt.nextafter, + dpt.copysign, + dpt.less, + dpt.less_equal, + dpt.greater, + dpt.greater_equal, + dpt.equal, + dpt.not_equal, + dpt.minimum, + dpt.maximum, + dpt.bitwise_and, + dpt.bitwise_or, + dpt.bitwise_xor, + dpt.bitwise_left_shift, + dpt.bitwise_right_shift, + dpt.logical_and, + dpt.logical_or, + dpt.logical_xor, +] + +# Generate dtype combinations for each function +dtypes = {} +for fn in function_list: + dtypes[fn] = [list(map(dpt.dtype, sig.split("->")[0])) for sig in fn.types] + + +# Dynamically create benchmark methods at the module level +def generate_benchmark_functions(): + """Dynamically create benchmark functions for each + function and dtype combination. 
+ """ + for fn in function_list: + fn_name = fn.name_ + for dtype1, dtype2 in dtypes[fn]: + # Create unique function names + method_name = f"time_{fn_name}_{dtype1.name}_{dtype2.name}" + + def benchmark_method(self, fn=fn, dtype1=dtype1, dtype2=dtype2): + bench_utils.skip_unsupported_dtype(self.q, dtype1) + return self.run_bench( + self.q, + self.iterations, + self.n_values, + dtype1, + dtype2, + fn, + ) + + benchmark_method.__name__ = method_name + # Attach the new method to the Binary class + setattr(Binary, method_name, benchmark_method) + + +# Generate the benchmark functions +generate_benchmark_functions() diff --git a/benchmarks/benchmarks/ef_bench_add.py b/benchmarks/benchmarks/ef_bench_add.py new file mode 100644 index 0000000000..f17f8613d2 --- /dev/null +++ b/benchmarks/benchmarks/ef_bench_add.py @@ -0,0 +1,31 @@ +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +import dpctl.utils as dpu + + +class EfBenchAdd: + + def time_ef_bench_add(self): + q = dpctl.SyclQueue(property="enable_profiling") + n = 2**26 + reps = 50 + + dt = dpt.int8 + x1 = dpt.ones(n, dtype=dt, sycl_queue=q) + x2 = dpt.ones(n, dtype=dt, sycl_queue=q) + + op1, op2 = dpt.add, tei._add + + r = op1(x1, x2) + + timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9) + + m = dpu.SequentialOrderManager[q] + with timer(q): + for _ in range(reps): + deps = m.submitted_events + ht_e, c_e = op2( + src1=x1, src2=x2, dst=r, sycl_queue=q, depends=deps + ) + m.add_event_pair(ht_e, c_e)