task: benchmarking with asv #2054

Merged
merged 1 commit on Apr 17, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ dpctl/tensor/_usmarray.h

# moved cmake scripts
dpctl/resources/cmake

# asv artifacts
*.asv*
22 changes: 22 additions & 0 deletions benchmarks/README.md
@@ -0,0 +1,22 @@
# dpctl benchmarks

Benchmarks for dpctl using Airspeed Velocity (ASV).
Read more about ASV [here](https://asv.readthedocs.io/en/stable/index.html).

## Usage
The benchmarks are designed to run against an existing environment, so `asv.conf.json` is kept minimal and does not carry any environment information.
The expectation is that users execute `asv run` within an already prepared environment.

As such, you should have conda or mamba installed and create an environment [following these instructions](https://intelpython.github.io/dpctl/latest/beginners_guides/installation.html#dpctl-installation).
Additionally, install `asv` and `libmambapy` into that environment.

Then activate the environment and instruct `asv run` to use it for the benchmarks by pointing it at the environment's Python binary, like so:
```
conda activate dpctl_env
asv run --environment existing:/full/mamba/path/envs/dpctl_env/bin/python
```

For `level_zero` devices, you might see `USM Allocation` errors unless you run `asv run` with `--launch-method spawn`.

## Writing new benchmarks
Read ASV's guidelines for writing benchmarks [here](https://asv.readthedocs.io/en/stable/writing_benchmarks.html).
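
For orientation, a new benchmark might look like the minimal sketch below. The class and method names are placeholders; asv only times methods whose names start with `time_`.
```
# Minimal sketch of a new benchmark; class and method names are illustrative.
import dpctl
import dpctl.tensor as dpt


class AddSketch:
    def setup(self):
        # asv calls setup() before timing each benchmark method.
        self.q = dpctl.SyclQueue()
        self.x = dpt.ones(10**6, dtype=dpt.float32, sycl_queue=self.q)

    def time_add(self):
        dpt.add(self.x, self.x)
        # Wait so the asynchronously submitted kernel is included in the timing.
        self.q.wait()
```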
53 changes: 53 additions & 0 deletions benchmarks/asv.conf.json
@@ -0,0 +1,53 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "dpctl",

// The project's homepage
"project_url": "https://github.com/IntelPython/dpctl",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// Customizable commands for building the project.
// See asv.conf.json documentation.
"build_command": [],

// List of branches to benchmark. If not provided, defaults to "main"
// (for git) or "default" (for mercurial).
"branches": ["HEAD"],

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
"dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv", "mamba" (above 3.8)
// or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
"benchmark_dir": "benchmarks",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html"
}
Empty file.
17 changes: 17 additions & 0 deletions benchmarks/benchmarks/benchmark_utils.py
@@ -0,0 +1,17 @@
from asv_runner.benchmarks.mark import SkipNotImplemented

import dpctl.tensor as dpt


def skip_unsupported_dtype(q, dtype):
"""
Skip the benchmark if the device does not support the given data type.
"""
if (
(dtype == dpt.float64 or dtype == dpt.complex128)
and not q.sycl_device.has_aspect_fp64
) or (dtype == dpt.float16 and not q.sycl_device.has_aspect_fp16):
raise SkipNotImplemented(
f"Skipping benchmark for {dtype.name} on this device"
+ " as it is not supported."
)
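
For reference, a hypothetical benchmark could use this helper as sketched below; the class name, array size, and the choice of `dpt.float16` are illustrative.
```
# Hypothetical usage of skip_unsupported_dtype; names and sizes are illustrative.
import dpctl
import dpctl.tensor as dpt

from . import benchmark_utils as bench_utils


class Fp16Sketch:
    def setup(self):
        self.q = dpctl.SyclQueue()

    def time_sqrt_float16(self):
        # Raises SkipNotImplemented on devices without fp16 support, which
        # asv reports as a skipped benchmark rather than a failure.
        bench_utils.skip_unsupported_dtype(self.q, dpt.float16)
        x = dpt.ones(10**6, dtype=dpt.float16, sycl_queue=self.q)
        dpt.sqrt(x)
        self.q.wait()
```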
133 changes: 133 additions & 0 deletions benchmarks/benchmarks/binary.py
@@ -0,0 +1,133 @@
import dpctl
import dpctl.tensor as dpt

from . import benchmark_utils as bench_utils

SHARED_QUEUE = dpctl.SyclQueue(property="enable_profiling")


class Binary:
"""Benchmark class for binary operations on SYCL devices."""

timeout = 300.0

def setup(self):
"""Setup the benchmark environment."""
self.q = SHARED_QUEUE
self.iterations = 1
self.n_values = 10**8

def run_bench(self, q, reps, n_max, dtype1, dtype2, op):
"""Run the benchmark for a specific function and dtype combination."""

def get_sizes(n):
s = []
m = 8192
while m < n:
s.append(m)
m *= 2
s.append(n)
return s

x1 = dpt.ones(n_max, dtype=dtype1, sycl_queue=q)
x2 = dpt.ones(n_max, dtype=dtype2, sycl_queue=q)
r = op(x1, x2)

max_bytes = x1.nbytes + x2.nbytes + r.nbytes
times_res = []

for n in get_sizes(n_max):
x1_n = x1[:n]
x2_n = x2[:n]
r_n = r[:n]
n_bytes = x1_n.nbytes + x2_n.nbytes + r_n.nbytes

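# Scale the iteration count inversely with the working-set size so each size processes roughly the same total bytes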
n_iters = int((max_bytes / n_bytes) * reps)

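# Retry until the device timer reports a positive elapsed time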
while True:
timer = dpctl.SyclTimer(
device_timer="order_manager", time_scale=1e9
)
with timer(q):
for _ in range(n_iters):
op(x1_n, x2_n, out=r_n)

dev_dt = timer.dt.device_dt
if dev_dt > 0:
times_res.append((n, dev_dt / n_iters))
break

return times_res


binary_instance = Binary()
binary_instance.q = SHARED_QUEUE
binary_instance.iterations = 1
binary_instance.n_values = 10**8

function_list = [
dpt.add,
dpt.multiply,
dpt.divide,
dpt.subtract,
dpt.floor_divide,
dpt.remainder,
dpt.hypot,
dpt.logaddexp,
dpt.pow,
dpt.atan2,
dpt.nextafter,
dpt.copysign,
dpt.less,
dpt.less_equal,
dpt.greater,
dpt.greater_equal,
dpt.equal,
dpt.not_equal,
dpt.minimum,
dpt.maximum,
dpt.bitwise_and,
dpt.bitwise_or,
dpt.bitwise_xor,
dpt.bitwise_left_shift,
dpt.bitwise_right_shift,
dpt.logical_and,
dpt.logical_or,
dpt.logical_xor,
]

# Generate dtype combinations for each function
dtypes = {}
for fn in function_list:
dtypes[fn] = [list(map(dpt.dtype, sig.split("->")[0])) for sig in fn.types]


# Dynamically create benchmark methods at the module level
def generate_benchmark_functions():
"""Dynamically create benchmark functions for each
function and dtype combination.
"""
for fn in function_list:
fn_name = fn.name_
for dtype1, dtype2 in dtypes[fn]:
# Create unique function names
method_name = f"time_{fn_name}_{dtype1.name}_{dtype2.name}"

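# Default arguments capture the current fn and dtype values for each generated method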
def benchmark_method(self, fn=fn, dtype1=dtype1, dtype2=dtype2):
bench_utils.skip_unsupported_dtype(self.q, dtype1)
return self.run_bench(
self.q,
self.iterations,
self.n_values,
dtype1,
dtype2,
fn,
)

benchmark_method.__name__ = method_name
# Attach the new method to the Binary class
setattr(Binary, method_name, benchmark_method)


# Generate the benchmark functions
generate_benchmark_functions()
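
A quick way to sanity-check the generated benchmarks outside of asv is to list the `time_*` methods attached to `Binary`. The snippet below is a hypothetical smoke test; the `benchmarks.binary` import path is an assumption, and running a generated method requires a SYCL device and allocates large arrays.
```
# Hypothetical smoke test; the import path is an assumption.
from benchmarks.binary import Binary

names = sorted(n for n in dir(Binary) if n.startswith("time_"))
print(len(names), names[:3])

# Run one generated benchmark directly; this needs a SYCL device
# and can take a while because of the large allocations.
b = Binary()
b.setup()
getattr(b, names[0])()
```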
31 changes: 31 additions & 0 deletions benchmarks/benchmarks/ef_bench_add.py
@@ -0,0 +1,31 @@
import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.utils as dpu


class EfBenchAdd:

def time_ef_bench_add(self):
q = dpctl.SyclQueue(property="enable_profiling")
n = 2**26
reps = 50

dt = dpt.int8
x1 = dpt.ones(n, dtype=dt, sycl_queue=q)
x2 = dpt.ones(n, dtype=dt, sycl_queue=q)

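# op2 is the low-level element-wise implementation underlying dpt.add (op1)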
op1, op2 = dpt.add, tei._add

r = op1(x1, x2)

timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9)

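# The per-queue order manager threads dependencies between the repeated submissions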
m = dpu.SequentialOrderManager[q]
with timer(q):
for _ in range(reps):
deps = m.submitted_events
ht_e, c_e = op2(
src1=x1, src2=x2, dst=r, sycl_queue=q, depends=deps
)
m.add_event_pair(ht_e, c_e)