Commit f57963e

task: benchmarking with asv (#2054)
2 parents: 041e0d1 + 7b6db2e

File tree: 7 files changed (+259, -0 lines)

Diff for: .gitignore (+3 lines)

@@ -102,3 +102,6 @@ dpctl/tensor/_usmarray.h
 
 # moved cmake scripts
 dpctl/resources/cmake
+
+# asv artifacts
+*.asv*

Diff for: benchmarks/README.md (+22 lines)

# dpctl benchmarks

Benchmarking dpctl using Airspeed Velocity.
Read more about ASV [here](https://asv.readthedocs.io/en/stable/index.html).

## Usage

The benchmarks are designed to run against an existing environment: `asv.conf.json` is intentionally minimal and supplies no environment information. The expectation is that users execute `asv run` with an existing environment.

As such, you should have conda or mamba installed, and create an environment [following these instructions](https://intelpython.github.io/dpctl/latest/beginners_guides/installation.html#dpctl-installation).
Additionally, install `asv` and `libmambapy` into the environment.

Then activate the environment and instruct `asv run` to use it for the benchmarks by pointing it to the environment's Python binary, like so:

```
conda activate dpctl_env
asv run --environment existing:/full/mamba/path/envs/dpctl_env/bin/python
```

For `level_zero` devices, you might see `USM Allocation` errors unless you run `asv run` with `--launch-method spawn`.

## Writing new benchmarks

Read ASV's guidelines for writing benchmarks [here](https://asv.readthedocs.io/en/stable/writing_benchmarks.html).
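
As an illustration of those guidelines, a minimal ASV benchmark for dpctl could look like the sketch below (not part of this commit; the module, class, and method names are hypothetical):

```
import dpctl.tensor as dpt


class MinimalSuite:
    # ASV calls setup() before timing each benchmark method.
    def setup(self):
        self.x = dpt.ones(10**6, dtype=dpt.float32)

    # Methods prefixed with "time_" are picked up and timed by ASV.
    def time_square(self):
        dpt.square(self.x)
```

Saved as, e.g., `benchmarks/benchmarks/minimal.py` (a hypothetical file name), it would be discovered automatically by `asv run`.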

Diff for: benchmarks/asv.conf.json (+53 lines)

{
    // The version of the config file format. Do not change, unless
    // you know what you are doing.
    "version": 1,

    // The name of the project being benchmarked
    "project": "dpctl",

    // The project's homepage
    "project_url": "https://github.com/IntelPython/dpctl",

    // The URL or local path of the source code repository for the
    // project being benchmarked
    "repo": "..",

    // Customizable commands for building the project.
    // See asv.conf.json documentation.
    "build_command": [],

    // List of branches to benchmark. If not provided, defaults to "main"
    // (for git) or "default" (for mercurial).
    "branches": ["HEAD"],

    // The DVCS being used. If not set, it will be automatically
    // determined from "repo" by looking at the protocol in the URL
    // (if remote), or by looking for special directories, such as
    // ".git" (if local).
    "dvcs": "git",

    // The tool to use to create environments. May be "conda",
    // "virtualenv", "mamba" (above 3.8)
    // or other value depending on the plugins in use.
    // If missing or the empty string, the tool will be automatically
    // determined by looking for tools on the PATH environment
    // variable.
    "environment_type": "conda",

    // The directory (relative to the current directory) that benchmarks are
    // stored in. If not provided, defaults to "benchmarks"
    "benchmark_dir": "benchmarks",

    // The directory (relative to the current directory) to cache the Python
    // environments in. If not provided, defaults to "env"
    "env_dir": ".asv/env",

    // The directory (relative to the current directory) that raw benchmark
    // results are stored in. If not provided, defaults to "results".
    "results_dir": ".asv/results",

    // The directory (relative to the current directory) that the html tree
    // should be written to. If not provided, defaults to "html".
    "html_dir": ".asv/html"
}

Diff for: benchmarks/benchmarks/__init__.py

Whitespace-only changes.

Diff for: benchmarks/benchmarks/benchmark_utils.py (+17 lines)

from asv_runner.benchmarks.mark import SkipNotImplemented

import dpctl.tensor as dpt


def skip_unsupported_dtype(q, dtype):
    """
    Skip the benchmark if the device does not support the given data type.
    """
    if (
        (dtype == dpt.float64 or dtype == dpt.complex128)
        and not q.sycl_device.has_aspect_fp64
    ) or (dtype == dpt.float16 and not q.sycl_device.has_aspect_fp16):
        raise SkipNotImplemented(
            f"Skipping benchmark for {dtype.name} on this device"
            + " as it is not supported."
        )
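
A hypothetical benchmark class using this helper might look as follows (not part of this commit; it assumes the module lives in the same package as `benchmark_utils.py`):

```
import dpctl
import dpctl.tensor as dpt

from . import benchmark_utils as bench_utils


class UnarySqrt:
    def setup(self):
        self.q = dpctl.SyclQueue()

    def time_sqrt_f16(self):
        # Raises SkipNotImplemented on devices without fp16 support,
        # which ASV reports as a skipped benchmark.
        bench_utils.skip_unsupported_dtype(self.q, dpt.float16)
        x = dpt.ones(10**6, dtype=dpt.float16, sycl_queue=self.q)
        dpt.sqrt(x)
```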

Diff for: benchmarks/benchmarks/binary.py (+133 lines)

import dpctl
import dpctl.tensor as dpt

from . import benchmark_utils as bench_utils

SHARED_QUEUE = dpctl.SyclQueue(property="enable_profiling")


class Binary:
    """Benchmark class for binary operations on SYCL devices."""

    timeout = 300.0

    def setup(self):
        """Setup the benchmark environment."""
        self.q = SHARED_QUEUE
        self.iterations = 1
        self.n_values = 10**8

    def run_bench(self, q, reps, n_max, dtype1, dtype2, op):
        """Run the benchmark for a specific function and dtype combination."""

        def get_sizes(n):
            s = []
            m = 8192
            while m < n:
                s.append(m)
                m *= 2
            s.append(n)
            return s

        x1 = dpt.ones(n_max, dtype=dtype1, sycl_queue=q)
        x2 = dpt.ones(n_max, dtype=dtype2, sycl_queue=q)
        r = op(x1, x2)

        max_bytes = x1.nbytes + x2.nbytes + r.nbytes
        times_res = []

        for n in get_sizes(n_max):
            x1_n = x1[:n]
            x2_n = x2[:n]
            r_n = r[:n]
            n_bytes = x1_n.nbytes + x2_n.nbytes + r_n.nbytes

            n_iters = int((max_bytes / n_bytes) * reps)

            while True:
                timer = dpctl.SyclTimer(
                    device_timer="order_manager", time_scale=1e9
                )
                with timer(q):
                    for _ in range(n_iters):
                        op(x1_n, x2_n, out=r_n)

                dev_dt = timer.dt.device_dt
                if dev_dt > 0:
                    times_res.append((n, dev_dt / n_iters))
                    break

        return times_res


binary_instance = Binary()
binary_instance.q = SHARED_QUEUE
binary_instance.iterations = 1
binary_instance.n_values = 10**8

function_list = [
    dpt.add,
    dpt.multiply,
    dpt.divide,
    dpt.subtract,
    dpt.floor_divide,
    dpt.remainder,
    dpt.hypot,
    dpt.logaddexp,
    dpt.pow,
    dpt.atan2,
    dpt.nextafter,
    dpt.copysign,
    dpt.less,
    dpt.less_equal,
    dpt.greater,
    dpt.greater_equal,
    dpt.equal,
    dpt.not_equal,
    dpt.minimum,
    dpt.maximum,
    dpt.bitwise_and,
    dpt.bitwise_or,
    dpt.bitwise_xor,
    dpt.bitwise_left_shift,
    dpt.bitwise_right_shift,
    dpt.logical_and,
    dpt.logical_or,
    dpt.logical_xor,
]

# Generate dtype combinations for each function
dtypes = {}
for fn in function_list:
    dtypes[fn] = [list(map(dpt.dtype, sig.split("->")[0])) for sig in fn.types]


# Dynamically create benchmark methods at the module level
def generate_benchmark_functions():
    """Dynamically create benchmark functions for each
    function and dtype combination.
    """
    for fn in function_list:
        fn_name = fn.name_
        for dtype1, dtype2 in dtypes[fn]:
            # Create unique function names
            method_name = f"time_{fn_name}_{dtype1.name}_{dtype2.name}"

            def benchmark_method(self, fn=fn, dtype1=dtype1, dtype2=dtype2):
                bench_utils.skip_unsupported_dtype(self.q, dtype1)
                return self.run_bench(
                    self.q,
                    self.iterations,
                    self.n_values,
                    dtype1,
                    dtype2,
                    fn,
                )

            benchmark_method.__name__ = method_name
            # Attach the new method to the Binary class
            setattr(Binary, method_name, benchmark_method)


# Generate the benchmark functions
generate_benchmark_functions()
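
One detail in `generate_benchmark_functions` deserves a note: the `fn=fn, dtype1=dtype1, dtype2=dtype2` default arguments bind the loop variables at definition time; without them, every generated method would capture only the last function/dtype pair. A minimal standalone illustration of this closure behaviour (not part of the commit):

```
# Late binding: both lists contain closures over the loop variable "i".
late = [lambda: i for i in range(3)]
frozen = [lambda i=i: i for i in range(3)]

print([f() for f in late])    # [2, 2, 2] -- each closure sees the final value of i
print([f() for f in frozen])  # [0, 1, 2] -- default arguments capture i at definition time
```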

Diff for: benchmarks/benchmarks/ef_bench_add.py (+31 lines)

import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_elementwise_impl as tei
import dpctl.utils as dpu


class EfBenchAdd:

    def time_ef_bench_add(self):
        q = dpctl.SyclQueue(property="enable_profiling")
        n = 2**26
        reps = 50

        dt = dpt.int8
        x1 = dpt.ones(n, dtype=dt, sycl_queue=q)
        x2 = dpt.ones(n, dtype=dt, sycl_queue=q)

        op1, op2 = dpt.add, tei._add

        r = op1(x1, x2)

        timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9)

        m = dpu.SequentialOrderManager[q]
        with timer(q):
            for _ in range(reps):
                deps = m.submitted_events
                ht_e, c_e = op2(
                    src1=x1, src2=x2, dst=r, sycl_queue=q, depends=deps
                )
                m.add_event_pair(ht_e, c_e)
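
For comparison (an illustrative sketch, not part of this commit), the same workload timed through the public `dpt.add` with an `out=` argument would look like the following; kernel ordering is then handled by dpctl internally rather than via `SequentialOrderManager`:

```
import dpctl
import dpctl.tensor as dpt


class EfBenchAddPublic:
    # Hypothetical companion benchmark using only public dpctl.tensor API.
    def time_add_public_api(self):
        q = dpctl.SyclQueue(property="enable_profiling")
        n = 2**26
        reps = 50

        x1 = dpt.ones(n, dtype=dpt.int8, sycl_queue=q)
        x2 = dpt.ones(n, dtype=dpt.int8, sycl_queue=q)
        r = dpt.empty_like(x1)

        timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9)
        with timer(q):
            for _ in range(reps):
                # dpt.add submits the kernel and tracks dependencies internally.
                dpt.add(x1, x2, out=r)
```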
