Skip to content

Commit 45960ef

Browse files
committed
ggml-hexagon: release v1.06 and ready for code review
1 parent e9d1860 commit 45960ef

File tree

5 files changed

+167
-127
lines changed

5 files changed

+167
-127
lines changed

ggml/src/ggml-hexagon/ggml-hexagon.cpp

+20-5
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
383383
#elif defined(_WIN32)
384384
.qnn_runtimelib_path = "C:\\",
385385
#endif
386-
.ggml_hexagon_version = {"1.05"},
387-
.ggml_dsp_version = {"0.62"},
386+
.ggml_hexagon_version = {"1.06"},
387+
.ggml_dsp_version = {"0.63"},
388388
};
389389

390390
//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
@@ -1417,6 +1417,13 @@ class hexagon_appcfg {
14171417
section = cur_section;
14181418
trim(key);
14191419
trim(value);
1420+
1421+
//"1.00" -> 1.00
1422+
if (value.front() == '"' && value.back() == '"') {
1423+
value.erase(0, 1); // erase the first character "
1424+
value.erase(value.size() - 1); // erase the last character "
1425+
}
1426+
14201427
return true;
14211428
}
14221429

@@ -1829,8 +1836,10 @@ static void ggmlhexagon_load_cfg() {
18291836
GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str());
18301837
});
18311838
std::string precision_mode;
1832-
std::string ggml_hexagon_version;
1833-
hexagoncfg_instance.get_stringvalue("general", "version", ggml_hexagon_version, "1.00");
1839+
std::string version; //version of ggml-hexagon.cpp
1840+
std::string ggmldsp_version; //version of ggml-dsp.c
1841+
hexagoncfg_instance.get_stringvalue("general", "version", version, "1.00");
1842+
hexagoncfg_instance.get_stringvalue("general", "ggmldsp_version", ggmldsp_version, "0.62");
18341843
hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1);
18351844
hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0);
18361845
hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0);
@@ -1854,7 +1863,9 @@ static void ggmlhexagon_load_cfg() {
18541863

18551864
GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
18561865
GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version);
1857-
GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str());
1866+
GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", version.c_str());
1867+
GGMLHEXAGON_LOG_INFO("external ggml_dsp_version=%s", ggmldsp_version.c_str());
1868+
memcpy(g_hexagon_appcfg.ggml_dsp_version, ggmldsp_version.c_str(), strlen(ggmldsp_version.c_str()));
18581869
GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
18591870
ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
18601871
GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend,
@@ -5445,6 +5456,7 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_
54455456
// between ARM-AP and cDSP. the mechanism in qidl/FastRPC is exactly similar to mechanism in TEE.
54465457
// try to find a better/efficient approach to exchange necessary data between ARM-AP side and cDSP side.
54475458
// manually modifying the important data structure ggml_tensor in ggml.h does not make sense and is not acceptable.
5459+
std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
54485460
dsptensor_0.data = src0->data;
54495461
dsptensor_0.data_len = ggml_nbytes(src0);
54505462
dsptensor_0.type = src0->type;
@@ -5491,6 +5503,9 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_
54915503
dsptensor_2.nb[3] = dst->nb[3];
54925504

54935505
memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t));
5506+
std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
5507+
std::chrono::duration<size_t, std::nano> duration = end_time - start_time;
5508+
GGMLHEXAGON_LOG_VERBOSE("pack duration %llu ns", duration.count());
54945509

54955510
hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2);
54965511
if (AEE_SUCCESS != hexagon_error) {

ggml/src/ggml-hexagon/kernels/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi
2121
LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
2222

2323
#SRCS = $(wildcard *.c)
24-
SRCS = ggml-dsp.c skel.c add.c mulmat.c
24+
SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c
2525
OBJS = $(patsubst %.c, %.o, $(SRCS))
2626

2727
ALL:$(OBJS)

ggml/src/ggml-hexagon/kernels/entry.c

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#include "ggml-dsp.h"
2+
3+
// thread count handed out by ggmlop_get_thread_counts(): starts at 1, is set from the
// reported hardware thread count in ggmlop_dsp_open() and capped in ggmlop_dsp_setclocks()
static int32 g_thread_counts = 1;
4+
5+
/*
 * open a ggmlop session on the cDSP side.
 *
 * allocates a 1-byte placeholder whose address becomes the session handle (it is released
 * again in ggmlop_dsp_close), logs a few QuRT system properties and caches the reported
 * hardware thread count in g_thread_counts.
 *
 * uri:    session URI, only logged here
 * handle: receives the newly created session handle
 * return: always 0; an allocation failure trips GGML_ASSERT instead of returning
 */
int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) {
    void * tptr = NULL;
    GGMLHEXAGON_LOG_DEBUG("uri %s", uri);
    tptr = (void *)malloc(1);
    GGML_ASSERT(NULL != tptr);
    *handle = (remote_handle64)tptr;

    GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version());
    // the value is printed behind a "0x" prefix, so it has to be formatted as hex
    // NOTE(review): the QuRT HVX API describes the result as a configuration bitmask - confirm
    GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%x", (unsigned int)qurt_hvx_get_units());
    qurt_arch_version_t vers;
    qurt_sysenv_get_arch_version(&vers);
    GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version);

    qurt_sysenv_app_heap_t aheap;
    qurt_sysenv_get_app_heap(&aheap);
    GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);

    qurt_sysenv_max_hthreads_t mhwt;
    // zero first so that a failed query cannot leave an indeterminate count behind
    memset(&mhwt, 0, sizeof(mhwt));
    qurt_sysenv_get_max_hw_threads(&mhwt);
    GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
    // keep the previous (>= 1) value when no usable count was reported
    if (mhwt.max_hthreads > 0) {
        g_thread_counts = mhwt.max_hthreads;
    }

    return 0;
}
29+
30+
/*
 * close a ggmlop session: free the allocation whose address is stored in the handle.
 *
 * handle: session handle produced by ggmlop_dsp_open; a zero handle is ignored
 * return: always 0
 */
int ggmlop_dsp_close(remote_handle64 handle) {
    if (!handle) {
        return 0;
    }

    free((void*)handle);
    return 0;
}
36+
37+
/*
 * apply the caller's power/performance settings for this session and settle the thread count.
 *
 * the thread count is handled first: a request of 1 or less selects a single thread, a larger
 * request is capped at the current g_thread_counts. afterwards three HAP_power_set() votes are
 * issued in order: application type, DCVS v2 parameters and HVX power-up.
 *
 * handle:        session handle, passed to HAP_power_set() as its context pointer
 * power_level:   cast to HAP_dcvs_voltage_corner_t and used as the target corner
 * latency:       forwarded as dcvs_v2.latency
 * dcvs_enabled:  non-zero leaves min/max corner at HAP_DCVS_VCORNER_DISABLE, zero pins both
 *                to the target corner (dcvs_enable itself is always TRUE)
 * thread_counts: requested thread count
 * return:        AEE_SUCCESS, or AEE_EFAILED as soon as one of the votes fails
 */
AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);

    GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
    if (thread_counts <= 1) {
        g_thread_counts = 1;
    } else if (thread_counts < g_thread_counts) {
        // a request above the current value leaves g_thread_counts untouched
        g_thread_counts = thread_counts;
    }
    GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);

    void * power_ctx = (void*)(handle);
    HAP_power_request_t power_req;
    int rc = 0;

    // vote 1: application type
    memset(&power_req, 0, sizeof(power_req));
    power_req.type    = HAP_power_set_apptype;
    power_req.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
    rc = HAP_power_set(power_ctx, &power_req);
    if (0 != rc) {
        GGMLHEXAGON_LOG_DEBUG("failed first power vote");
        return AEE_EFAILED;
    }

    // vote 2: clocks & DCVS mode
    const HAP_dcvs_voltage_corner_t target = (HAP_dcvs_voltage_corner_t)power_level;
    memset(&power_req, 0, sizeof(power_req));
    power_req.type                              = HAP_power_set_DCVS_v2;
    power_req.dcvs_v2.dcvs_enable               = TRUE;
    power_req.dcvs_v2.dcvs_params.target_corner = target;
    power_req.dcvs_v2.dcvs_params.min_corner    = dcvs_enabled ? HAP_DCVS_VCORNER_DISABLE : target;
    power_req.dcvs_v2.dcvs_params.max_corner    = dcvs_enabled ? HAP_DCVS_VCORNER_DISABLE : target;
    power_req.dcvs_v2.dcvs_option               = HAP_DCVS_V2_PERFORMANCE_MODE;
    power_req.dcvs_v2.set_dcvs_params           = TRUE;
    power_req.dcvs_v2.set_latency               = TRUE;
    power_req.dcvs_v2.latency                   = latency;
    rc = HAP_power_set(power_ctx, &power_req);
    if (0 != rc) {
        GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode");
        return AEE_EFAILED;
    }

    // vote 3: HVX power-up
    memset(&power_req, 0, sizeof(power_req));
    power_req.type         = HAP_power_set_HVX;
    power_req.hvx.power_up = TRUE;
    rc = HAP_power_set(power_ctx, &power_req);
    if (0 != rc) {
        GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power");
        return AEE_EFAILED;
    }

    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
    return AEE_SUCCESS;
}
91+
92+
// =================================================================================================
93+
// implementation of ggml-hexagon kernels; it is better to put each hexagon kernel in its own source file
94+
// =================================================================================================
95+
/*
 * softmax kernel entry point - currently a placeholder: it only logs entry and exit
 * and returns 0 without reading src0/src1 or writing dst.
 */
int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
    // none of the arguments are used yet
    (void)h;
    (void)src0;
    (void)src1;
    (void)dst;

    const int status = 0;
    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
    return status;
}
100+
101+
/*
 * rmsnorm kernel entry point - currently a placeholder: it only logs entry and exit
 * and returns 0 without reading src0/src1 or writing dst.
 */
int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
    // none of the arguments are used yet
    (void)h;
    (void)src0;
    (void)src1;
    (void)dst;

    const int status = 0;
    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
    return status;
}
106+
107+
/*
 * pool2d kernel entry point - currently a placeholder: it only logs entry and exit
 * and returns 0 without reading src0/src1 or writing dst.
 */
int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
    // none of the arguments are used yet
    (void)h;
    (void)src0;
    (void)src1;
    (void)dst;

    const int status = 0;
    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
    return status;
}
112+
113+
int ggmlop_get_thread_counts(void) {
114+
return g_thread_counts;
115+
}

ggml/src/ggml-hexagon/kernels/ggml-dsp.c

+29-119
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,33 @@
1+
/*
2+
* Copyright (c) 2025 The ggml authors
3+
*
4+
* Qualcomm Hexagon SDK and reference tech guides could be found at:
5+
* https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
6+
*
7+
* this self-contained, single-source file is the implementation of ggml-dsp:
8+
* - a customized tiny ggml running on Qualcomm Hexagon cDSP
9+
* - ported from original ggml
10+
*
11+
* Permission is hereby granted, free of charge, to any person obtaining a copy
12+
* of this software and associated documentation files (the "Software"), to
13+
* deal in the Software without restriction, including without limitation the
14+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
15+
* sell copies of the Software, and to permit persons to whom the Software is
16+
* furnished to do so, subject to the following conditions:
17+
*
18+
* The above copyright notice and this permission notice shall be included in
19+
* all copies or substantial portions of the Software.
20+
*
21+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
27+
* IN THE SOFTWARE.
28+
*/
129
#include "ggml-dsp.h"
230

3-
// =================================================================================================
4-
// tiny ggml-dsp, ported from original ggml
5-
// =================================================================================================
6-
static int32 g_thread_counts = 1;
7-
831
void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) {
932
#if !GGMLHEXAGON_DEBUG
1033
return;
@@ -30,7 +53,7 @@ void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) {
3053
char tmpbuf[GGMLHEXAGON_LOGBUF_LEN];
3154
size_t buflen = 0;
3255
if (tensor->type == GGML_TYPE_F32) {
33-
memset(tmpbuf, 0, GGMLHEXAGON_LOG_LEVEL_DEBUG);
56+
memset(tmpbuf, 0, GGMLHEXAGON_LOGBUF_LEN);
3457
for (int h = 0; h < tensor->ne[3]; h++) {
3558
for (int i = 0; i < tensor->ne[2]; i++) {
3659
for (int j = 0; j < tensor->ne[1]; j++) {
@@ -173,116 +196,3 @@ int64_t ggml_time_ms(void) {
173196
// time in microseconds; simply forwards the value of hexagon_perf_get_time_us()
int64_t ggml_time_us(void) {
    const int64_t now_us = hexagon_perf_get_time_us();
    return now_us;
}
176-
177-
int ggmlop_get_thread_counts(void) {
178-
return g_thread_counts;
179-
}
180-
181-
// =================================================================================================
182-
// implementation of ggml-hexagon kernel skel function
183-
// =================================================================================================
184-
int ggmlop_dsp_open(const char*uri, remote_handle64* handle) {
185-
void *tptr = NULL;
186-
GGMLHEXAGON_LOG_DEBUG("uri %s", uri);
187-
tptr = (void *)malloc(1);
188-
*handle = (remote_handle64)tptr;
189-
assert(*handle);
190-
191-
GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version());
192-
GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units());
193-
qurt_arch_version_t vers;
194-
qurt_sysenv_get_arch_version(&vers);
195-
GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version);
196-
qurt_sysenv_app_heap_t aheap;
197-
qurt_sysenv_get_app_heap(&aheap);
198-
GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
199-
qurt_sysenv_max_hthreads_t mhwt;
200-
qurt_sysenv_get_max_hw_threads(&mhwt);
201-
GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads);
202-
g_thread_counts = mhwt.max_hthreads;
203-
204-
return 0;
205-
}
206-
207-
int ggmlop_dsp_close(remote_handle64 handle) {
208-
if (handle)
209-
free((void*)handle);
210-
211-
return 0;
212-
}
213-
214-
AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) {
215-
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
216-
HAP_power_request_t request;
217-
memset(&request, 0, sizeof(HAP_power_request_t));
218-
request.type = HAP_power_set_apptype;
219-
request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
220-
221-
GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts);
222-
if (thread_counts > 1)
223-
g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts;
224-
else
225-
g_thread_counts = 1;
226-
GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts);
227-
228-
void * ggmop_ctx = (void*)(handle);
229-
int retval = HAP_power_set(ggmop_ctx, &request);
230-
if (retval) {
231-
GGMLHEXAGON_LOG_DEBUG("failed first power vote");
232-
return AEE_EFAILED;
233-
}
234-
235-
//configure clocks & DCVS mode
236-
memset(&request, 0, sizeof(HAP_power_request_t));
237-
request.type = HAP_power_set_DCVS_v2;
238-
request.dcvs_v2.dcvs_enable = TRUE;
239-
request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level;
240-
if (dcvs_enabled) {
241-
request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE;
242-
request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE;
243-
} else {
244-
request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner;
245-
request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner;
246-
}
247-
request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
248-
request.dcvs_v2.set_dcvs_params = TRUE;
249-
request.dcvs_v2.set_latency = TRUE;
250-
request.dcvs_v2.latency = latency;
251-
retval = HAP_power_set(ggmop_ctx, &request);
252-
if (retval) {
253-
GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode");
254-
return AEE_EFAILED;
255-
}
256-
257-
memset(&request, 0, sizeof(HAP_power_request_t));
258-
request.type = HAP_power_set_HVX;
259-
request.hvx.power_up = TRUE;
260-
retval = HAP_power_set(ggmop_ctx, &request);
261-
if (retval) {
262-
GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power");
263-
return AEE_EFAILED;
264-
}
265-
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
266-
return AEE_SUCCESS;
267-
}
268-
269-
// =================================================================================================
270-
// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file
271-
// =================================================================================================
272-
int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
273-
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
274-
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
275-
return 0;
276-
}
277-
278-
int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
279-
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
280-
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
281-
return 0;
282-
}
283-
284-
int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
285-
GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
286-
GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
287-
return 0;
288-
}

scripts/ggml-hexagon.cfg

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
#
2424
[general]
2525
#version of ggml-hexagon.cpp on ARM-AP side
26-
version = "1.05"
26+
version = "1.06"
2727
#version of ggml-dsp.c on cDSP side
28-
ggmldsp_version = "0.62"
28+
ggmldsp_version = "0.63"
2929

3030
#0: HEXAGON_BACKEND_QNNCPU
3131
#1: HEXAGON_BACKEND_QNNGPU

0 commit comments

Comments
 (0)