Skip to content

Commit 4b561bd

Browse files
committed
backup
1 parent 38f09be commit 4b561bd

11 files changed

+508
-861
lines changed

ggml-sycl.cpp

+292-434
Large diffs are not rendered by default.

ggml-sycl.h

-2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
3636
// TODO: these are temporary
3737
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
3838
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
39-
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
40-
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
4139

4240
// SYCL doesn't support registering host memory, keep here for reference
4341
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);

ggml-sycl/backend.hpp

-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,5 @@
1919
#include "dmmv.hpp"
2020
#include "mmq.hpp"
2121
#include "mmvq.hpp"
22-
#include "pool.hpp"
2322

2423
#endif // GGML_SYCL_BACKEND_HPP

ggml-sycl/common.cpp

-116
Original file line numberDiff line numberDiff line change
@@ -20,122 +20,6 @@ int get_current_device_id() {
2020
return dpct::dev_mgr::instance().current_device_id();
2121
}
2222

23-
void log_ggml_var_device(
24-
const char* name,
25-
float* src,
26-
size_t total_elements,
27-
bool src_on_device) {
28-
if (!g_ggml_sycl_debug)
29-
return;
30-
if (!src) {
31-
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
32-
return;
33-
}
34-
char filename[1024];
35-
sprintf(filename, "%s.txt", name);
36-
printf("GGML Tensor:%s save to %s\n", name, filename);
37-
38-
size_t total_size = total_elements * sizeof(float);
39-
float* local_buf = NULL;
40-
if (src_on_device) {
41-
local_buf = (float*)ggml_sycl_host_malloc(total_size);
42-
ggml_sycl_set_device(g_main_device);
43-
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
44-
main_stream->memcpy(local_buf, src, total_size).wait();
45-
} else {
46-
local_buf = (float*)src;
47-
}
48-
49-
std::ofstream logfile;
50-
logfile.open(filename);
51-
for (size_t i = 0; i < total_elements; i++) {
52-
logfile << local_buf[i] << " ";
53-
if ((i + 1) % 20 == 0)
54-
logfile << std::endl;
55-
}
56-
logfile << std::endl;
57-
logfile.close();
58-
59-
if (src_on_device)
60-
ggml_sycl_host_free(local_buf);
61-
}
62-
63-
void log_ggml_var_device_fp16(
64-
const char* name,
65-
sycl::half* src,
66-
size_t total_elements,
67-
bool src_on_device) {
68-
if (!g_ggml_sycl_debug)
69-
return;
70-
if (!src) {
71-
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
72-
return;
73-
}
74-
char filename[1024];
75-
sprintf(filename, "%s.txt", name);
76-
printf("GGML Tensor:%s save to %s\n", name, filename);
77-
78-
size_t total_size = total_elements * sizeof(sycl::half);
79-
sycl::half* local_buf = NULL;
80-
if (src_on_device) {
81-
local_buf = (sycl::half*)ggml_sycl_host_malloc(total_size);
82-
ggml_sycl_set_device(g_main_device);
83-
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
84-
main_stream->memcpy(local_buf, src, total_size).wait();
85-
} else {
86-
local_buf = (sycl::half*)src;
87-
}
88-
89-
std::ofstream logfile;
90-
logfile.open(filename);
91-
for (size_t i = 0; i < total_elements; i++) {
92-
logfile << local_buf[i] << " ";
93-
if ((i + 1) % 20 == 0)
94-
logfile << std::endl;
95-
}
96-
logfile << std::endl;
97-
logfile.close();
98-
99-
if (src_on_device)
100-
ggml_sycl_host_free(local_buf);
101-
}
102-
103-
void print_ggml_tensor(const char* name, struct ggml_tensor* src) {
104-
if (!g_ggml_sycl_debug)
105-
return;
106-
if (!src) {
107-
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
108-
return;
109-
}
110-
111-
size_t total_elements = ggml_nelements(src);
112-
113-
const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU ||
114-
src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
115-
float* src_data = NULL;
116-
if (src_on_device) {
117-
ggml_tensor_extra_gpu* src_extra = (ggml_tensor_extra_gpu*)src->extra;
118-
src_data = (float*)src_extra->data_device[g_main_device];
119-
} else {
120-
src_data = (float*)src->data;
121-
}
122-
123-
log_ggml_var_device(name, src_data, total_elements, src_on_device);
124-
}
125-
126-
void log_tensor_with_cnt(
127-
const char* name,
128-
struct ggml_tensor* src,
129-
int stop_cnt) {
130-
stop_cnt = 4;
131-
if (log_file_name_idx >= stop_cnt)
132-
return;
133-
char filename[1280];
134-
sprintf(filename, "%s_%07d", name, log_file_name_idx);
135-
log_file_name_idx++;
136-
print_ggml_tensor(filename, src);
137-
}
138-
13923
void* ggml_sycl_host_malloc(size_t size) try {
14024
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
14125
return nullptr;

ggml-sycl/common.hpp

+150-49
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ static int g_work_group_size = 0;
7878
#define GGML_SYCL_MMV_Y 1
7979
#endif
8080

81+
typedef sycl::queue *queue_ptr;
82+
8183
enum ggml_sycl_backend_gpu_mode {
8284
SYCL_UNSET_GPU_MODE = -1,
8385
SYCL_SINGLE_GPU_MODE = 0,
@@ -182,17 +184,6 @@ static_assert(
182184
#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
183185

184186
#define MUL_MAT_SRC1_COL_STRIDE 128
185-
#define MAX_STREAMS 8
186-
#define SYCL_MAX_DEVICES 48
187-
188-
static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};
189-
190-
struct ggml_tensor_extra_gpu {
191-
void* data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
192-
// tensors
193-
dpct::event_ptr events[SYCL_MAX_DEVICES]
194-
[MAX_STREAMS]; // events for synchronizing multiple GPUs
195-
};
196187

197188
class sycl_gpu_mgr {
198189
public:
@@ -320,7 +311,7 @@ class sycl_gpu_mgr {
320311
}
321312
};
322313

323-
static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
314+
static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
324315
static int g_device_count = -1;
325316
static int g_all_sycl_device_count = -1;
326317
static int g_main_device = -1;
@@ -329,31 +320,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;
329320

330321
static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};
331322

332-
static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
323+
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
333324

334325
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
335326
SYCL_UNSET_GPU_MODE;
336327

337-
struct sycl_device_capabilities {
338-
int cc; // compute capability
339-
bool vmm; // virtual memory support
340-
size_t vmm_granularity; // granularity of virtual memory
341-
int device_id;
342-
};
343-
344-
static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
345-
{0, false, 0, -1}};
346-
347-
struct sycl_device_id2index {
348-
int index;
349-
};
350-
351328
static void* g_scratch_buffer = nullptr;
352329
static size_t g_scratch_size = 0; // disabled by default
353330
static size_t g_scratch_offset = 0;
354331

355-
static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};
356-
357332
int get_main_device();
358333

359334
[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
@@ -427,25 +402,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
427402
std::exit(1);
428403
}
429404

430-
void log_ggml_var_device(
431-
const char* name,
432-
float* src,
433-
size_t total_elements,
434-
bool src_on_device);
435-
436-
void log_ggml_var_device_fp16(
437-
const char* name,
438-
sycl::half* src,
439-
size_t total_elements,
440-
bool src_on_device);
441-
442-
// todo: debug for crash in some case
443-
void print_ggml_tensor(const char* name, struct ggml_tensor* src);
444-
445-
static int log_file_name_idx = 0;
446-
void log_tensor_with_cnt(
447-
const char* name,
448-
struct ggml_tensor* src,
449-
int stop_cnt);
405+
//////////////////////
406+
407+
struct ggml_sycl_device_info {
408+
int device_count;
409+
410+
struct sycl_device_info {
411+
int cc; // compute capability
412+
// int nsm; // number of streaming multiprocessors
413+
// size_t smpb; // max. shared memory per block
414+
bool vmm; // virtual memory support
415+
size_t total_vram;
416+
};
417+
418+
sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
419+
420+
std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
421+
};
422+
423+
const ggml_sycl_device_info & ggml_sycl_info();
424+
425+
struct ggml_sycl_pool {
426+
virtual ~ggml_sycl_pool() = default;
427+
428+
virtual void * alloc(size_t size, size_t * actual_size) = 0;
429+
virtual void free(void * ptr, size_t size) = 0;
430+
};
431+
432+
template<typename T>
433+
struct ggml_sycl_pool_alloc {
434+
ggml_sycl_pool * pool = nullptr;
435+
T * ptr = nullptr;
436+
size_t actual_size = 0;
437+
438+
explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
439+
}
440+
441+
ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
442+
alloc(size);
443+
}
444+
445+
~ggml_sycl_pool_alloc() {
446+
if (ptr != nullptr) {
447+
pool->free(ptr, actual_size);
448+
}
449+
}
450+
451+
// size is in number of elements
452+
T * alloc(size_t size) {
453+
GGML_ASSERT(pool != nullptr);
454+
GGML_ASSERT(ptr == nullptr);
455+
ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
456+
return ptr;
457+
}
458+
459+
T * alloc(ggml_sycl_pool & pool, size_t size) {
460+
this->pool = &pool;
461+
return alloc(size);
462+
}
463+
464+
T * get() {
465+
return ptr;
466+
}
467+
468+
ggml_sycl_pool_alloc() = default;
469+
ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
470+
ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
471+
ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
472+
ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
473+
};
474+
475+
// backend interface
476+
477+
struct ggml_tensor_extra_gpu {
478+
void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
479+
// tensors
480+
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
481+
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
482+
};
483+
484+
struct ggml_backend_sycl_context {
485+
int device;
486+
std::string name;
487+
488+
queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
489+
static sycl::handler * sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr};
490+
491+
explicit ggml_backend_sycl_context(int device) :
492+
device(device),
493+
name(GGML_SYCL_NAME + std::to_string(device)) {
494+
}
495+
496+
~ggml_backend_sycl_context() {
497+
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
498+
for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
499+
if (streams[i][j] != nullptr) {
500+
SYCL_CHECK(free(streams[i][j]));
501+
}
502+
}
503+
if (cublas_handles[i] != nullptr) {
504+
SYCL_CHECK(free(sycl_handles[i]));
505+
}
506+
}
507+
}
508+
509+
queue_ptr stream(int device, int stream) {
510+
if (qptrs[device][stream] == nullptr) {
511+
SYCL_CHECK(dpct::get_current_device().create_queue(
512+
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
513+
}
514+
return qptrs[device][stream];
515+
}
516+
517+
cudaStream_t stream() {
518+
return stream(device, 0);
519+
}
520+
521+
cublasHandle_t sycl_handle(int device) {
522+
if (sycl_handles[device] == nullptr) {
523+
const dpct::queue_ptr stream = streams[device][0];
524+
// create sycl handle
525+
SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
526+
}
527+
return sycl_handles[device];
528+
}
529+
530+
cublasHandle_t sycl_handle() {
531+
return sycl_handle(device);
532+
}
533+
534+
// pool
535+
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
536+
537+
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
538+
539+
ggml_sycl_pool & pool(int device) {
540+
if (pools[device] == nullptr) {
541+
pools[device] = new_pool_for_device(qptrs[device][0], device);
542+
}
543+
return *pools[device];
544+
}
545+
546+
ggml_sycl_pool & pool() {
547+
return pool(device);
548+
}
549+
};
550+
450551

451552
#endif // GGML_SYCL_COMMON_HPP

ggml-sycl/convert.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "convert.hpp"
22
#include "dequantize.hpp"
3-
3+
#include "presets.hpp"
44

55
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
66
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,

0 commit comments

Comments
 (0)