@@ -78,6 +78,8 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+typedef sycl::queue *queue_ptr;
+
 enum ggml_sycl_backend_gpu_mode {
     SYCL_UNSET_GPU_MODE = -1,
     SYCL_SINGLE_GPU_MODE = 0,
@@ -182,17 +184,6 @@ static_assert(
 #endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
-#define MAX_STREAMS 8
-#define SYCL_MAX_DEVICES 48
-
-static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
-                                          // tensors
-    dpct::event_ptr events[SYCL_MAX_DEVICES]
-                          [MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
 
 class sycl_gpu_mgr {
   public:
@@ -320,7 +311,7 @@ class sycl_gpu_mgr {
     }
 };
 
-static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
+static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
 static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
@@ -329,31 +320,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
-static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
+static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
 static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
     SYCL_UNSET_GPU_MODE;
 
-struct sycl_device_capabilities {
-    int cc;                 // compute capability
-    bool vmm;               // virtual memory support
-    size_t vmm_granularity; // granularity of virtual memory
-    int device_id;
-};
-
-static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
-    {0, false, 0, -1}};
-
-struct sycl_device_id2index {
-    int index;
-};
-
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
-static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};
-
 int get_main_device();
 
 [[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
@@ -427,25 +402,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
     std::exit(1);
 }
 
-void log_ggml_var_device(
-    const char * name,
-    float * src,
-    size_t total_elements,
-    bool src_on_device);
-
-void log_ggml_var_device_fp16(
-    const char * name,
-    sycl::half* src,
-    size_t total_elements,
-    bool src_on_device);
-
-// todo: debug for crash in some case
-void print_ggml_tensor(const char * name, struct ggml_tensor * src);
-
-static int log_file_name_idx = 0;
-void log_tensor_with_cnt(
-    const char * name,
-    struct ggml_tensor * src,
-    int stop_cnt);
+// ////////////////////
+
+struct ggml_sycl_device_info {
+    int device_count;
+
+    struct sycl_device_info {
+        int cc;                 // compute capability
+        // int nsm;             // number of streaming multiprocessors
+        // size_t smpb;         // max. shared memory per block
+        bool vmm;               // virtual memory support
+        size_t total_vram;
+    };
+
+    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_sycl_device_info & ggml_sycl_info();
+
+struct ggml_sycl_pool {
+    virtual ~ggml_sycl_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template <typename T>
+struct ggml_sycl_pool_alloc {
+    ggml_sycl_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+    }
+
+    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_sycl_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_sycl_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_sycl_pool_alloc() = default;
+    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+};
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                               // tensors
+    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+
+    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+    queue_ptr sycl_handles[GGML_SYCL_MAX_DEVICES] = { nullptr };
+
+    explicit ggml_backend_sycl_context(int device) :
+        device(device),
+        name(GGML_SYCL_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_sycl_context() {
+        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+            for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
+                if (qptrs[i][j] != nullptr) {
+                    dpct::get_current_device().destroy_queue(qptrs[i][j]);
+                }
+            }
+            // sycl_handles[i] aliases a queue owned by qptrs, so it is not
+            // released separately
+        }
+    }
+
+    queue_ptr stream(int device, int stream) {
+        if (qptrs[device][stream] == nullptr) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                qptrs[device][stream] = dpct::get_current_device().create_queue(
+                    g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+        }
+        return qptrs[device][stream];
+    }
+
+    queue_ptr stream() {
+        return stream(device, 0);
+    }
+
+    queue_ptr sycl_handle(int device) {
+        if (sycl_handles[device] == nullptr) {
+            const dpct::queue_ptr stream = qptrs[device][0];
+            // create sycl handle
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
+        }
+        return sycl_handles[device];
+    }
+
+    queue_ptr sycl_handle() {
+        return sycl_handle(device);
+    }
+
+    // pool
+    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+    ggml_sycl_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(qptrs[device][0], device);
+        }
+        return *pools[device];
+    }
+
+    ggml_sycl_pool & pool() {
+        return pool(device);
+    }
+};
+
 
 #endif // GGML_SYCL_COMMON_HPP
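
For readers of the diff, ggml_sycl_pool_alloc<T> added above is an RAII wrapper over ggml_sycl_pool: it takes size * sizeof(T) bytes from the pool on construction (or via alloc()) and hands the buffer back in its destructor. The sketch below is illustrative only and not part of this change; it assumes the declarations from this header are visible and that a ggml_backend_sycl_context has already been created for the active device. The function name and element count are placeholders.

    // Hypothetical usage sketch, assuming the declarations above are in scope.
    static void example_tmp_buffer(ggml_backend_sycl_context * ctx, size_t n_elements) {
        // borrow a temporary fp32 device buffer of n_elements from the per-device pool
        ggml_sycl_pool_alloc<float> tmp(ctx->pool(), n_elements);

        float *   dst_dd  = tmp.get();      // device pointer, valid until tmp goes out of scope
        queue_ptr stream0 = ctx->stream();  // default stream, i.e. stream(device, 0)

        // ... enqueue SYCL kernels on stream0 that read/write dst_dd ...

        // tmp's destructor returns the buffer to the pool here
    }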