
Commit 188d40f

ggerganov authored and iThalay committed

sync : ggml (ggml-org#2001)

* sync : update scripts
* sync : ggml
* talk-llama : sync llama.cpp
* make : WHISPER_CUBLAS -> WHISPER_CUDA
* ci : try to fix sycl build
* talk-llama : fix make build

1 parent 260eeb8 commit 188d40f

89 files changed: +16433 -13207 lines

CMakeLists.txt (+12 -4)

```diff
@@ -74,7 +74,8 @@ else()
 option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
 option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
 option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
-option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
+option(WHISPER_CUDA "whisper: support for CUDA" OFF)
+option(WHISPER_CUBLAS "whisper: support for CUDA (deprecated)" OFF)
 option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
 option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
 option(WHISPER_SYCL "whisper: use SYCL" OFF)
@@ -240,6 +241,11 @@ if (WHISPER_BLAS)
 endif ()
 
 if (WHISPER_CUBLAS)
+    message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
+    set(WHISPER_CUDA ON)
+endif()
+
+if (WHISPER_CUDA)
     cmake_minimum_required(VERSION 3.17)
 
     find_package(CUDAToolkit)
@@ -249,9 +255,11 @@ if (WHISPER_CUBLAS)
 
     enable_language(CUDA)
 
-    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+    file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
+    list(APPEND GGML_SOURCES_CUDA ggml-cuda.h)
+    list(APPEND GGML_SOURCES_CUDA ggml-cuda.cu)
 
-    add_compile_definitions(GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_USE_CUDA)
 
     if (WHISPER_STATIC)
         if (WIN32)
@@ -286,7 +294,7 @@ if (WHISPER_HIPBLAS)
 
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
         set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
```
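Note the deprecation shim: the old `WHISPER_CUBLAS` option is kept but now only warns and forwards to `WHISPER_CUDA`, so existing configure scripts keep working. For reference, a minimal configure-and-build sketch with the new option, assuming a CUDA toolkit and CMake 3.17+ are installed (the `build` directory name is arbitrary):

```
cmake -B build -DWHISPER_CUDA=ON
cmake --build build -j
```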

Makefile (+28 -8)

```diff
@@ -216,35 +216,48 @@ ifdef WHISPER_OPENBLAS
 endif
 
 ifdef WHISPER_CUBLAS
+	# WHISPER_CUBLAS is deprecated and will be removed in the future
+	WHISPER_CUDA := 1
+endif
+
+ifdef WHISPER_CUDA
 	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
 		CUDA_ARCH_FLAG ?= native
 	else
 		CUDA_ARCH_FLAG ?= all
 	endif
 
-	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	CFLAGS      += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	CXXFLAGS    += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	WHISPER_OBJ += ggml-cuda.o
+	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
 
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 
 ifdef WHISPER_HIPBLAS
 	ROCM_PATH   ?= /opt/rocm
 	HIPCC       ?= $(ROCM_PATH)/bin/hipcc
 	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	LDFLAGS     += -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	WHISPER_OBJ += ggml-cuda.o
+	WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif
 
@@ -309,6 +322,13 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
+ifdef WHISPER_CUBLAS
+$(info !!!!)
+$(info WHISPER_CUBLAS is deprecated and will be removed in the future. Use WHISPER_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -410,8 +430,8 @@ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples
```
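The key change here is that the CUDA sources moved from a single `ggml-cuda.cu` into a `ggml-cuda/` directory, so the Makefile now derives the object list from a glob and compiles it with one pattern rule. A minimal sketch of that idiom, with hypothetical file names (the real ones are whatever `.cu` files live in `ggml-cuda/`):

```make
# If ggml-cuda/ contained add.cu and mul.cu (illustrative only), then
# $(wildcard ggml-cuda/*.cu) expands to "ggml-cuda/add.cu ggml-cuda/mul.cu"
# and patsubst maps each %.cu to %.o, giving "ggml-cuda/add.o ggml-cuda/mul.o".
OBJS := $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))

# One pattern rule then covers every such object: $< is the matched
# .cu source, $@ is the .o target being built.
ggml-cuda/%.o: ggml-cuda/%.cu
	nvcc -c $< -o $@
```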

README.md (+2 -2)

````diff
@@ -414,11 +414,11 @@ For more information about the Core ML implementation please refer to PR [#1037]
 With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.
 First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
 
-Now build `whisper.cpp` with cuBLAS support:
+Now build `whisper.cpp` with CUDA support:
 
 ```
 make clean
 WHISPER_CUDA=1 make -j
 ```
 
 ## OpenCL GPU support via CLBlast
````

(The build command inside the fenced block changes from `WHISPER_CUBLAS=1 make -j` to `WHISPER_CUDA=1 make -j`.)
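Per the Makefile changes above, the old spelling is still accepted for now: it forwards to `WHISPER_CUDA` and prints the `$(info ...)` deprecation banner at build time.

```
make clean
WHISPER_CUBLAS=1 make -j   # deprecated alias; prefer WHISPER_CUDA=1
```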

examples/common-ggml.cpp (+4)

```diff
@@ -70,6 +70,7 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_IQ1_S:
         case GGML_FTYPE_MOSTLY_IQ4_NL:
         case GGML_FTYPE_MOSTLY_IQ4_XS:
+        case GGML_FTYPE_MOSTLY_IQ1_M:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -193,6 +194,8 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_Q8_1:
         case GGML_TYPE_Q8_K:
         case GGML_TYPE_IQ2_XXS:
@@ -203,6 +206,7 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_IQ1_M:
         case GGML_TYPE_COUNT:
             {
                 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
```
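These hunks follow the usual pattern when the ggml sync introduces new enum values (`GGML_TYPE_IQ1_M`, `GGML_TYPE_I64`, `GGML_TYPE_F64`): every switch that enumerates the type values must gain a matching case, here routed to the error branch since the quantizer does not handle them. A minimal sketch of the pattern, using a made-up enum rather than the real ggml definitions:

```cpp
#include <cstdio>

// Illustrative enum only -- not the actual ggml_type definition.
enum class tensor_type { F32, Q8_0, IQ1_M /* newly added */, COUNT };

// A switch that lists every value must be extended alongside the enum;
// otherwise the new value would silently fall through to default handling.
// IQ1_M goes to the "unsupported" branch, mirroring ggml_common_quantize_0().
bool type_supported(tensor_type t) {
    switch (t) {
        case tensor_type::F32:
        case tensor_type::Q8_0:
            return true;
        case tensor_type::IQ1_M: // new case added with the enum value
        case tensor_type::COUNT:
            fprintf(stderr, "unsupported tensor type %d\n", (int) t);
            return false;
    }
    return false; // unreachable while the switch stays exhaustive
}
```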

examples/talk-llama/CMakeLists.txt (+1 -1)

```diff
@@ -1,7 +1,7 @@
 if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp)
+    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
 
     if (WHISPER_CLBLAST)
```
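This mirrors the `talk-llama` Makefile change above: the llama.cpp sync splits unicode data into a separate `unicode-data.cpp`, so both build systems must list the new translation unit for the symbols it defines to resolve at link time.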
