
Commit 04419f1

Merge 'origin/master' into hipblas
2 parents bb16eff + d3494bb commit 04419f1

21 files changed: +934, -248 lines

.gitignore

+2-1
```
@@ -1,5 +1,6 @@
 *.o
 *.a
+*.so
 .DS_Store
 .build/
 .cache/
@@ -39,8 +40,8 @@ models/*
 /vdot
 /server
 /Pipfile
+/embd-input-test
 /libllama.so
-
 build-info.h
 arm_neon.h
 compile_commands.json
```

CMakeLists.txt

+1-1
```
@@ -333,9 +333,9 @@ if (LLAMA_HIPBLAS)
     add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
     set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::hipblas)
-    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
```

Makefile

+9-2
```
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test

 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server
@@ -295,7 +295,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h

 #
 # Examples
@@ -328,6 +328,13 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

+libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
+
+embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

```

convert-lora-to-ggml.py

+5-1
```
@@ -113,14 +113,18 @@ def write_tensor_header(

 write_file_header(fout, params)
 for k, v in model.items():
+    if k.endswith(".default.weight"):
+        k = k.replace(".default.weight", ".weight")
+    if k in ["llama_proj.weight", "llama_proj.bias"]:
+        continue
     if k.endswith("lora_A.weight"):
         if v.dtype != torch.float16 and v.dtype != torch.float32:
             v = v.float()
         v = v.T
     else:
         v = v.float()

-    t = v.numpy()
+    t = v.detach().numpy()
     tname = translate_tensor_name(k)
     print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
     write_tensor_header(fout, tname, t.shape, t.dtype)
```
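A note on the `v.detach().numpy()` change above: PyTorch refuses to convert a tensor that still tracks gradients, which is common for LoRA weights loaded straight from a training checkpoint. A minimal, self-contained sketch of the failure and the fix (the 4x4 parameter is only a stand-in for a real LoRA weight):

```
import torch

# Stand-in for a LoRA weight that still has requires_grad=True.
v = torch.nn.Parameter(torch.randn(4, 4))

try:
    t = v.numpy()  # RuntimeError: can't call numpy() on a tensor that requires grad
except RuntimeError as err:
    print("direct .numpy() failed:", err)

t = v.detach().numpy()  # detach first, as the patched script now does
print(t.shape, t.dtype)
```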

examples/CMakeLists.txt

+1
```
@@ -39,6 +39,7 @@ else()
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(simple)
+    add_subdirectory(embd-input)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
```

examples/common.cpp

-7
```
@@ -416,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         exit(1);
     }

-#ifdef GGML_USE_CUBLAS
-    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
-        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
-        exit(1);
-    }
-#endif // GGML_USE_CUBLAS
-
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
```

examples/embd-input/.gitignore

+4
```
@@ -0,0 +1,4 @@
+PandaGPT
+MiniGPT-4
+*.pth
+
```
examples/embd-input/CMakeLists.txt

+15
```
@@ -0,0 +1,15 @@
+set(TARGET embdinput)
+add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET embd-input-test)
+add_executable(${TARGET} embd-input-test.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
```

examples/embd-input/README.md

+63
### Examples for directly providing input embeddings

## Requirements

Build `libembdinput.so` by running the following command in the main directory (../../):

```
make
```
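The Python examples below presumably consume this shared library. As a minimal sketch of loading it from Python (only the load step is shown; the exported entry points are declared in `examples/embd-input/embd-input.h` and are not called here to avoid assuming their signatures):

```
import ctypes

# Hedged sketch: load the shared library produced by `make` above.
# Run from the repository root so ./libembdinput.so resolves.
lib = ctypes.CDLL("./libembdinput.so")
print("libembdinput loaded:", lib)
```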
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)

1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. Extract `llava_projection.pth` from [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin):

```
import torch

bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd_input/llava_projection.pth"

# Keep only the multimodal projector weights that llava.py needs.
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the paths of the LLaVA model and `llava_projection.pth` in `llava.py`.

## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)

1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format (a sketch for creating `adapter_config.json` follows this list).
The `adapter_config.json` is
```
{
  "peft_type": "LORA",
  "fan_in_fan_out": false,
  "bias": null,
  "modules_to_save": null,
  "r": 32,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Prepare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirements of PandaGPT.
6. Check the paths of the PandaGPT source, ImageBind model, LoRA model and vicuna model in panda_gpt.py.

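If the downloaded checkpoint does not ship with the `adapter_config.json` shown above, a small helper like the following can write it next to `adapter_model.bin` before conversion. This is a hedged sketch: the directory name is hypothetical, and the converter invocation in the comment is an assumption based on the steps above, so check the script's usage first.

```
import json
from pathlib import Path

# Hypothetical directory holding the renamed PandaGPT checkpoint (see step 1).
lora_dir = Path("./PandaGPT-lora")
lora_dir.mkdir(parents=True, exist_ok=True)

adapter_config = {
    "peft_type": "LORA",
    "fan_in_fan_out": False,
    "bias": None,
    "modules_to_save": None,
    "r": 32,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
}

# Write the config shown above next to adapter_model.bin, then convert with
# something like: python ../../convert-lora-to-ggml.py <lora_dir>  (assumed invocation)
(lora_dir / "adapter_config.json").write_text(json.dumps(adapter_config, indent=2))
```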
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)

1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirements of MiniGPT-4.
4. Prepare the `vicuna` v0 model.
5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
