Commit bacc202

Merge remote-tracking branch 'upstream/concedo'
1 parent b7cb4cf commit bacc202

File tree

6 files changed: +79 -52 lines

CMakeLists.txt

+9-4
@@ -3,9 +3,9 @@
 # IT WILL NOT BE UPDATED OR MAINTAINED !!!

 message(STATUS "============== ============== ==============")
-message(STATUS "WARNING! Do NOT use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
-message(STATUS "It is ONLY for CUBLAS build testing on windows visual studio. IT WILL NOT BE UPDATED OR MAINTAINED !!!")
-message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building AN EXPERIMENAL WINDOWS CUBLAS BUILD! NOTHING ELSE WILL BE SUPPORTED !!!")
+message(STATUS "WARNING! Recommend NOT to use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
+message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL OVERWRITE YOUR EXISTING MAKEFILE !!!")
+message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
 message(STATUS "============== ============== ==============")

 cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
@@ -110,7 +110,12 @@ if (LLAMA_CUBLAS)
 if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
 set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
 else()
-set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
+if(CUDAToolkit_VERSION VERSION_GREATER 12)
+set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+else()
+set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+endif()
 endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

Makefile

+5
@@ -174,6 +174,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_MMQ_Y
+NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
+else
+NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
+endif # LLAMA_CUDA_MMQ_Y
 #ifdef LLAMA_CUDA_CUBLAS
 # NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
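
The new block forwards LLAMA_CUDA_MMQ_Y to nvcc as -DGGML_CUDA_MMQ_Y and falls back to 64 when the variable is unset. As a rough sketch of the receiving side only, the C++ fragment below shows a compile-time constant with that same fallback default; the macro name mirrors the Makefile flag, but this is not taken from ggml-cuda.cu and the real kernels may consume the value differently.

// Illustrative sketch: a tile-size constant supplied via -DGGML_CUDA_MMQ_Y=<n>,
// defaulting to 64 exactly as the Makefile does when LLAMA_CUDA_MMQ_Y is unset.
#include <cstdio>

#ifndef GGML_CUDA_MMQ_Y
#define GGML_CUDA_MMQ_Y 64
#endif

int main() {
    // A real mul-mat-q kernel would use this as a fixed tile height;
    // here we only report the value chosen at compile time.
    std::printf("GGML_CUDA_MMQ_Y = %d\n", GGML_CUDA_MMQ_Y);
    return 0;
}

Invoking make with LLAMA_CUDA_MMQ_Y=128 alongside the usual CUBLAS flags would then take the first branch and override that default.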

klite.embd

+9-6
Large diffs are not rendered by default.

koboldcpp.py

+18-17
@@ -304,7 +304,7 @@ def utfprint(str):
 maxhordelen = 256
 modelbusy = threading.Lock()
 defaultport = 5001
-KcppVersion = "1.39.1"
+KcppVersion = "1.40.1"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
@@ -496,7 +496,7 @@ def do_GET(self):
 laste = handle.get_last_eval_time()
 lastc = handle.get_last_token_count()
 stopreason = handle.get_last_stop_reason()
-response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason}).encode())
+response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "idle":(0 if modelbusy.locked() else 1)}).encode())

 if response_body is None:
 self.send_response(404)
@@ -674,7 +674,7 @@ def show_new_gui():
 root.destroy()
 if not args.model_param:
 print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)
 return

@@ -1306,7 +1306,7 @@ def display_help():

 if nextstate==0:
 print("Exiting by user request.")
-time.sleep(2)
+time.sleep(3)
 sys.exit()
 elif nextstate==2:
 time.sleep(0.1)
@@ -1317,7 +1317,7 @@ def display_help():

 if not args.model_param:
 print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)

 def show_gui_warning(issue=None):
@@ -1329,7 +1329,7 @@ def show_gui_warning(issue=None):
 messagebox.showerror(title="No Backends Available!", message="KoboldCPP couldn't locate any backends to use.\n\nTo use the program, please run the 'make' command from the directory.")
 root.destroy()
 print("No Backend Available (i.e Default, OpenBLAS, CLBlast, CuBLAS). To use the program, please run the 'make' command from the directory.")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)
 else:
 messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.")
@@ -1423,7 +1423,7 @@ def onDropdownChange(event):

 if launchclicked==False:
 print("Exiting by user request.")
-time.sleep(2)
+time.sleep(3)
 sys.exit()

 #load all the vars
@@ -1479,7 +1479,7 @@ def onDropdownChange(event):
 root.destroy()
 if not args.model_param:
 print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)

 else:
@@ -1489,7 +1489,7 @@ def onDropdownChange(event):
 root.destroy()
 if not args.model_param:
 print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)

 #A very simple and stripped down embedded horde worker with no dependencies
@@ -1534,7 +1534,7 @@ def make_url_request(url, data, method='POST'):
 BRIDGE_AGENT = f"KoboldCppEmbedWorker:1:https://github.com/LostRuins/koboldcpp"
 cluster = "https://horde.koboldai.net"
 while exitcounter < 10:
-time.sleep(2)
+time.sleep(3)
 readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
 if readygo:
 print("Embedded Horde Worker is started.")
@@ -1610,10 +1610,10 @@ def make_url_request(url, data, method='POST'):
 time.sleep(1)
 if exitcounter<100:
 print("Horde Worker Shutdown - Too many errors.")
-time.sleep(2)
+time.sleep(3)
 else:
 print("Horde Worker Shutdown - Server Closing.")
-time.sleep(1)
+time.sleep(2)
 sys.exit(2)

 def main(args):
@@ -1637,7 +1637,7 @@ def main(args):
 except Exception as ex2:
 print("File selection GUI unsupported. Please check command line: script.py --help")
 print("Reason for no GUI: " + str(ex2))
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)

 if args.hordeconfig and args.hordeconfig[0]!="":
@@ -1681,20 +1681,20 @@ def main(args):
 time.sleep(1)
 if not os.path.exists(args.model_param):
 print(f"Cannot find model file: {args.model_param}")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)

 if args.lora and args.lora[0]!="":
 if not os.path.exists(args.lora[0]):
 print(f"Cannot find lora file: {args.lora[0]}")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)
 else:
 args.lora[0] = os.path.abspath(args.lora[0])
 if len(args.lora) > 1:
 if not os.path.exists(args.lora[1]):
 print(f"Cannot find lora base: {args.lora[1]}")
-time.sleep(2)
+time.sleep(3)
 sys.exit(2)
 else:
 args.lora[1] = os.path.abspath(args.lora[1])
@@ -1715,7 +1715,7 @@ def main(args):

 if not loadok:
 print("Could not load model: " + modelname)
-time.sleep(2)
+time.sleep(3)
 sys.exit(3)
 try:
 basepath = os.path.abspath(os.path.dirname(__file__))
@@ -1743,6 +1743,7 @@ def main(args):

 if args.hordeconfig and len(args.hordeconfig)>4:
 horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
+horde_thread.daemon = True
 horde_thread.start()

 print(f"Please connect to custom endpoint at {epurl}")

llama.cpp

+12-7
@@ -158,8 +158,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 { MODEL_7B, 512ull * kB },
 { MODEL_13B, 640ull * kB },
 { MODEL_30B, 768ull * kB },
-{ MODEL_65B, 1280ull * kB },
-{ MODEL_70B, 1280ull * kB },
+{ MODEL_65B, 1360ull * kB },
+{ MODEL_70B, 1360ull * kB },
 };
 return k_sizes;
 }
@@ -173,8 +173,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 { MODEL_7B, 128ull },
 { MODEL_13B, 160ull },
 { MODEL_30B, 208ull },
-{ MODEL_65B, 256ull },
-{ MODEL_70B, 256ull },
+{ MODEL_65B, 320ull },
+{ MODEL_70B, 320ull },
 };
 return k_sizes;
 }
@@ -937,6 +937,11 @@ bool llama_mlock_supported() {
 return llama_mlock::SUPPORTED;
 }

+int get_blas_batch_mul(int batch)
+{
+return (batch>512?(batch>1024?4:2):1);
+}
+
 void llama_backend_init(bool numa) {
 ggml_time_init();

@@ -1042,7 +1047,7 @@ static void llama_model_load_internal(
 void * progress_callback_user_data) {

 model.t_start_us = ggml_time_us();
-size_t blasbatchmul = (n_batch>512?(n_batch>1024?4:2):1);
+size_t blasbatchmul = get_blas_batch_mul(n_batch);

 std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

@@ -1076,7 +1081,7 @@ static void llama_model_load_internal(
 // LLaMAv2
 // TODO: temporary until GGUF
 //patch for llama2 gqa
-if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
+if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
 fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
 n_gqa = 8;
 }
@@ -3248,7 +3253,7 @@ struct llama_context * llama_new_context_with_model(
 params.seed = time(NULL);
 }

-size_t blasbatchmul = (params.n_batch>512?2:1);
+size_t blasbatchmul = get_blas_batch_mul(params.n_batch);

 unsigned cur_percentage = 0;
 if (params.progress_callback == NULL) {
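
The scratch-buffer multiplier that was previously written out inline is now centralized in get_blas_batch_mul(). A minimal standalone C++ sketch: the helper body is copied from the diff, while the small harness around it is illustrative only.

#include <cstdio>

// Helper as introduced in the diff: the multiplier grows with the BLAS batch size.
static int get_blas_batch_mul(int batch) {
    return (batch > 512 ? (batch > 1024 ? 4 : 2) : 1);
}

int main() {
    // Expected output: 256 -> 1, 512 -> 1, 1024 -> 2, 2048 -> 4
    for (int batch : {256, 512, 1024, 2048}) {
        std::printf("n_batch=%4d -> blasbatchmul=%d\n", batch, get_blas_batch_mul(batch));
    }
    return 0;
}

Note that llama_new_context_with_model previously capped the multiplier at 2; routing it through the shared helper raises that to 4 for batch sizes above 1024.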

model_adapter.cpp

+26-18
@@ -133,28 +133,36 @@ void print_tok_vec(std::vector<float> &embd)
 else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
 {
 fileformat = FileFormat::GPT2_1;
-uint32_t temp;
-fin.read((char *)&temp, sizeof(temp)); //ctx
-fin.read((char *)&temp, sizeof(temp)); //n_embd
-fin.read((char *)&temp, sizeof(temp)); //n_head
+uint32_t temp, v1,v2,v3;
+fin.read((char *)&v1, sizeof(temp)); //ctx
+fin.read((char *)&v2, sizeof(temp)); //n_embd
+fin.read((char *)&v3, sizeof(temp)); //n_head
 fin.read((char *)&temp, sizeof(temp)); //n_layer
-fin.read((char *)&temp, sizeof(temp)); //f16
-const int32_t qntvr = temp / 1000;
-temp %= 1000;
-if (qntvr != 0)
+if(vocabsiz==49152 && v1==4096 && v2==2560 && v3==32 && temp==32)
 {
-if (qntvr == 1)
-{
-fileformat = FileFormat::GPT2_3;
-}
-else
-{
-fileformat = FileFormat::GPT2_4;
-}
+//special case, Stablecode Completion Alpha 3B
+fileformat = FileFormat::NEOX_6;
 }
-else if (temp != 0 && temp != 1)
+else
 {
-fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+fin.read((char *)&temp, sizeof(temp)); //f16
+const int32_t qntvr = temp / 1000;
+temp %= 1000;
+if (qntvr != 0)
+{
+if (qntvr == 1)
+{
+fileformat = FileFormat::GPT2_3;
+}
+else
+{
+fileformat = FileFormat::GPT2_4;
+}
+}
+else if (temp != 0 && temp != 1)
+{
+fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+}
 }
 }
 else if(vocabsiz < 31998 || vocabsiz > 33000)
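
The reworked branch keeps ctx, n_embd and n_head in separate variables so that the Stablecode Completion Alpha 3B header (vocab 49152, ctx 4096, n_embd 2560, n_head 32, n_layer 32) can be routed to NEOX_6 before the usual GPT-2 quantization-version checks run. The C++ sketch below restates that detection order in a self-contained form; the FileFormat enum is a stub standing in for the project's real type, and the helper name is invented for illustration.

#include <cstdint>
#include <fstream>

// Stub for illustration; the real FileFormat enum lives elsewhere in the project.
enum class FileFormat { GPT2_1, GPT2_2, GPT2_3, GPT2_4, NEOX_6 };

// Sketch of the detection order from the diff (hypothetical helper, not project code).
static FileFormat detect_gpt2_like(std::ifstream &fin, uint32_t vocabsiz) {
    uint32_t ctx = 0, n_embd = 0, n_head = 0, n_layer = 0, f16 = 0;
    fin.read(reinterpret_cast<char *>(&ctx), sizeof(ctx));
    fin.read(reinterpret_cast<char *>(&n_embd), sizeof(n_embd));
    fin.read(reinterpret_cast<char *>(&n_head), sizeof(n_head));
    fin.read(reinterpret_cast<char *>(&n_layer), sizeof(n_layer));

    // Special case first: this exact header belongs to Stablecode Completion Alpha 3B.
    if (vocabsiz == 49152 && ctx == 4096 && n_embd == 2560 && n_head == 32 && n_layer == 32) {
        return FileFormat::NEOX_6;
    }

    // Otherwise fall back to the GPT-2 quantization-version logic.
    fin.read(reinterpret_cast<char *>(&f16), sizeof(f16));
    const int32_t qntvr = static_cast<int32_t>(f16 / 1000);
    f16 %= 1000;
    if (qntvr != 0) {
        return qntvr == 1 ? FileFormat::GPT2_3 : FileFormat::GPT2_4;
    }
    if (f16 != 0 && f16 != 1) {
        return FileFormat::GPT2_2; // quantized format cannot be legacy type
    }
    return FileFormat::GPT2_1;
}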
