Commit d571d16

Merge 'origin/master' into hipblas
2 parents: 608aa33 + dd0eabc

14 files changed: +1128 -880 lines changed

.github/workflows/build.yml (+5 -5)

@@ -19,8 +19,8 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04
 
     steps:
       - name: Clone
@@ -31,12 +31,12 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8
 
       - name: Build
         id: make_build
         run: |
-          make
+          CC=gcc-8 make
 
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
@@ -216,7 +216,7 @@ jobs:
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
      - ubuntu-latest-cmake
      - macOS-latest-make
      - macOS-latest-cmake

Makefile (+2 -2)

@@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS += ggml-cuda.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_HIPBLAS
 	ROCM_PATH ?= /opt/rocm

README.md (+1 -1)

@@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
 
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 

examples/CMakeLists.txt (+1)

@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()

examples/common.cpp (+1 -3)

@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
         } else if (arg == "--color") {

examples/common.h (+1 -1)

@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive = false; // interactive mode
 
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
 
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos

examples/main/README.md (+13 -3)

@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
 
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
 For an interactive experience, try this command:
 
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
 
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
 ## Common Options
 
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 
 - `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
 
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 
 ## Context Management
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
 
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 
 ### RNG Seed
 
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 
 - `--temp N`: Adjust the randomness of the generated text (default: 0.8).
 
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 
 Example usage: `--temp 0.8`
 
@@ -177,5 +187,5 @@ These options provide extra functionality and customization when running the LLa
 - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 - `--verbose-prompt`: Print the prompt before generating text.
 - `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
-- `--lora FNAME`: Apply a LoRA (Layer-wise Relevance Approximation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
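
The "Technical detail" paragraph added above describes a small prompt protocol, and the examples/main/main.cpp diff below shows the ``### Instruction:`` prefix being registered as the reverse prompt. As a rough illustration of that protocol only, here is a minimal C++ sketch of how a single instruct turn could be assembled; the helper name and the exact surrounding whitespace are assumptions for illustration, not code from this commit:

```cpp
#include <string>

// Sketch of the instruct-mode turn described above: non-empty user input is wrapped
// in an "### Instruction:" prefix (the reverse prompt) and a "### Response:" suffix.
// Helper name and exact whitespace are illustrative assumptions.
std::string build_instruct_turn(const std::string & user_input,
                                const std::string & reverse_prompt = "### Instruction:") {
    if (user_input.empty()) {
        // just pressing Return adds nothing, so the model keeps extending its response
        return "";
    }
    return "\n\n" + reverse_prompt + "\n\n" + user_input + "\n\n### Response:\n\n";
}
```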

examples/main/main.cpp (+3 -3)

@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }
 
@@ -246,7 +246,7 @@
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
 
     bool is_antiprompt = false;
examples/save-load-state/CMakeLists.txt (new file, +4)

@@ -0,0 +1,4 @@
+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/save-load-state/save-load-state.cpp (new file, +128)

@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        return 1;
+    }
+
+    // evaluate prompt
+
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    FILE *fp_write = fopen("dump_state.bin", "wb");
+    auto state_size = llama_get_state_size(ctx);
+    auto state_mem = new uint8_t[state_size];
+    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+    fwrite(state_mem, 1, state_size, fp_write);
+    fclose(fp_write);
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    FILE *fp_read = fopen("dump_state.bin", "rb");
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    }
+    fread(state_mem, 1, state_size, fp_read);
+    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+    fclose(fp_read);
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            &last_n_tokens_data.back() - params.repeat_last_n,
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    return 0;
+}
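
The new example above is built around three calls: `llama_get_state_size`, `llama_copy_state_data`, and `llama_set_state_data`. For reference, a condensed sketch of the same round trip, keeping the serialized state in memory rather than in `dump_state.bin`; the helper names are illustrative and the call signatures are assumed to match those used in the example above:

```cpp
#include <cstdint>
#include <vector>

#include "llama.h"

// Serialize the context state (rng, logits, embedding and kv_cache) into a buffer,
// mirroring what save-load-state.cpp writes to dump_state.bin.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data());
    return state;
}

// Restore a previously saved state into a context created with the same parameters.
static bool restore_state(llama_context * ctx, std::vector<uint8_t> & state) {
    if (state.size() != llama_get_state_size(ctx)) {
        return false; // size mismatch: the context was set up differently
    }
    llama_set_state_data(ctx, state.data());
    return true;
}
```

As in the example, any sampling bookkeeping kept outside the context (here, `last_n_tokens_data` and `n_past`) still has to be saved and restored by the caller.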
