
Commit a4648c1: Merge 'origin/master' into hipblas

2 parents 4c8b3fb + 0ecb1bb, commit a4648c1

File tree: 6 files changed, +79 -19 lines changed


.github/workflows/build.yml (+3 -3)

@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -187,7 +187,7 @@ jobs:
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
           foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
             $txt = Get-Content -Path $f -Raw
             $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt
 
       - name: Add clblast.dll
         id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
           7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
 
       - name: Upload artifacts

CMakeLists.txt (+1 -1)

@@ -66,7 +66,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")

examples/main/README.md (+1 -1)

@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 ### Prompt Caching
 
-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
 
 ### Quantization
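As an illustration of the documented flag (model path and prompt are placeholders, not from this commit): a first run such as `./main -m models/7B/ggml-model.bin --prompt-cache prompt.bin -p "Once upon a time"` evaluates the prompt and writes prompt.bin; repeating the same command later reuses the cached state instead of re-evaluating the prompt, although, per the added note, the continuation is not guaranteed to match the original generation.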

examples/main/main.cpp (+14 -4)

@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
             return 1;
         }
         session_tokens.resize(n_token_count_out);
+        llama_set_rng_seed(ctx, params.seed);
 
         fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
     } else {
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }
 
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
         }
         n_matching_session_tokens++;
     }
-    if (n_matching_session_tokens >= embd_inp.size()) {
+    if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+        fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+    } else if (n_matching_session_tokens >= embd_inp.size()) {
         fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
     } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
         fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",

ggml.c (+49 -9)

@@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",
@@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
@@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
@@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
 
@@ -13792,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
        GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14510,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
 
@@ -14527,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14541,15 +14581,15 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
         if (perf_total_per_op_us[i] == 0) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");

ggml.h (+11 -1)

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -445,6 +452,7 @@ extern "C" {
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
     GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -970,6 +978,8 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
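The two new context helpers suggest a measurement pattern: size a context for tensor metadata only, with no_alloc set, so that tensors can be created and counted without allocating their data buffers. The sketch below assumes that no_alloc skips data allocation in ggml_new_tensor_impl (as the existing ctx->no_alloc field suggests); it is an illustrative usage, not code from this commit:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const int n_tensors = 64;

        struct ggml_init_params params = {
            // room for the object + tensor headers only, plus a little slack
            .mem_size   = n_tensors * ggml_tensor_overhead() + 1024,
            .mem_buffer = NULL,
            .no_alloc   = true, // create tensors without allocating their data buffers
        };
        struct ggml_context * ctx = ggml_init(params);

        for (int i = 0; i < n_tensors; i++) {
            // with no_alloc, only the metadata lands in the context pool
            struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
            (void) t;
        }

        printf("metadata used: %zu bytes\n", ggml_used_mem(ctx));

        // the flag can also be flipped on an existing context
        ggml_set_no_alloc(ctx, false);

        ggml_free(ctx);
        return 0;
    }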
