llama.swiftui : add bench functionality #4483

Merged

merged 12 commits on Dec 17, 2023

3 changes: 3 additions & 0 deletions .editorconfig
@@ -23,3 +23,6 @@ insert_final_newline = unset

[examples/server/public/*]
indent_size = 2

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
1 change: 1 addition & 0 deletions examples/llama.swiftui/.gitignore
@@ -1 +1,2 @@
xcuserdata
xcshareddata
182 changes: 157 additions & 25 deletions examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -6,16 +6,34 @@ enum LlamaError: Error {
case couldNotInitializeContext
}

func llama_batch_clear(_ batch: inout llama_batch) {
batch.n_tokens = 0
}

func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
batch.token [Int(batch.n_tokens)] = id
batch.pos [Int(batch.n_tokens)] = pos
batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
for i in 0..<seq_ids.count {
batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
}
batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0

batch.n_tokens += 1
}
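
These two helpers assume the batch buffers were already allocated up front (the context's init, partly collapsed in this hunk, is expected to call llama_batch_init), so they only write into the existing arrays and advance n_tokens. A minimal usage sketch, assuming the three-argument llama_batch_init(n_tokens, embd, n_seq_max) signature and illustrative token ids:

// Sketch: build a batch for a 3-token prompt, requesting logits only for the last token.
var batch = llama_batch_init(512, 0, 1)    // capacity for 512 tokens, no embeddings, 1 seq id per token
llama_batch_clear(&batch)
let prompt: [llama_token] = [1, 2, 3]      // illustrative token ids
for (i, id) in prompt.enumerated() {
    llama_batch_add(&batch, id, Int32(i), [0], i == prompt.count - 1)
}
// llama_decode(context, batch) would evaluate the prompt here
llama_batch_free(batch)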

actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]

/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]

var n_len: Int32 = 512
var n_len: Int32 = 64
var n_cur: Int32 = 0

var n_decode: Int32 = 0

init(model: OpaquePointer, context: OpaquePointer) {
@@ -27,25 +45,34 @@
}

deinit {
llama_batch_free(batch)
llama_free(context)
llama_free_model(model)
llama_backend_free()
}

static func createContext(path: String) throws -> LlamaContext {
static func create_context(path: String) throws -> LlamaContext {
llama_backend_init(false)
let model_params = llama_model_default_params()
var model_params = llama_model_default_params()

#if targetEnvironment(simulator)
model_params.n_gpu_layers = 0
print("Running on simulator, force use n_gpu_layers = 0")
#endif
let model = llama_load_model_from_file(path, model_params)
guard let model else {
print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext
}

let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
print("Using \(n_threads) threads")

var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = 8
ctx_params.n_threads_batch = 8
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)

let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
@@ -56,6 +83,26 @@
return LlamaContext(model: model, context: context)
}

func model_info() -> String {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
result.initialize(repeating: Int8(0), count: 256)
defer {
result.deallocate()
}

// TODO: this is probably very stupid way to get the string from C

let nChars = llama_model_desc(model, result, 256)
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))

var SwiftString = ""
for char in bufferPointer {
SwiftString.append(Character(UnicodeScalar(UInt8(char))))
}

return SwiftString
}
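
As the TODO above admits, the character-by-character copy is clumsy. Since llama_model_desc writes a null-terminated C string into the (already zero-initialized) buffer, an assumed-equivalent variant could lean on String(cString:) instead:

func model_info() -> String {
    let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
    result.initialize(repeating: Int8(0), count: 256)
    defer {
        result.deallocate()
    }

    // llama_model_desc fills the buffer with a null-terminated description of the model
    _ = llama_model_desc(model, result, 256)

    return String(cString: result)
}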

func get_n_tokens() -> Int32 {
return batch.n_tokens;
}
@@ -79,16 +126,11 @@
print(String(cString: token_to_piece(token: id) + [0]))
}

// batch = llama_batch_init(512, 0) // done in init()
batch.n_tokens = Int32(tokens_list.count)
llama_batch_clear(&batch)

for i1 in 0..<batch.n_tokens {
for i1 in 0..<tokens_list.count {
let i = Int(i1)
batch.token[i] = tokens_list[i]
batch.pos[i] = i1
batch.n_seq_id[Int(i)] = 1
batch.seq_id[Int(i)]![0] = 0
batch.logits[i] = 0
llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -141,18 +183,11 @@
print(new_token_str)
// tokens_list.append(new_token_id)

batch.n_tokens = 0

batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
batch.seq_id[Int(batch.n_tokens)]![0] = 0
batch.logits[Int(batch.n_tokens)] = 1 // true
batch.n_tokens += 1
llama_batch_clear(&batch)
llama_batch_add(&batch, new_token_id, n_cur, [0], true)

n_decode += 1

n_cur += 1

if llama_decode(context, batch) != 0 {
print("failed to evaluate llama!")
@@ -161,14 +196,111 @@
return new_token_str
}

func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
var pp_avg: Double = 0
var tg_avg: Double = 0

var pp_std: Double = 0
var tg_std: Double = 0

for r in 0..<nr {
// bench prompt processing

llama_batch_clear(&batch)

let n_tokens = pp

for i in 0..<n_tokens {
llama_batch_add(&batch, 0, Int32(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true

llama_kv_cache_clear(context)

let t_pp_start = ggml_time_us()

if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}

let t_pp_end = ggml_time_us()

// bench text generation

llama_kv_cache_clear(context)

let t_tg_start = ggml_time_us()

for i in 0..<tg {
llama_batch_clear(&batch)

for j in 0..<pl {
llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
}

if llama_decode(context, batch) != 0 {
print("llama_decode() failed during text generation")
}
}

let t_tg_end = ggml_time_us()

llama_kv_cache_clear(context)

let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0

let speed_pp = Double(pp) / t_pp
let speed_tg = Double(pl*tg) / t_tg

pp_avg += speed_pp
tg_avg += speed_tg

pp_std += speed_pp * speed_pp
tg_std += speed_tg * speed_tg

print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
}

pp_avg /= Double(nr)
tg_avg /= Double(nr)
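// pp_std and tg_std hold running sums of squares up to this point; below (when nr > 1) they are
// turned into the unbiased sample standard deviation, s = sqrt((sum(x^2) - n * mean^2) / (n - 1))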

if nr > 1 {
pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
} else {
pp_std = 0
tg_std = 0
}

let model_desc = model_info();
let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
let backend = "Metal";
let pp_avg_str = String(format: "%.2f", pp_avg);
let tg_avg_str = String(format: "%.2f", tg_avg);
let pp_std_str = String(format: "%.2f", pp_std);
let tg_std_str = String(format: "%.2f", tg_std);

var result = ""

result += String("| model | size | params | backend | test | t/s |\n")
result += String("| --- | --- | --- | --- | --- | --- |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")

return result;
}
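
A rough usage sketch of the new method from the application side; since LlamaContext is an actor its methods are awaited, and the model path and pp/tg/pl/nr values below are illustrative rather than what the example UI necessarily uses:

Task {
    // create_context is assumed to be handed a path to a local .gguf model file
    let llamaContext = try LlamaContext.create_context(path: "/path/to/model.gguf")
    _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1)                      // short warm-up pass
    let table = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)   // 512-token prompt, 128 generated tokens, 3 runs
    print(table)   // markdown table: | model | size | params | backend | test | t/s |
}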

func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_kv_cache_clear(context)
}

private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
