Skip to content

Commit 3c118a6

Browse files
authored
Merge pull request #104 from neonredwood/support-gpt4-omni
Add support for GPT-4-O, "Omni" model
2 parents 045f0e4 + f2e1ac2 commit 3c118a6

File tree

7 files changed: 62 additions and 5 deletions.

js/src/core.ts

+4
Original file line number | Diff line number | Diff line change
@@ -274,6 +274,10 @@ export function getEncodingNameForModel(model: TiktokenModel) {
274274
case "text-embedding-ada-002": {
275275
return "cl100k_base";
276276
}
277+
case "gpt-4o":
278+
case "gpt-4o-2024-05-13": {
279+
return "o200k_base";
280+
}
277281
default:
278282
never(model);
279283
throw new Error("Unknown model");

js/src/index.ts

+3
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import p50k_base from "./ranks/p50k_base";
44
import p50k_edit from "./ranks/p50k_edit";
55
import r50k_base from "./ranks/r50k_base";
66
import cl100k_base from "./ranks/cl100k_base";
7+
import o200k_base from "./ranks/o200k_base";
78

89
import { Tiktoken, getEncodingNameForModel } from "./core";
910
import { never } from "./utils";
@@ -23,6 +24,8 @@ export function getEncoding(
2324
return new Tiktoken(p50k_edit, extendSpecialTokens);
2425
case "cl100k_base":
2526
return new Tiktoken(cl100k_base, extendSpecialTokens);
27+
case "o200k_base":
28+
return new Tiktoken(o200k_base, extendSpecialTokens);
2629
default:
2730
never(encoding);
2831
throw new Error("Unknown encoding");

scripts/ranks.ts

+2-2
Original file line number | Diff line number | Diff line change
@@ -264,8 +264,8 @@ async function main() {
264264
const bpe = await downloadBpe(data);
265265

266266
if (lib === "js") {
267-
bpe.pat_str = bpe.pat_str.replace(
268-
/\(\?i:(.*?)\)/,
267+
bpe.pat_str = bpe.pat_str.replaceAll(
268+
/\(\?i:(.*?)\)/g,
269269
(_, match: string) =>
270270
`(${match
271271
.split("|")

tiktoken/model_to_encoding.json

+3-1
Original file line number | Diff line number | Diff line change
@@ -52,5 +52,7 @@
5252
"gpt-4-turbo-preview": "cl100k_base",
5353
"gpt-4-1106-preview": "cl100k_base",
5454
"gpt-4-0125-preview": "cl100k_base",
55-
"gpt-4-vision-preview": "cl100k_base"
55+
"gpt-4-vision-preview": "cl100k_base",
56+
"gpt-4o": "o200k_base",
57+
"gpt-4o-2024-05-13": "o200k_base"
5658
}

tiktoken/registry.json

+9-1
Original file line number | Diff line number | Diff line change
@@ -46,5 +46,13 @@
4646
"<|endofprompt|>": 100276
4747
},
4848
"pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
49+
},
50+
"o200k_base": {
51+
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
52+
"special_tokens": {
53+
"<|endoftext|>": 199999,
54+
"<|endofprompt|>": 200018
55+
},
56+
"pat_str": "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
4957
}
50-
}
58+
}

wasm/src/lib.rs

+19-1
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,19 @@ impl CoreBPEConstructor {
133133
include_str!("./ranks/cl100k_base.regex.tiktoken"),
134134
)
135135
}
136+
137+
#[cfg(feature = "inline")]
138+
fn o200k_base() -> Self {
139+
let mut special_tokens = HashMap::default();
140+
special_tokens.insert(String::from(ENDOFTEXT), 199999);
141+
special_tokens.insert(String::from(ENDOFPROMPT), 200018);
142+
143+
CoreBPEConstructor::new(
144+
include_str!("./ranks/o200k_base.compress.tiktoken"),
145+
Some(special_tokens),
146+
include_str!("./ranks/o200k_base.regex.tiktoken"),
147+
)
148+
}
136149
}
137150

138151
#[wasm_bindgen]
@@ -179,6 +192,7 @@ impl Tiktoken {
179192
"p50k_base" => Ok(CoreBPEConstructor::p50k_base()),
180193
"p50k_edit" => Ok(CoreBPEConstructor::p50k_edit()),
181194
"cl100k_base" => Ok(CoreBPEConstructor::cl100k_base()),
195+
"o200k_base" => Ok(CoreBPEConstructor::o200k_base()),
182196
&_ => Err(JsError::new("Invalid encoding")),
183197
}?;
184198

@@ -325,7 +339,7 @@ impl Tiktoken {
325339
#[cfg(feature = "inline")]
326340
#[wasm_bindgen(typescript_custom_section)]
327341
const _: &'static str = r#"
328-
export type TiktokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "p50k_edit" | "cl100k_base";
342+
export type TiktokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "p50k_edit" | "cl100k_base" | "o200k_base";
329343
330344
/**
331345
* @param {TiktokenEncoding} encoding
@@ -404,6 +418,8 @@ export type TiktokenModel =
404418
| "gpt-4-1106-preview"
405419
| "gpt-4-0125-preview"
406420
| "gpt-4-vision-preview"
421+
| "gpt-4o"
422+
| "gpt-4o-2024-05-13"
407423
408424
/**
409425
* @param {TiktokenModel} encoding
@@ -474,6 +490,8 @@ pub fn encoding_for_model(
474490
"gpt-4-turbo-2024-04-09" => Ok("cl100k_base"),
475491
"gpt-4-turbo-preview" => Ok("cl100k_base"),
476492
"gpt-4-0125-preview" => Ok("cl100k_base"),
493+
"gpt-4o" => Ok("o200k_base"),
494+
"gpt-4o-2024-05-13" => Ok("o200k_base"),
477495
model => Err(JsError::new(
478496
format!("Invalid model: {}", model.to_string()).as_str(),
479497
)),

wasm/test/test_simple_public.test.ts

+22
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,28 @@ describe("cl100k_base", () => {
5959
});
6060
});
6161

62+
describe("o200k_base", () => {
63+
const enc = get_encoding("o200k_base");
64+
65+
it("encodes hello world string", () => {
66+
expect(enc.encode("hello world")).toStrictEqual(
67+
new Uint32Array([24912, 2375])
68+
);
69+
});
70+
71+
it("decodes hello world string", () => {
72+
expect(
73+
new TextDecoder().decode(enc.decode(new Uint32Array([24912, 2375])))
74+
).toStrictEqual("hello world");
75+
});
76+
77+
it("encodes hello world string, all allowed special characters", () => {
78+
expect(enc.encode("hello <|endoftext|>", "all")).toStrictEqual(
79+
new Uint32Array([24912, 220, 199999])
80+
);
81+
});
82+
});
83+
6284
it("test_simple", () => {
6385
const encodings = [
6486
"gpt2",

0 commit comments

Comments
 (0)