Skip to content

Commit 7cce753

Browse files
authored
Add the components governance file cgmanifest.json for tokenizer's vocab files (#7283)
* Add the governance file cgmanifest.json for tokenizer's vocab files * Address the feedback * apply more schema requirements on the doc
1 parent a9b4212 commit 7cce753

File tree

6 files changed

+61
-6
lines changed

6 files changed

+61
-6
lines changed

THIRD-PARTY-NOTICES.TXT

+2-2
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8686
See the License for the specific language governing permissions and
8787
limitations under the License.
8888

89-
License notice for OpenAI Tiktoken Tokenizer
90-
--------------------------------------------
89+
License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files
90+
----------------------------------------------------------------------
9191

9292
https://github.com/openai/tiktoken/blob/main/LICENSE
9393

cgmanifest.json

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"$schema": "https://json.schemastore.org/component-detection-manifest.json",
3+
"version": 1,
4+
"registrations": [
5+
{
6+
"component": {
7+
"type": "other",
8+
"other": {
9+
"name": "cl100k_base.tiktoken",
10+
"version": "1",
11+
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
12+
"hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c"
13+
}
14+
},
15+
"developmentDependency": false
16+
},
17+
{
18+
"component": {
19+
"type": "other",
20+
"other": {
21+
"name": "o200k_base.tiktoken",
22+
"version": "1",
23+
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
24+
"hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31"
25+
}
26+
},
27+
"developmentDependency": false
28+
},
29+
{
30+
"component": {
31+
"type": "other",
32+
"other": {
33+
"name": "p50k_base.tiktoken",
34+
"version": "1",
35+
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
36+
"hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9"
37+
}
38+
},
39+
"developmentDependency": false
40+
},
41+
{
42+
"component": {
43+
"type": "other",
44+
"other": {
45+
"name": "r50k_base.tiktoken",
46+
"version": "1",
47+
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
48+
"hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808"
49+
}
50+
},
51+
"developmentDependency": false
52+
}
53+
]
54+
}

src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
<!--
1212
The following files are compressed using DeflateStream and embedded as resources in the assembly.
1313
The files are downloaded from the following sources and compressed to the Destination.
14-
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
14+
- gpt2.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
1515
1616
The file is under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
1717
18+
The Gpt2 vocab data is exactly the same as the r50k_base vocab data, but with a different name.
1819
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
1920
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
2021
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.

src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,7 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
11531153
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
11541154
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
11551155
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
1156-
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
1156+
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken". Gpt2 is using the same encoding as R50kBase
11571157
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
11581158

11591159
internal const string Cl100kBaseEncodingName = "cl100k_base";

test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ public void TestMissingDataPackages(string modelName, string packageName)
3434

3535
public static IEnumerable<object[]> ModelUrlData()
3636
{
37+
// Gpt2 is covered by the r50k_base.tiktoken file
3738
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
38-
yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
3939
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
4040
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
4141
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };

test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ public async Task TestTokenizerCreation()
9898
public static IEnumerable<object[]> ModelUrlData()
9999
{
100100
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
101-
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
101+
yield return new object[] { GPT2, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; // GPT2 uses the same encoding as R50kBase
102102
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
103103
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
104104
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };

0 commit comments

Comments
 (0)