Skip to content

Commit 8e1ed19

Browse files
committed
Initial commit
0 parents  commit 8e1ed19

13 files changed

+51279
-0
lines changed

.github/workflows/lint.yml

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
name: Lint
2+
3+
on:
4+
push:
5+
tags:
6+
branches:
7+
pull_request:
8+
9+
jobs:
10+
golangci:
11+
name: lint
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/setup-go@v2
15+
with:
16+
go-version: 1.18
17+
stable: false
18+
- uses: actions/checkout@v2
19+
- name: golangci-lint
20+
uses: golangci/golangci-lint-action@v2
21+
with:
22+
# Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
23+
version: latest
24+
25+
# Optional: working directory, useful for monorepos
26+
working-directory: ./
27+
28+
# Optional: golangci-lint command line arguments.
29+
args: --timeout 60s --max-same-issues 50
30+
31+
# Optional: show only new issues if it's a pull request. The default value is `false`.
32+
# only-new-issues: true
33+
34+
# Optional: if set to true then the action will use pre-installed Go.
35+
# skip-go-installation: true
36+
37+
# Optional: if set to true then the action doesn't cache or restore ~/go/pkg.
38+
# skip-pkg-cache: true
39+
40+
# Optional: if set to true then the action doesn't cache or restore ~/.cache/go-build.
41+
# skip-build-cache: true
42+
43+
# optionally use a specific version of Go rather than the latest one
44+
go_version: '1.18'

.github/workflows/test.yml

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: Tests
2+
3+
on:
4+
push:
5+
tags:
6+
branches:
7+
pull_request:
8+
9+
jobs:
10+
11+
test:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v2
15+
16+
- name: Set up Go
17+
uses: actions/setup-go@v2
18+
with:
19+
go-version: 1.18
20+
stable: false
21+
22+
- name: Build
23+
run: make build
24+
25+
- name: Test
26+
run: make test
27+
28+
- name: Coverage
29+
run: make coverage
30+
31+
- name: Codecov
32+
uses: codecov/codecov-action@v2
33+
with:
34+
token: ${{ secrets.CODECOV_TOKEN }}
35+
file: ./cover.out
36+
flags: unittests
37+
verbose: true

.gitignore

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
2+
# Created by https://www.toptal.com/developers/gitignore/api/go
3+
# Edit at https://www.toptal.com/developers/gitignore?templates=go
4+
5+
### Go ###
6+
# If you prefer the allow list template instead of the deny list, see community template:
7+
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
8+
#
9+
# Binaries for programs and plugins
10+
*.exe
11+
*.exe~
12+
*.dll
13+
*.so
14+
*.dylib
15+
16+
# Test binary, built with `go test -c`
17+
*.test
18+
19+
# Output of the go coverage tool, specifically when used with LiteIDE
20+
*.out
21+
22+
# Dependency directories (remove the comment below to include it)
23+
# vendor/
24+
25+
# Go workspace file
26+
go.work
27+
28+
### Go Patch ###
29+
/vendor/
30+
/Godeps/
31+
32+
# End of https://www.toptal.com/developers/gitignore/api/go
33+
34+
cover.out
35+
cover.html
36+
.vscode
37+
38+
.idea/

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2022 Samuel Berthe
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Makefile

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
2+
build:
3+
go build -v ./...
4+
5+
test:
6+
go test -v ./...
7+
watch-test:
8+
reflex -t 50ms -s -- sh -c 'gotest -v ./...'
9+
10+
bench:
11+
go test -benchmem -count 3 -bench ./...
12+
watch-bench:
13+
reflex -t 50ms -s -- sh -c 'go test -benchmem -count 3 -bench ./...'
14+
15+
coverage:
16+
go test -v -coverprofile=cover.out -covermode=atomic .
17+
go tool cover -html=cover.out -o cover.html
18+
19+
tools:
20+
go install github.com/cespare/reflex@latest
21+
go install github.com/rakyll/gotest@latest
22+
go install github.com/psampaz/go-mod-outdated@latest
23+
go install github.com/jondot/goweight@latest
24+
go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
25+
go get -t -u golang.org/x/tools/cmd/cover
26+
go get -t -u github.com/sonatype-nexus-community/nancy@latest
27+
go mod tidy
28+
29+
lint:
30+
golangci-lint run --timeout 60s --max-same-issues 50 ./...
31+
lint-fix:
32+
golangci-lint run --timeout 60s --max-same-issues 50 --fix ./...
33+
34+
audit:
35+
go mod tidy
36+
go list -json -m all | nancy sleuth
37+
38+
outdated:
39+
go mod tidy
40+
go list -u -m -json all | go-mod-outdated -update -direct
41+
42+
weight:
43+
goweight

README.md

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# go-gpt-3-encoder
2+
3+
Go BPE tokenizer (Encoder+Decoder) for GPT2 and GPT3.
4+
5+
## About
6+
7+
GPT2 and GPT3 use byte pair encoding to turn text into a series of integers to feed into the model. This is a Go implementation of OpenAI's original Python encoder/decoder which can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py).
8+
9+
This code was inspired by the [JavaScript implementation](https://github.com/latitudegames/GPT-3-Encoder) and partially generated by OpenAI itself!
10+
11+
## Install
12+
13+
```bash
14+
go get github.com/samber/go-gpt-3-encoder
15+
```
16+
17+
## Usage
18+
19+
Compatible with Go >= 1.18
20+
21+
```go
22+
import "github.com/samber/go-gpt-3-encoder"
23+
24+
encoder, err := NewEncoder()
25+
if err != nil {
26+
log.Fatal(err)
27+
}
28+
29+
str := "This is an example sentence to try encoding out on!"
30+
31+
encoded, err := encoder.Encode(str)
32+
if err != nil {
33+
log.Fatal(err)
34+
}
35+
36+
fmt.Println("We can look at each token and what it represents:")
37+
for _, token := range encoded {
38+
fmt.Printf("%s -- %s\n", token, encoder.Decode([]string{token}))
39+
}
40+
41+
decoded := encoder.Decode(encoded)
42+
fmt.Printf("We can decode it back into: %s\n", decoded)
43+
```
44+
45+
## Contribute
46+
47+
Some corner cases are not covered by this library. See `@TODO` in tests.

0 commit comments

Comments
 (0)