
Commit 5ddf7ea

hooks : setting up flake8 and pre-commit hooks (ggml-org#1681)
Small, non-functional changes were made to non-compliant files. These include breaking up long lines, whitespace sanitization, and unused-import removal. The maximum line length in Python files was set to a generous 125 characters to minimize the number of changes needed in scripts and general annoyance. The "txt" prompts directory is excluded from the checks, as it may contain oddly formatted files and strings for a good reason.

Signed-off-by: Jiri Podivin <[email protected]>
1 parent: bac1992 · commit: 5ddf7ea

File tree (5 files changed, +42, -12 lines):

- .flake8
- .pre-commit-config.yaml
- convert.py
- examples/jeopardy/graph.py
- scripts/verify-checksum-models.py

.flake8 (+2)

@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125

.pre-commit-config.yaml (+15)

@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
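
The exclude: pattern above is worth a note: pre-commit treats it as a Python regular expression matched against candidate file paths, which is why the oddly formatted prompt files are skipped by every hook. A quick sketch of what the pattern itself matches, using the re module directly; the example paths are illustrative only, and search semantics are used here just for demonstration:

import re

# The unescaped dot before "txt" makes the pattern slightly looser than a
# literal ".txt" suffix, which is harmless for this use.
pattern = re.compile(r"prompts/.*.txt")

print(bool(pattern.search("prompts/dan.txt")))            # True  -> skipped by the hooks
print(bool(pattern.search("examples/jeopardy/graph.py")))  # False -> still checked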

convert.py (+18, -8)

@@ -512,7 +512,11 @@ def validate_conversion_to(self, data_type: DataType) -> None:
             if not isinstance(self.data_type, QuantizedDataType):
                 raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
             if self.data_type.have_g_idx:
-                sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+                sys.stderr.write(
+                    "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                    "which is not yet natively supported by GGML. "
+                    "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                    "but that will result in a much larger output file for no quality benefit.\n")
                 sys.exit(1)
             assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends

@@ -694,8 +698,9 @@ def load(offset: int, elm_count: int) -> NDArray:
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
         return LazyStorage(load=load, kind=pid[1], description=description)

-    # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)

@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
     # Use mmap for the actual data to avoid race conditions with the file offset.
     off = fp.raw.tell()
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off) # needed on Windows
+    fp.raw.seek(off)  # needed on Windows

     def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
         shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))

@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Try the PyTorch patterns too, with lower priority
-           globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
+           globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try GGML too, but with lower priority, since if both a non-GGML

@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
    elif path3.exists():
        path = path3
    else:
-       raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+       raise FileNotFoundError(
+           f"Could not find tokenizer.model in {path} or its parent; "
+           "if it's in another directory, pass the directory as --vocab-dir")
    added_tokens_path = path.parent / "added_tokens.json"
    print(f"Loading vocab file {path}")
    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)

@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
    }[params.file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
    if ret in model_paths:
-       sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
+       sys.stderr.write(
+           f"Error: Default output path ({ret}) would overwrite the input. "
+           "Please explicitly specify a path using --outfile.\n")
        sys.exit(1)
    return ret

@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-   parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+   parser.add_argument("model", type=Path,
+                       help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    args = parser.parse_args(args_in)

    vocab: Vocab
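
Most of the convert.py changes above wrap long messages by splitting them into adjacent string literals inside the call's parentheses. Python joins adjacent literals at compile time, so the wrapped calls produce exactly the same output as the original one-liners. A small self-contained check (the message text is taken from the hunk above, with the f-string interpolation dropped):

wrapped = (
    "Error: Default output path would overwrite the input. "
    "Please explicitly specify a path using --outfile.\n")
single = "Error: Default output path would overwrite the input. Please explicitly specify a path using --outfile.\n"
assert wrapped == single  # adjacent literals are concatenated into one string at compile time
print(repr(wrapped))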

examples/jeopardy/graph.py (+4, -3)

@@ -1,5 +1,5 @@
 import matplotlib.pyplot as plt
-import sys, os
+import os
 import csv

 labels = []

@@ -8,6 +8,7 @@

 rows = []

+
 def bar_chart(numbers, labels, pos):
     plt.bar(pos, numbers, color='blue')
     plt.xticks(ticks=pos, labels=labels)

@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
     plt.ylabel("Questions Correct")
     plt.show()

+
 def calculatecorrect():
     directory = os.fsencode("./examples/jeopardy/results/")
     csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')

@@ -38,14 +40,13 @@ def calculatecorrect():
                     print(line)
                 else:
                     print("Correct answer: " + rows[i][2] + "\n")
-                    i+=1
+                    i += 1
                     print("Did the AI get the question right? (y/n)")
                     if input() == "y":
                         totalcorrect += 1
             numbers.append(totalcorrect)


-
 if __name__ == '__main__':
     calculatecorrect()
     pos = list(range(numEntries))

scripts/verify-checksum-models.py (+3, -1)

@@ -1,9 +1,10 @@
 import os
 import hashlib

+
 def sha256sum(file):
     block_size = 16 * 1024 * 1024 # 16 MB block size
-    b = bytearray(block_size)
+    b = bytearray(block_size)
     file_hash = hashlib.sha256()
     mv = memoryview(b)
     with open(file, 'rb', buffering=0) as f:

@@ -15,6 +16,7 @@ def sha256sum(file):

     return file_hash.hexdigest()

+
 # Define the path to the llama directory (parent folder of script directory)
 llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
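
The hunks above show only the setup lines of sha256sum (the 16 MB bytearray, the memoryview over it, and the unbuffered open), not the read loop itself. Below is a minimal sketch of the block-wise hashing pattern those lines imply; the readinto loop is an assumption added for illustration, not code copied from the script:

import hashlib

def sha256sum_sketch(path, block_size=16 * 1024 * 1024):
    b = bytearray(block_size)   # reusable 16 MB read buffer
    mv = memoryview(b)          # lets readinto() fill the buffer without extra copies
    file_hash = hashlib.sha256()
    with open(path, 'rb', buffering=0) as f:
        # assumed loop: read into the buffer until EOF, hashing only the bytes read
        while n := f.readinto(mv):
            file_hash.update(mv[:n])
    return file_hash.hexdigest()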
