Skip to content

Commit 585f3cb

Browse files
committed
update pipeline to process by product
1 parent 7f5439c commit 585f3cb

14 files changed

+344
-1709
lines changed

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
scripts/llmstxt-files/
2+
scripts/run-assets/
3+
14
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
25

36
# dependencies
@@ -122,7 +125,7 @@ ipython_config.py
122125
# pyenv
123126
# For a library or package, you might want to ignore these files since the code is
124127
# intended to run in multiple environments; otherwise, check them in:
125-
.python-version
128+
# .python-version
126129

127130
# pipenv
128131
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.

.python-version

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

makefile

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
# source .venv/bin/activate
3+
4+
.PHONY: run
5+
run:
6+
python scripts/run.py
7+
8+
.PHONY: clear
9+
clear:
10+
python scripts/clear.py

pyproject.toml

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[project]
2+
name = "llmstxt-site"
3+
version = "0.1.0"
4+
description = "Add your description here"
5+
readme = "README.md"
6+
requires-python = ">=3.12"
7+
dependencies = [
8+
"requests>=2.32.3",
9+
"tiktoken>=0.8.0",
10+
"tqdm>=4.67.1",
11+
]

scripts/01_download_txts.py

-59
This file was deleted.

scripts/02_count_tokens.py

-61
This file was deleted.

scripts/__init__.py

Whitespace-only changes.

scripts/clear.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import os
2+
import shutil
3+
4+
# Anchor everything to the folder that holds this script, so the cleanup does
# not depend on the caller's current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Generated output folders (siblings of this script) that a cleanup removes.
folders_to_delete = ["llmstxt-files", "run-assets"]

# Remove each generated folder that is present; absent ones are skipped silently.
for name in folders_to_delete:
    target = os.path.join(script_dir, name)
    if not os.path.exists(target):
        continue
    shutil.rmtree(target)
    print(f"Deleted folder: {name}")

scripts/create_product_json.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import requests
2+
import os
3+
4+
import tiktoken
5+
6+
# Module-level tokenizer, built once from tiktoken's "o200k_base" encoding.
enc = tiktoken.get_encoding("o200k_base")
7+
# To get the tokeniser corresponding to a specific model in the OpenAI API
8+
# enc = tiktoken.encoding_for_model("gpt-4o")
9+
# Every special token of the encoding except "<|endoftext|>".
# NOTE(review): presumably meant to be passed as `disallowed_special` to
# enc.encode() so that text containing a literal "<|endoftext|>" is not
# rejected — confirm against the tiktoken documentation.
disallowed_special = enc.special_tokens_set - {"<|endoftext|>"}
10+
11+
12+
def create_product_json(product: dict, product_dir: str, *, timeout: float = 30.0):
    """Download a product's llms.txt files, count their tokens, and summarise them.

    For each of the ``"llms-txt"`` and ``"llms-full-txt"`` entries of *product*
    that holds a non-empty URL, the file is downloaded, tokenized with the
    module-level ``enc`` encoder, and saved as ``<key>.txt`` inside
    *product_dir*. When at least one file was saved, a ``combined.txt``
    containing every saved file (each followed by a blank line) is written too.

    The function is best-effort: a failure to download, tokenize, or save one
    file is printed and that file is skipped; the other file is still processed.

    Args:
        product: Mapping with the required keys ``"product"`` and ``"website"``
            and the optional URL keys ``"llms-txt"`` and ``"llms-full-txt"``.
            A missing or empty URL key means "no such file".
        product_dir: Directory that receives the downloaded files. It is
            created on demand the first time a file is saved.
        timeout: Seconds to wait for the server before giving up on a download.

    Returns:
        A dict with ``"product"``, ``"website"``, and for each of the two file
        keys the source URL plus ``"<key>-tokens"`` (token count). Files that
        were absent or failed are reported as ``""`` with a token count of
        ``None``.

    Raises:
        KeyError: If *product* lacks ``"product"`` or ``"website"``.
    """
    file_keys = ("llms-txt", "llms-full-txt")

    product_name = product["product"]
    product_json = {
        "product": product_name,
        "website": product["website"],
    }

    # Only keys that carry a non-empty URL are fetched.
    files = {key: product[key] for key in file_keys if product.get(key)}

    file_contents = {}
    for filename, url in files.items():
        # Without a timeout a single unresponsive server would stall the run.
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error downloading {filename} for {product_name}: {e}")
            continue

        # For text/* responses that declare no charset, requests falls back to
        # ISO-8859-1, which garbles UTF-8 documents (and their token counts).
        # Let it detect the encoding from the body instead.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = response.apparent_encoding
        content = response.text

        # Tokenize before touching the disk so a failure here cannot leave an
        # empty or partial file behind.
        try:
            tokens = enc.encode(content, disallowed_special=disallowed_special)
        except Exception as e:
            print(f"Error encoding {filename}: {e}")
            continue

        # Save file to product directory
        filepath = os.path.join(product_dir, f"{filename}.txt")
        try:
            os.makedirs(product_dir, exist_ok=True)
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
        except OSError as e:
            print(f"Error saving {filename} for {product_name}: {e}")
            continue

        # Record the file only once it has been downloaded, counted and saved.
        product_json[filename] = url
        product_json[f"{filename}-tokens"] = len(tokens)
        file_contents[filename] = content
        print(f"Downloaded {filename} for {product_name}")

    # Files that were not requested, or that failed, get explicit placeholders
    # so every result has the same set of keys.
    for key in reversed(file_keys):
        if key not in product_json:
            product_json[key] = ""
            product_json[f"{key}-tokens"] = None

    # Create combined file with all content
    if file_contents:
        combined_filepath = os.path.join(product_dir, "combined.txt")
        with open(combined_filepath, "w", encoding="utf-8") as f:
            for content in file_contents.values():
                f.write(content)
                f.write("\n\n")
        print(f"Created combined file for {product_name}")

    return product_json

scripts/03_create_redirects.py renamed to scripts/create_product_redirects.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
1-
import json
2-
3-
data = json.load(open("data_raw.json"))
4-
5-
redirects = []
6-
for product in data:
1+
def create_product_redirects(product: dict):
2+
redirects = []
73
source_path = product["product"].lower().replace(" ", "-").replace(".", "-")
84
destination_path = "#"
95
if product["llms-full-txt"]:
@@ -12,7 +8,7 @@
128
destination_path = product["llms-txt"]
139
else:
1410
print(f"No destination path for {product['product']}")
15-
continue
11+
return redirects
1612

1713
redirects.append(
1814
{
@@ -40,6 +36,4 @@
4036
}
4137
)
4238

43-
44-
with open("redirects.json", "w") as f:
45-
json.dump({"redirects": redirects}, f, indent=4)
39+
return redirects

0 commit comments

Comments
 (0)