Commit f9a297e

Add Docker example script for testing Crawl4AI functionality
1 parent bcdd809 commit f9a297e
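
The script exercises a running Crawl4AI Docker deployment through its REST API: it submits crawl jobs with POST /crawl, then polls GET /task/{task_id} until the job completes or times out. A minimal sketch of that round trip, assuming a server on the default port 11235 and using a placeholder URL:

    import requests
    import time

    BASE = "http://localhost:11235"

    # Submit a crawl job; the server replies with a task id to poll.
    task_id = requests.post(
        f"{BASE}/crawl",
        json={"urls": "https://example.com", "priority": 10},
    ).json()["task_id"]

    # Poll the task endpoint until the job finishes one way or the other.
    while True:
        status = requests.get(f"{BASE}/task/{task_id}").json()
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(2)
    print(status["status"])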

File tree

1 file changed: +300 -0 lines changed


Diff for: docs/examples/docker_example.py

@@ -0,0 +1,300 @@
import requests
import json
import time
import sys
import base64
import os
from typing import Dict, Any

class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235"):
        self.base_url = base_url

    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
        # Submit crawl job
        response = requests.post(f"{self.base_url}/crawl", json=request_data)
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

            result = requests.get(f"{self.base_url}/task/{task_id}")
            status = result.json()

            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")

            if status["status"] == "completed":
                return status

            time.sleep(2)

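# submit_and_wait wraps the server's two-endpoint task protocol: POST /crawl
# returns {"task_id": ...}, and GET /task/{task_id} exposes a "status" field
# that is polled every 2 seconds until it reads "completed" (raising on
# "failed" or once the timeout elapses).
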
def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester()
    print(f"Testing Crawl4AI Docker {version} version")

    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
        try:
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
        except requests.exceptions.RequestException as e:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)

    # Test cases based on version
    test_basic_crawl(tester)

    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)

    # test_js_execution(tester)
    # test_css_selector(tester)
    # test_structured_extraction(tester)
    # test_llm_extraction(tester)
    # test_llm_with_ollama(tester)
    # test_screenshot(tester)

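# Only the basic crawl runs by default; the calls above are left commented so
# individual capabilities can be switched on as needed (per the commented-out
# check, cosine extraction targets the "full"/"transformer" image variants).
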
def test_basic_crawl(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10
    }

    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0

def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "js_code": [
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

def test_css_selector(tester: Crawl4AiTester):
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
        "crawler_params": {
            "headless": True
        },
        "extra": {"word_count_threshold": 10}
    }

    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

def test_structured_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Structured Extraction ===")
    schema = {
        "name": "Coinbase Crypto Prices",
        "baseSelector": ".cds-tableRow-t45thuk",
        "fields": [
            {
                "name": "crypto",
                "selector": "td:nth-child(1) h2",
                "type": "text",
            },
            {
                "name": "symbol",
                "selector": "td:nth-child(1) p",
                "type": "text",
            },
            {
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
            }
        ],
    }

    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": schema
            }
        }
    }

    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
    print("Sample item:", json.dumps(extracted[0], indent=2))
    assert result["result"]["success"]
    assert len(extracted) > 0

def test_llm_extraction(tester: Crawl4AiTester):
    print("\n=== Testing LLM Extraction ===")
    schema = {
        "type": "object",
        "properties": {
            "model_name": {
                "type": "string",
                "description": "Name of the OpenAI model."
            },
            "input_fee": {
                "type": "string",
                "description": "Fee for input token for the OpenAI model."
            },
            "output_fee": {
                "type": "string",
                "description": "Fee for output token for the OpenAI model."
            }
        },
        "required": ["model_name", "input_fee", "output_fee"]
    }

    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "openai/gpt-4o-mini",
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
            }
        },
        "crawler_params": {"word_count_threshold": 1}
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} model pricing entries")
        print("Sample entry:", json.dumps(extracted[0], indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")

def test_llm_with_ollama(tester: Crawl4AiTester):
    print("\n=== Testing LLM with Ollama ===")
    schema = {
        "type": "object",
        "properties": {
            "article_title": {
                "type": "string",
                "description": "The main title of the news article"
            },
            "summary": {
                "type": "string",
                "description": "A brief summary of the article content"
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Main topics or themes discussed in the article"
            }
        }
    }

    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
                "instruction": "Extract the main article information including title, summary, and main topics."
            }
        },
        "extra": {"word_count_threshold": 1},
        "crawler_params": {"verbose": True}
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print("Extracted content:", json.dumps(extracted, indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")

def test_cosine_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Cosine Extraction ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "cosine",
            "params": {
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
                "top_k": 3
            }
        }
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} text clusters")
        print("First cluster tags:", extracted[0]["tags"])
        assert result["result"]["success"]
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")

def test_screenshot(tester: Crawl4AiTester):
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))

    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")

    assert result["result"]["success"]

if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
    test_docker_deployment(version)
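
To try the script end to end, bring up the server first (assuming the image name from the project's Docker instructions of this period: docker pull unclecode/crawl4ai:basic, then docker run -p 11235:11235 unclecode/crawl4ai:basic), and run python docs/examples/docker_example.py. The optional version argument defaults to "basic" and, as committed, only gates the commented-out cosine-extraction test.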
