import requests
import json
import time
import sys
import base64
import os
from typing import Dict, Any


class Crawl4AiTester:
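    """Minimal client for a Crawl4AI Docker deployment: submits crawl jobs and polls until they finish."""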
    def __init__(self, base_url: str = "http://localhost:11235"):
        self.base_url = base_url

    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
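        """Submit a crawl request to /crawl, then poll /task/{task_id} until it completes, fails, or times out."""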
        # Submit crawl job
        response = requests.post(f"{self.base_url}/crawl", json=request_data)
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

            result = requests.get(f"{self.base_url}/task/{task_id}")
            status = result.json()

            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")

            if status["status"] == "completed":
                return status

            time.sleep(2)


def test_docker_deployment(version="basic"):
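    """Wait for the service's /health endpoint to respond, then run the enabled test cases."""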
    tester = Crawl4AiTester()
    print(f"Testing Crawl4AI Docker {version} version")

    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
        try:
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i + 1}/{max_retries})...")
            time.sleep(5)

    # Test cases based on version
    test_basic_crawl(tester)

    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)

    # test_js_execution(tester)
    # test_css_selector(tester)
    # test_structured_extraction(tester)
    # test_llm_extraction(tester)
    # test_llm_with_ollama(tester)
    # test_screenshot(tester)


def test_basic_crawl(tester: Crawl4AiTester):
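    """Plain crawl of a single URL; asserts the job succeeds and returns non-empty markdown."""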
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10
    }

    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0


def test_js_execution(tester: Crawl4AiTester):
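    """Crawl with a JS snippet that clicks the 'Load More' button, waiting for the tenth article card to appear."""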
    print("\n=== Testing JS Execution ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "js_code": [
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]


def test_css_selector(tester: Crawl4AiTester):
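    """Crawl restricted to elements matching a CSS selector, with a word-count threshold passed via 'extra'."""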
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
        "crawler_params": {
            "headless": True
        },
        "extra": {"word_count_threshold": 10}
    }

    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]


def test_structured_extraction(tester: Crawl4AiTester):
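    """CSS-schema (json_css) extraction of crypto name, symbol, and price rows from the Coinbase explore page."""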
    print("\n=== Testing Structured Extraction ===")
    schema = {
        "name": "Coinbase Crypto Prices",
        "baseSelector": ".cds-tableRow-t45thuk",
        "fields": [
            {
                "name": "crypto",
                "selector": "td:nth-child(1) h2",
                "type": "text",
            },
            {
                "name": "symbol",
                "selector": "td:nth-child(1) p",
                "type": "text",
            },
            {
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
            }
        ],
    }

    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": schema
            }
        }
    }

    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
    print("Sample item:", json.dumps(extracted[0], indent=2))
    assert result["result"]["success"]
    assert len(extracted) > 0


def test_llm_extraction(tester: Crawl4AiTester):
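    """LLM-based schema extraction of model pricing via OpenAI; failures (e.g. a missing OPENAI_API_KEY) are reported, not raised."""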
    print("\n=== Testing LLM Extraction ===")
    schema = {
        "type": "object",
        "properties": {
            "model_name": {
                "type": "string",
                "description": "Name of the OpenAI model."
            },
            "input_fee": {
                "type": "string",
                "description": "Fee for input token for the OpenAI model."
            },
            "output_fee": {
                "type": "string",
                "description": "Fee for output token for the OpenAI model."
            }
        },
        "required": ["model_name", "input_fee", "output_fee"]
    }

    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "openai/gpt-4o-mini",
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
            }
        },
        "crawler_params": {"word_count_threshold": 1}
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} model pricing entries")
        print("Sample entry:", json.dumps(extracted[0], indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")


def test_llm_with_ollama(tester: Crawl4AiTester):
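    """LLM-based schema extraction using the ollama/llama2 provider; failures are caught and printed."""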
    print("\n=== Testing LLM with Ollama ===")
    schema = {
        "type": "object",
        "properties": {
            "article_title": {
                "type": "string",
                "description": "The main title of the news article"
            },
            "summary": {
                "type": "string",
                "description": "A brief summary of the article content"
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Main topics or themes discussed in the article"
            }
        }
    }

    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
                "instruction": "Extract the main article information including title, summary, and main topics."
            }
        },
        "extra": {"word_count_threshold": 1},
        "crawler_params": {"verbose": True}
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print("Extracted content:", json.dumps(extracted, indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")


def test_cosine_extraction(tester: Crawl4AiTester):
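    """Cosine-similarity extraction filtered to business/finance content, returning the top text clusters."""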
    print("\n=== Testing Cosine Extraction ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "cosine",
            "params": {
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
                "top_k": 3
            }
        }
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} text clusters")
        print("First cluster tags:", extracted[0]["tags"])
        assert result["result"]["success"]
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")


def test_screenshot(tester: Crawl4AiTester):
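    """Request a screenshot along with the crawl and save the base64-encoded image to test_screenshot.jpg."""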
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))

    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")

    assert result["result"]["success"]


if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
    test_docker_deployment(version)