import asyncio
import time
import re
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BestFirstCrawlingStrategy,
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.configs import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.types import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint
# 1️⃣ Deep Crawling with Best-First Strategy
async def deep_crawl():
"""
PART 1: Deep Crawling with Best-First Strategy
This function demonstrates:
- Using the BestFirstCrawlingStrategy
- Creating filter chains to narrow down crawl targets
- Using a scorer to prioritize certain URLs
- Respecting robots.txt rules
"""
print("\n===== DEEP CRAWLING =====")
print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")
# Create a filter chain to filter urls based on patterns, domains and content type
filter_chain = FilterChain(
[
DomainFilter(
allowed_domains=["docs.crawl4ai.com"],
blocked_domains=["old.docs.crawl4ai.com"],
),
URLPatternFilter(patterns=["*core*", "*advanced*"],),
ContentTypeFilter(allowed_types=["text/html"]),
]
)
# Create a keyword scorer that prioritises the pages with certain keywords first
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"], weight=0.7
)
# Set up the configuration with robots.txt compliance enabled
deep_crawl_config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
filter_chain=filter_chain,
url_scorer=keyword_scorer,
),
scraping_strategy=LXMLWebScrapingStrategy(),
stream=True,
verbose=True,
check_robots_txt=True, # Enable robots.txt compliance
)
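    # Illustrative note (not part of the original tutorial): the deep-crawl strategy is
    # pluggable. If you prefer exhaustive breadth-first traversal over scored crawling,
    # crawl4ai also ships a BFS strategy; assuming it accepts the same arguments, the
    # swap would look roughly like this:
    #
    #   from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
    #   bfs_config = CrawlerRunConfig(
    #       deep_crawl_strategy=BFSDeepCrawlStrategy(
    #           max_depth=2,
    #           include_external=False,
    #           filter_chain=filter_chain,
    #       ),
    #       stream=True,
    #   )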
    # Execute the crawl
    async with AsyncWebCrawler() as crawler:
        print("\n📊 Starting deep crawl with Best-First strategy...")
        print(" - Filtering by domain, URL patterns, and content type")
        print(" - Scoring pages based on keyword relevance")
        print(" - Respecting robots.txt rules")

        start_time = time.perf_counter()
        results = []

        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
            # Print each result as it comes in
            depth = result.metadata.get("depth", 0)
            score = result.metadata.get("score", 0)
            print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
            results.append(result)

        duration = time.perf_counter() - start_time

        # Print summary statistics
        print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")

        # Group results by depth
        if results:
            depth_counts = {}
            for result in results:
                depth = result.metadata.get("depth", 0)
                depth_counts[depth] = depth_counts.get(depth, 0) + 1

            print("\n📊 Pages crawled by depth:")
            for depth, count in sorted(depth_counts.items()):
                print(f" Depth {depth}: {count} pages")

# 2️⃣ Memory-Adaptive Dispatcher
async def memory_adaptive_dispatcher():
"""
PART 2: Memory-Adaptive Dispatcher
This function demonstrates:
- Using MemoryAdaptiveDispatcher to manage system memory
- Batch and streaming modes with multiple URLs
"""
print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
print("This example shows how to use the memory-adaptive dispatcher for resource management.")
# Configure the dispatcher (optional, defaults are used if not provided)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
check_interval=0.5, # Check memory every 0.5 seconds
)
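    # Illustrative sketch (parameter names below are assumptions, not taken from this
    # tutorial): the dispatcher can reportedly also be combined with a rate limiter and
    # a cap on concurrent sessions, along these lines:
    #
    #   from crawl4ai import RateLimiter
    #   dispatcher = MemoryAdaptiveDispatcher(
    #       memory_threshold_percent=80.0,
    #       check_interval=0.5,
    #       max_session_permit=10,                             # assumed name for the concurrency cap
    #       rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),   # assumed RateLimiter signature
    #   )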
    # Test URLs
    urls = [
        "https://docs.crawl4ai.com",
        "https://github.com/unclecode/crawl4ai"
    ]

    async def batch_mode():
        print("\n📊 BATCH MODE:")
        print(" In this mode, all results are collected before being returned.")

        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            results = await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=False),  # Batch mode
                dispatcher=dispatcher,
            )
            print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
            for result in results:
                print(f" → {result.url} with status code: {result.status_code}")

    async def stream_mode():
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")

        async with AsyncWebCrawler() as crawler:
            start_time = time.perf_counter()
            count = 0
            first_result_time = None

            async for result in await crawler.arun_many(
                urls=urls,
                config=CrawlerRunConfig(stream=True),  # Stream mode
                dispatcher=dispatcher,
            ):
                count += 1
                current_time = time.perf_counter() - start_time
                if count == 1:
                    first_result_time = current_time
                    print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
                else:
                    print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")

            print(f" ✅ Total: {count} results")
            print(f" ✅ First result: {first_result_time:.2f} seconds")
            print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")

    # Run both examples
    await batch_mode()
    await stream_mode()

    print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
    print(" and manages concurrency based on system resources.")

# 3️⃣ HTTP Crawler Strategy
async def http_crawler_strategy():
"""
PART 3: HTTP Crawler Strategy
This function demonstrates:
- Using the lightweight HTTP-only crawler
- Setting custom headers and configurations
"""
print("\n===== HTTP CRAWLER STRATEGY =====")
print("This example shows how to use the fast, lightweight HTTP-only crawler.")
# Use the HTTP crawler strategy
http_config = HTTPCrawlerConfig(
method="GET",
headers={"User-Agent": "MyCustomBot/1.0"},
follow_redirects=True,
verify_ssl=True
)
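    # Illustrative sketch (not part of the original tutorial): the same config object can
    # describe other request shapes. Assuming HTTPCrawlerConfig exposes body fields such
    # as `json`, a POST variant might look roughly like:
    #
    #   post_config = HTTPCrawlerConfig(
    #       method="POST",
    #       headers={"User-Agent": "MyCustomBot/1.0", "Content-Type": "application/json"},
    #       json={"query": "crawl4ai"},   # assumed field name
    #   )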
print("\n📊 Initializing HTTP crawler strategy...")
print(" - Using custom User-Agent: MyCustomBot/1.0")
print(" - Following redirects: Enabled")
print(" - Verifying SSL: Enabled")
# Create crawler with HTTP strategy
async with AsyncWebCrawler(
crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
) as crawler:
start_time = time.perf_counter()
result = await crawler.arun("https://example.com")
duration = time.perf_counter() - start_time
print(f"\n✅ Crawled in {duration:.2f} seconds")
print(f"✅ Status code: {result.status_code}")
print(f"✅ Content length: {len(result.html)} bytes")
# Check if there was a redirect
if result.redirected_url and result.redirected_url != result.url:
print(f"ℹ️ Redirected from {result.url} to {result.redirected_url}")
print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
print(" than browser-based crawling for simple pages.")
# 4️⃣ Proxy Rotation
async def proxy_rotation():
"""
PART 4: Proxy Rotation
This function demonstrates:
- Setting up a proxy rotation strategy
- Using multiple proxies in a round-robin fashion
"""
print("\n===== PROXY ROTATION =====")
print("This example shows how to implement proxy rotation for distributed crawling.")
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
if not proxies:
print("No proxies found in environment. Set PROXIES env variable!")
return
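    # Illustrative alternative (not from the original tutorial): proxies could also be
    # built in code instead of from the environment. The exact ProxyConfig signature is
    # an assumption here, inferred from the attributes used below (.server, .ip):
    #
    #   proxies = [
    #       ProxyConfig(server="http://203.0.113.10:8080", username="user1", password="pass1"),
    #       ProxyConfig(server="http://203.0.113.11:8080", username="user2", password="pass2"),
    #   ]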
    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy
    )

    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

    print("\n📈 Initializing crawler with proxy rotation...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(
            urls=urls,
            config=run_config
        )
        for result in results:
            if result.success:
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                    print("---")
            else:
                print(f"❌ Crawl via proxy failed: {result.error_message}")

# 5️⃣ LLM Content Filter (requires API key)
async def llm_content_filter():
"""
PART 5: LLM Content Filter
This function demonstrates:
- Configuring LLM providers via LLMConfig
- Using LLM to generate focused markdown
- LLMConfig for configuration
Note: Requires a valid API key for the chosen LLM provider
"""
print("\n===== LLM CONTENT FILTER =====")
print("This example shows how to use LLM to generate focused markdown content.")
print("Note: This example requires an API key. Set it in environment variables.")
# Create LLM configuration
# Replace with your actual API key or set as environment variable
llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
)
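    # Illustrative note (not from the original tutorial): LLMConfig is provider-agnostic,
    # so switching models should only change the provider string and token variable, e.g.:
    #
    #   llm_config = LLMConfig(
    #       provider="openai/gpt-4o-mini",     # hypothetical alternative provider
    #       api_token="env:OPENAI_API_KEY",
    #   )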
print("\n📊 Setting up LLM content filter...")
print(f" - Provider: {llm_config.provider}")
print(" - API token: Using environment variable")
print(" - Instruction: Extract key concepts and summaries")
# Create markdown generator with LLM filter
markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter(
llm_config=llm_config,
instruction="Extract key concepts and summaries"
)
)
config = CrawlerRunConfig(markdown_generator=markdown_generator)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://docs.crawl4ai.com", config=config)
pprint(result.markdown.fit_markdown)
print("\n✅ Generated focused markdown:")
# 6️⃣ PDF Processing
async def pdf_processing():
"""
PART 6: PDF Processing
This function demonstrates:
- Using PDFCrawlerStrategy and PDFContentScrapingStrategy
- Extracting text and metadata from PDFs
"""
print("\n===== PDF PROCESSING =====")
print("This example shows how to extract text and metadata from PDF files.")
# Sample PDF URL
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
print("\n📊 Initializing PDF crawler...")
print(f" - Target PDF: {pdf_url}")
print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")
# Create crawler with PDF strategy
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
print("\n🚀 Starting PDF processing...")
start_time = time.perf_counter()
result = await crawler.arun(
pdf_url,
config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
)
duration = time.perf_counter() - start_time
print(f"\n✅ Processed PDF in {duration:.2f} seconds")
# Show metadata
print("\n📄 PDF Metadata:")
if result.metadata:
for key, value in result.metadata.items():
if key not in ["html", "text", "markdown"] and value:
print(f" - {key}: {value}")
else:
print(" No metadata available")
# Show sample of content
if result.markdown:
print("\n📝 PDF Content Sample:")
content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
print(f"---\n{content_sample}\n---")
else:
print("\n⚠️ No content extracted")
print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
print(" to extract both text content and metadata.")
# 7️⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
"""
PART 7: LLM Schema Generation
This function demonstrates:
- Configuring LLM providers via LLMConfig
- Using LLM to generate extraction schemas
- JsonCssExtractionStrategy
Note: Requires a valid API key for the chosen LLM provider
"""
print("\n===== LLM SCHEMA GENERATION =====")
print("This example shows how to use LLM to automatically generate extraction schemas.")
print("Note: This example requires an API key. Set it in environment variables.")
# Sample HTML
sample_html = """
<div class="product">
<h2 class="title">Awesome Gaming Laptop</h2>
<div class="price">$1,299.99</div>
<div class="specs">
<ul>
<li>16GB RAM</li>
<li>512GB SSD</li>
<li>RTX 3080</li>
</ul>
</div>
<div class="rating">4.7/5</div>
</div>
"""
print("\n📊 Setting up LLMConfig...")
# Create LLM configuration
llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY"
)
print("\n🚀 Generating schema for product extraction...")
print(" This would use the LLM to analyze HTML and create an extraction schema")
schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html,
llm_config = llm_config,
query="Extract product name and price"
)
print("\n✅ Generated Schema:")
pprint(schema)
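    # Illustrative follow-up (not part of the original tutorial): once generated, the
    # schema can drive CSS-based extraction on real pages. The product URL below is a
    # placeholder; extracted_content comes back as a JSON string.
    #
    #   extraction_config = CrawlerRunConfig(
    #       extraction_strategy=JsonCssExtractionStrategy(schema)
    #   )
    #   async with AsyncWebCrawler() as crawler:
    #       result = await crawler.arun("https://example.com/product", config=extraction_config)
    #       pprint(result.extracted_content)
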
# Run all sections
async def run_tutorial():
"""
Main function to run all tutorial sections.
"""
print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
print("===============================")
print("This tutorial demonstrates the key features of Crawl4AI v0.5.0")
print("Including deep crawling, memory-adaptive dispatching, advanced filtering,")
print("and more powerful extraction capabilities.")
# Sections to run
sections = [
deep_crawl, # 1. Deep Crawling with Best-First Strategy
memory_adaptive_dispatcher, # 2. Memory-Adaptive Dispatcher
http_crawler_strategy, # 3. HTTP Crawler Strategy
proxy_rotation, # 4. Proxy Rotation
llm_content_filter, # 5. LLM Content Filter
pdf_processing, # 6. PDF Processing
llm_schema_generation, # 7. Schema Generation using LLM
]
for section in sections:
try:
await section()
except Exception as e:
print(f"⚠️ Error in {section.__name__}: {e}")
print("\n🎉 TUTORIAL COMPLETE! 🎉")
print("You've now explored the key features of Crawl4AI v0.5.0")
print("For more information, visit https://docs.crawl4ai.com")
# Run the tutorial
if __name__ == "__main__":
    asyncio.run(run_tutorial())