Commit 19df96e

feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
1 parent b957ff2 commit 19df96e

12 files changed: +257 −162 lines changed
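Taken together, the changes below wire a rotation strategy into the crawler configuration. A minimal sketch of the intended usage, assuming only the new public API shown in the diffs; the proxy servers and target URL are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RoundRobinProxyStrategy

async def main():
    # Round-robin over a small pool of placeholder proxies; only "server" is required
    strategy = RoundRobinProxyStrategy([
        {"server": "http://111.111.111.111:8080"},
        {"server": "http://222.222.222.222:8080"},
    ])

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=strategy,  # new parameter introduced by this commit
    )

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://httpbin.org/ip", config=run_config)
        print(result.success)

asyncio.run(main())
```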

Diff for: crawl4ai/__init__.py (+24 −18)

@@ -8,6 +8,10 @@
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .proxy_strategy import (
+    ProxyRotationStrategy,
+    RoundRobinProxyStrategy,
+)
 from .extraction_strategy import (
     ExtractionStrategy,
     LLMExtractionStrategy,
@@ -60,31 +64,33 @@
     "DisplayMode",
     "MarkdownGenerationResult",
     "Crawl4aiDockerClient",
+    "ProxyRotationStrategy",
+    "RoundRobinProxyStrategy",
 ]
 
 
-def is_sync_version_installed():
-    try:
-        import selenium  # noqa
+# def is_sync_version_installed():
+#     try:
+#         import selenium  # noqa
 
-        return True
-    except ImportError:
-        return False
+#         return True
+#     except ImportError:
+#         return False
 
 
-if is_sync_version_installed():
-    try:
-        from .web_crawler import WebCrawler
+# if is_sync_version_installed():
+#     try:
+#         from .web_crawler import WebCrawler
 
-        __all__.append("WebCrawler")
-    except ImportError:
-        print(
-            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
-        )
-else:
-    WebCrawler = None
-    # import warnings
-    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+#     __all__.append("WebCrawler")
+#     except ImportError:
+#         print(
+#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+#         )
+# else:
+#     WebCrawler = None
+#     # import warnings
+#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
 
 # Disable all Pydantic warnings
 warnings.filterwarnings("ignore", module="pydantic")
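With these exports in place, both strategy classes are importable from the package root, while the synchronous `WebCrawler` import path is gone (see the BREAKING CHANGE note above). A quick illustration:

```python
# New public exports added by this commit
from crawl4ai import ProxyRotationStrategy, RoundRobinProxyStrategy

# from crawl4ai import WebCrawler  # removed: the synchronous crawler is no longer exported
```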

Diff for: crawl4ai/async_configs.py (+5)

@@ -16,6 +16,7 @@
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
 from .cache_context import CacheMode
+from .proxy_strategy import ProxyRotationStrategy
 
 import inspect
 from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ def __init__(
         parser_type: str = "lxml",
         scraping_strategy: ContentScrapingStrategy = None,
         proxy_config: dict = None,
+        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
         # SSL Parameters
         fetch_ssl_certificate: bool = False,
         # Caching Parameters
@@ -620,6 +622,7 @@ def __init__(
         self.parser_type = parser_type
         self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
         self.proxy_config = proxy_config
+        self.proxy_rotation_strategy = proxy_rotation_strategy
 
         # SSL Parameters
         self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
             parser_type=kwargs.get("parser_type", "lxml"),
             scraping_strategy=kwargs.get("scraping_strategy"),
             proxy_config=kwargs.get("proxy_config"),
+            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
             # SSL Parameters
             fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
             # Caching Parameters
@@ -827,6 +831,7 @@ def to_dict(self):
             "parser_type": self.parser_type,
             "scraping_strategy": self.scraping_strategy,
             "proxy_config": self.proxy_config,
+            "proxy_rotation_strategy": self.proxy_rotation_strategy,
             "fetch_ssl_certificate": self.fetch_ssl_certificate,
             "cache_mode": self.cache_mode,
             "session_id": self.session_id,

Diff for: crawl4ai/async_webcrawler.py (+13)

@@ -394,6 +394,19 @@ async def arun(
                 tag="FETCH",
             )
 
+            # Update proxy configuration from rotation strategy if available
+            if config and config.proxy_rotation_strategy:
+                next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
+                if next_proxy:
+                    if verbose:
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.get("server")},
+                        )
+                    config.proxy_config = next_proxy
+                    # config = config.clone(proxy_config=next_proxy)
+
             # Fetch fresh content if needed
             if not cached_result or not html:
                 t1 = time.perf_counter()
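Because `arun()` only calls `get_next_proxy()` on whatever strategy the config carries, any `ProxyRotationStrategy` subclass can plug into this hook. A hedged sketch of a random-choice variant that is not part of this commit:

```python
import random
from typing import Dict, List, Optional

from crawl4ai import ProxyRotationStrategy

class RandomProxyStrategy(ProxyRotationStrategy):
    """Hypothetical strategy: pick a proxy at random on every request."""

    def __init__(self, proxies: Optional[List[Dict]] = None):
        self._proxies: List[Dict] = list(proxies or [])

    def add_proxies(self, proxies: List[Dict]):
        # Grow the pool; no cycle object needed for random selection
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        # Return None when the pool is empty, matching the round-robin behaviour
        return random.choice(self._proxies) if self._proxies else None
```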

Diff for: crawl4ai/proxy_strategy.py (+43, new file)

@@ -0,0 +1,43 @@

from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle

class ProxyRotationStrategy(ABC):
    """Base abstract class for proxy rotation strategies"""

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy configuration from the strategy"""
        pass

    @abstractmethod
    def add_proxies(self, proxies: List[Dict]):
        """Add proxy configurations to the strategy"""
        pass

class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Simple round-robin proxy rotation strategy"""

    def __init__(self, proxies: List[Dict] = None):
        """
        Initialize with optional list of proxy configurations

        Args:
            proxies: List of proxy config dictionaries, each containing at least
                    'server' key with proxy URL
        """
        self._proxies = []
        self._proxy_cycle = None
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        """Add new proxies to the rotation pool"""
        self._proxies.extend(proxies)
        self._proxy_cycle = cycle(self._proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy in round-robin fashion"""
        if not self._proxy_cycle:
            return None
        return next(self._proxy_cycle)
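A quick way to see the round-robin behaviour in isolation (the proxy servers are placeholders):

```python
import asyncio
from crawl4ai import RoundRobinProxyStrategy

async def main():
    strategy = RoundRobinProxyStrategy([
        {"server": "http://111.111.111.111:8080"},
        {"server": "http://222.222.222.222:8080"},
    ])
    for _ in range(4):
        proxy = await strategy.get_next_proxy()
        print(proxy["server"])  # alternates between the two servers, wrapping around

asyncio.run(main())
```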

Diff for: docs/examples/proxy_rotation_demo.py (+161, new file)

@@ -0,0 +1,161 @@

import os
import re
from typing import List, Dict
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    RoundRobinProxyStrategy
)

def load_proxies_from_env() -> List[Dict]:
    """Load proxies from PROXIES environment variable"""
    proxies = []
    try:
        proxy_list = os.getenv("PROXIES", "").split(",")
        for proxy in proxy_list:
            if not proxy:
                continue
            ip, port, username, password = proxy.split(":")
            proxies.append({
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip  # Store original IP for verification
            })
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return proxies

async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    ===============================================
    Demonstrates proxy rotation using the strategy pattern.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    # Load proxies and create rotation strategy
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy
    )

    # Test URLs
    urls = ["https://httpbin.org/ip"] * len(proxies)  # Test each proxy once

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=run_config)

            if result.success:
                # Extract IP from response
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None

                if current_proxy:
                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Request failed: {result.error_message}")

async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =======================================
    Demonstrates proxy rotation using arun_many with memory dispatcher.
    """
    print("\n=== Proxy Rotation Batch Demo ===")

    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator()
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED
            )

            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # len(proxies), # Match concurrent sessions to proxy count
                # monitor=monitor
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Verify results
            success_count = 0
            for result in results:
                if result.success:
                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                    current_proxy = run_config.proxy_config if run_config.proxy_config else None

                    if current_proxy and ip_match:
                        print(f"URL {result.url}")
                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
                        verified = ip_match.group(0) == current_proxy['ip']
                        if verified:
                            print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                            success_count += 1
                        else:
                            print("❌ Proxy failed or IP mismatch!")
                    print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")

if __name__ == "__main__":
    import asyncio
    from crawl4ai import (
        CrawlerMonitor,
        DisplayMode,
        MemoryAdaptiveDispatcher,
        DefaultMarkdownGenerator
    )

    async def run_demos():
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo

    asyncio.run(run_demos())
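The demo expects `PROXIES` as a comma-separated list of `ip:port:username:password` entries, mirroring `load_proxies_from_env()` above. A small sketch with placeholder credentials that exercises the same parsing:

```python
import os

# Placeholder credentials in the ip:port:username:password format the demo expects
os.environ["PROXIES"] = (
    "111.111.111.111:8080:user1:pass1,"
    "222.222.222.222:8080:user2:pass2"
)

# Each entry splits into exactly four fields, matching proxy.split(":") in the demo
for entry in os.environ["PROXIES"].split(","):
    ip, port, username, password = entry.split(":")
    print(f"http://{ip}:{port}", username)
```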

Diff for: docs/md_v2/api/async-webcrawler.md (+2 −34)

@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
 
 ### 4.2 Example Usage
 
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
-
-# Configure browser
-browser_cfg = BrowserConfig(headless=True)
-
-# Configure crawler with rate limiting
-run_cfg = CrawlerRunConfig(
-    # Enable rate limiting
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds
-        max_delay=30.0,  # Maximum delay after rate limit hits
-        max_retries=2,  # Number of retries before giving up
-        rate_limit_codes=[429, 503]  # Status codes that trigger rate limiting
-    ),
-    # Resource monitoring
-    memory_threshold_percent=70.0,  # Pause if memory exceeds this
-    check_interval=0.5,  # How often to check resources
-    max_session_permit=3,  # Maximum concurrent crawls
-    display_mode=DisplayMode.DETAILED.value  # Show detailed progress
-)
+Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
 
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-]
-
-async with AsyncWebCrawler(config=browser_cfg) as crawler:
-    results = await crawler.arun_many(urls, config=run_cfg)
-    for result in results:
-        print(f"URL: {result.url}, Success: {result.success}")
-```
+```python
 
 ### 4.3 Key Features
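With `enable_rate_limiting` and `RateLimitConfig` removed from this example, concurrency control now lives in the dispatcher passed to `arun_many()`. A hedged replacement sketch, reusing only names and parameters that appear in the demo script in this commit; the URLs are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher

async def main():
    browser_cfg = BrowserConfig(headless=True)
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # pause when memory use crosses this level
        check_interval=0.5,             # seconds between resource checks
        max_session_permit=3,           # maximum concurrent crawls
    )

    urls = ["https://example.com/page1", "https://example.com/page2"]
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls=urls, config=run_cfg, dispatcher=dispatcher)
        for result in results:
            print(result.url, result.success)

asyncio.run(main())
```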
