Commit 19df96e

feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
1 parent b957ff2 commit 19df96e

12 files changed: +257 −162 lines changed
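Taken together, the changes below wire a rotation strategy into the crawler configuration. A minimal sketch of the intended usage, assuming only the new public API shown in the diffs; the proxy servers and target URL are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RoundRobinProxyStrategy

async def main():
    # Round-robin over a small pool of placeholder proxies; only "server" is required
    strategy = RoundRobinProxyStrategy([
        {"server": "http://111.111.111.111:8080"},
        {"server": "http://222.222.222.222:8080"},
    ])

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=strategy,  # new parameter introduced by this commit
    )

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url="https://httpbin.org/ip", config=run_config)
        print(result.success)

asyncio.run(main())
```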

Diff for: crawl4ai/__init__.py (+24 −18)

@@ -8,6 +8,10 @@
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .proxy_strategy import (
+    ProxyRotationStrategy,
+    RoundRobinProxyStrategy,
+)
 from .extraction_strategy import (
     ExtractionStrategy,
     LLMExtractionStrategy,
@@ -60,31 +64,33 @@
     "DisplayMode",
     "MarkdownGenerationResult",
     "Crawl4aiDockerClient",
+    "ProxyRotationStrategy",
+    "RoundRobinProxyStrategy",
 ]
 
 
-def is_sync_version_installed():
-    try:
-        import selenium  # noqa
+# def is_sync_version_installed():
+#     try:
+#         import selenium  # noqa
 
-        return True
-    except ImportError:
-        return False
+#         return True
+#     except ImportError:
+#         return False
 
 
-if is_sync_version_installed():
-    try:
-        from .web_crawler import WebCrawler
+# if is_sync_version_installed():
+#     try:
+#         from .web_crawler import WebCrawler
 
-        __all__.append("WebCrawler")
-    except ImportError:
-        print(
-            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
-        )
-else:
-    WebCrawler = None
-    # import warnings
-    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+#     __all__.append("WebCrawler")
+#     except ImportError:
+#         print(
+#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+#         )
+# else:
+#     WebCrawler = None
+#     # import warnings
+#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
 
 # Disable all Pydantic warnings
 warnings.filterwarnings("ignore", module="pydantic")
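With these exports in place, both strategy classes are importable from the package root, while the synchronous `WebCrawler` import path is gone (see the BREAKING CHANGE note above). A quick illustration:

```python
# New public exports added by this commit
from crawl4ai import ProxyRotationStrategy, RoundRobinProxyStrategy

# from crawl4ai import WebCrawler  # removed: the synchronous crawler is no longer exported
```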

Diff for: crawl4ai/async_configs.py (+5)

@@ -16,6 +16,7 @@
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
 from .cache_context import CacheMode
+from .proxy_strategy import ProxyRotationStrategy
 
 import inspect
 from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ def __init__(
         parser_type: str = "lxml",
         scraping_strategy: ContentScrapingStrategy = None,
         proxy_config: dict = None,
+        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
         # SSL Parameters
         fetch_ssl_certificate: bool = False,
         # Caching Parameters
@@ -620,6 +622,7 @@ def __init__(
         self.parser_type = parser_type
         self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
         self.proxy_config = proxy_config
+        self.proxy_rotation_strategy = proxy_rotation_strategy
 
         # SSL Parameters
         self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
             parser_type=kwargs.get("parser_type", "lxml"),
             scraping_strategy=kwargs.get("scraping_strategy"),
             proxy_config=kwargs.get("proxy_config"),
+            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
             # SSL Parameters
             fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
             # Caching Parameters
@@ -827,6 +831,7 @@ def to_dict(self):
             "parser_type": self.parser_type,
             "scraping_strategy": self.scraping_strategy,
             "proxy_config": self.proxy_config,
+            "proxy_rotation_strategy": self.proxy_rotation_strategy,
             "fetch_ssl_certificate": self.fetch_ssl_certificate,
             "cache_mode": self.cache_mode,
             "session_id": self.session_id,

Diff for: crawl4ai/async_webcrawler.py (+13)

@@ -394,6 +394,19 @@ async def arun(
                 tag="FETCH",
             )
 
+            # Update proxy configuration from rotation strategy if available
+            if config and config.proxy_rotation_strategy:
+                next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
+                if next_proxy:
+                    if verbose:
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.get("server")},
+                        )
+                    config.proxy_config = next_proxy
+                    # config = config.clone(proxy_config=next_proxy)
+
             # Fetch fresh content if needed
             if not cached_result or not html:
                 t1 = time.perf_counter()
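Because `arun()` only calls `get_next_proxy()` on whatever strategy the config carries, any `ProxyRotationStrategy` subclass can plug into this hook. A hedged sketch of a random-choice variant that is not part of this commit:

```python
import random
from typing import Dict, List, Optional

from crawl4ai import ProxyRotationStrategy

class RandomProxyStrategy(ProxyRotationStrategy):
    """Hypothetical strategy: pick a proxy at random on every request."""

    def __init__(self, proxies: Optional[List[Dict]] = None):
        self._proxies: List[Dict] = list(proxies or [])

    def add_proxies(self, proxies: List[Dict]):
        # Grow the pool; no cycle object needed for random selection
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        # Return None when the pool is empty, matching the round-robin behaviour
        return random.choice(self._proxies) if self._proxies else None
```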

Diff for: crawl4ai/proxy_strategy.py (+43, new file)

@@ -0,0 +1,43 @@

from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle

class ProxyRotationStrategy(ABC):
    """Base abstract class for proxy rotation strategies"""

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy configuration from the strategy"""
        pass

    @abstractmethod
    def add_proxies(self, proxies: List[Dict]):
        """Add proxy configurations to the strategy"""
        pass

class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Simple round-robin proxy rotation strategy"""

    def __init__(self, proxies: List[Dict] = None):
        """
        Initialize with optional list of proxy configurations

        Args:
            proxies: List of proxy config dictionaries, each containing at least
                    'server' key with proxy URL
        """
        self._proxies = []
        self._proxy_cycle = None
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        """Add new proxies to the rotation pool"""
        self._proxies.extend(proxies)
        self._proxy_cycle = cycle(self._proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy in round-robin fashion"""
        if not self._proxy_cycle:
            return None
        return next(self._proxy_cycle)
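A quick way to see the round-robin behaviour in isolation (the proxy servers are placeholders):

```python
import asyncio
from crawl4ai import RoundRobinProxyStrategy

async def main():
    strategy = RoundRobinProxyStrategy([
        {"server": "http://111.111.111.111:8080"},
        {"server": "http://222.222.222.222:8080"},
    ])
    for _ in range(4):
        proxy = await strategy.get_next_proxy()
        print(proxy["server"])  # alternates between the two servers, wrapping around

asyncio.run(main())
```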

Diff for: docs/examples/proxy_rotation_demo.py (+161, new file)

@@ -0,0 +1,161 @@

import os
import re
from typing import List, Dict
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    RoundRobinProxyStrategy
)

def load_proxies_from_env() -> List[Dict]:
    """Load proxies from PROXIES environment variable"""
    proxies = []
    try:
        proxy_list = os.getenv("PROXIES", "").split(",")
        for proxy in proxy_list:
            if not proxy:
                continue
            ip, port, username, password = proxy.split(":")
            proxies.append({
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip  # Store original IP for verification
            })
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return proxies

async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    ===============================================
    Demonstrates proxy rotation using the strategy pattern.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    # Load proxies and create rotation strategy
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy
    )

    # Test URLs
    urls = ["https://httpbin.org/ip"] * len(proxies)  # Test each proxy once

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=run_config)

            if result.success:
                # Extract IP from response
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None

                if current_proxy:
                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Request failed: {result.error_message}")

async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =======================================
    Demonstrates proxy rotation using arun_many with memory dispatcher.
    """
    print("\n=== Proxy Rotation Batch Demo ===")

    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator()
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED
            )

            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # len(proxies), # Match concurrent sessions to proxy count
                # monitor=monitor
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Verify results
            success_count = 0
            for result in results:
                if result.success:
                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                    current_proxy = run_config.proxy_config if run_config.proxy_config else None

                    if current_proxy and ip_match:
                        print(f"URL {result.url}")
                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
                        verified = ip_match.group(0) == current_proxy['ip']
                        if verified:
                            print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                            success_count += 1
                        else:
                            print("❌ Proxy failed or IP mismatch!")
                    print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")

if __name__ == "__main__":
    import asyncio
    from crawl4ai import (
        CrawlerMonitor,
        DisplayMode,
        MemoryAdaptiveDispatcher,
        DefaultMarkdownGenerator
    )

    async def run_demos():
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo

    asyncio.run(run_demos())
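The demo expects `PROXIES` as a comma-separated list of `ip:port:username:password` entries, mirroring `load_proxies_from_env()` above. A small sketch with placeholder credentials that exercises the same parsing:

```python
import os

# Placeholder credentials in the ip:port:username:password format the demo expects
os.environ["PROXIES"] = (
    "111.111.111.111:8080:user1:pass1,"
    "222.222.222.222:8080:user2:pass2"
)

# Each entry splits into exactly four fields, matching proxy.split(":") in the demo
for entry in os.environ["PROXIES"].split(","):
    ip, port, username, password = entry.split(":")
    print(f"http://{ip}:{port}", username)
```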

Diff for: docs/md_v2/api/async-webcrawler.md (+2 −34)

@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
 
 ### 4.2 Example Usage
 
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
-
-# Configure browser
-browser_cfg = BrowserConfig(headless=True)
-
-# Configure crawler with rate limiting
-run_cfg = CrawlerRunConfig(
-    # Enable rate limiting
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds
-        max_delay=30.0,  # Maximum delay after rate limit hits
-        max_retries=2,  # Number of retries before giving up
-        rate_limit_codes=[429, 503]  # Status codes that trigger rate limiting
-    ),
-    # Resource monitoring
-    memory_threshold_percent=70.0,  # Pause if memory exceeds this
-    check_interval=0.5,  # How often to check resources
-    max_session_permit=3,  # Maximum concurrent crawls
-    display_mode=DisplayMode.DETAILED.value  # Show detailed progress
-)
+Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
 
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-]
-
-async with AsyncWebCrawler(config=browser_cfg) as crawler:
-    results = await crawler.arun_many(urls, config=run_cfg)
-    for result in results:
-        print(f"URL: {result.url}, Success: {result.success}")
-```
+```python
 
 ### 4.3 Key Features
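With `enable_rate_limiting` and `RateLimitConfig` removed from this example, concurrency control now lives in the dispatcher passed to `arun_many()`. A hedged replacement sketch, reusing only names and parameters that appear in the demo script in this commit; the URLs are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher

async def main():
    browser_cfg = BrowserConfig(headless=True)
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # pause when memory use crosses this level
        check_interval=0.5,             # seconds between resource checks
        max_session_permit=3,           # maximum concurrent crawls
    )

    urls = ["https://example.com/page1", "https://example.com/page2"]
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls=urls, config=run_cfg, dispatcher=dispatcher)
        for result in results:
            print(result.url, result.success)

asyncio.run(main())
```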
