identity_based_browsing.py
"""
Identity-Based Browsing Example with Crawl4AI
This example demonstrates how to:
1. Create a persistent browser profile interactively
2. List available profiles
3. Use a saved profile for crawling authenticated sites
4. Delete profiles when no longer needed
Uses the new BrowserProfiler class for profile management.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.browser_profiler import BrowserProfiler
from crawl4ai.async_logger import AsyncLogger
from colorama import Fore, Style, init
# Initialize colorama
init()
# Create a shared logger instance
logger = AsyncLogger(verbose=True)
# Create a shared BrowserProfiler instance
profiler = BrowserProfiler(logger=logger)

async def crawl_with_profile(profile_path, url):
    """Use a profile to crawl an authenticated page"""
    logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")

    # Create browser config with the profile path
    browser_config = BrowserConfig(
        headless=False,  # Set to False if you want to see the browser window
        use_managed_browser=True,  # Required for persistent profiles
        user_data_dir=profile_path
    )

    start_time = asyncio.get_event_loop().time()

    # Initialize crawler with the browser config
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl the URL - you should have access to authenticated content now
        result = await crawler.arun(url)

        elapsed_time = asyncio.get_event_loop().time() - start_time

        if result.success:
            # Use url_status method for consistent logging
            logger.url_status(url, True, elapsed_time, tag="CRAWL")

            # Print page title or some indication of success
            title = result.metadata.get("title", "")
            logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")

            return result
        else:
            # Log error status
            logger.error_status(url, result.error_message, tag="CRAWL")
            return None

async def main():
    logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
    logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")

    # Choose between interactive mode and automatic mode
    mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()

    if mode == 'i':
        # Interactive profile management - use the interactive_manager method
        # Pass the crawl_with_profile function as the callback for the "crawl a website" option
        await profiler.interactive_manager(crawl_callback=crawl_with_profile)
    else:
        # Automatic mode - simplified example
        profiles = profiler.list_profiles()

        if not profiles:
            # Create a new profile if none exists
            logger.info("No profiles found. Creating a new one...", tag="DEMO")
            profile_path = await profiler.create_profile()
            if not profile_path:
                logger.error("Cannot proceed without a valid profile", tag="DEMO")
                return
        else:
            # Use the first (most recent) profile
            profile_path = profiles[0]["path"]
            logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")

        # Example: Crawl an authenticated page
        urls_to_crawl = [
            "https://github.com/settings/profile",  # GitHub requires login
            # "https://twitter.com/home",  # Twitter requires login
            # "https://www.linkedin.com/feed/",  # LinkedIn requires login
        ]

        for url in urls_to_crawl:
            await crawl_with_profile(profile_path, url)
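
        # Optional cleanup: the module docstring also mentions deleting profiles
        # when they are no longer needed. The call below is a minimal sketch,
        # assuming BrowserProfiler exposes a delete_profile() method that accepts
        # a profile name or path (verify against your installed crawl4ai version).
        # It is left commented out so this example does not remove the profile it
        # just used.
        # profiler.delete_profile(profile_path)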
if __name__ == "__main__":
try:
# Run the async main function
asyncio.run(main())
except KeyboardInterrupt:
logger.warning("Example interrupted by user", tag="DEMO")
except Exception as e:
logger.error(f"Error in example: {str(e)}", tag="DEMO")