-
Notifications
You must be signed in to change notification settings - Fork 3.3k
/
Copy pathscraping_strategies_performance.py
135 lines (111 loc) · 4.56 KB
/
scraping_strategies_performance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import time, re
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
import time
import functools
from collections import defaultdict
class TimingStats:
def __init__(self):
self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
def add(self, strategy_name, func_name, elapsed):
self.stats[strategy_name][func_name]["calls"] += 1
self.stats[strategy_name][func_name]["total_time"] += elapsed
def report(self):
for strategy_name, funcs in self.stats.items():
print(f"\n{strategy_name} Timing Breakdown:")
print("-" * 60)
print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
print("-" * 60)
for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True):
avg_ms = (data["total_time"] / data["calls"]) * 1000
print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
timing_stats = TimingStats()
# Modify timing decorator
def timing_decorator(strategy_name):
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
start = time.time()
result = func(*args, **kwargs)
elapsed = time.time() - start
timing_stats.add(strategy_name, func.__name__, elapsed)
return result
return wrapper
return decorator
# Modified decorator application
def apply_decorators(cls, method_name, strategy_name):
try:
original_method = getattr(cls, method_name)
decorated_method = timing_decorator(strategy_name)(original_method)
setattr(cls, method_name, decorated_method)
except AttributeError:
print(f"Method {method_name} not found in class {cls.__name__}.")
# Apply to key methods
methods_to_profile = [
'_scrap',
# 'process_element',
'_process_element',
'process_image',
]
# Apply decorators to both strategies
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
for method in methods_to_profile:
apply_decorators(strategy, method, name)
def generate_large_html(n_elements=1000):
html = ['<!DOCTYPE html><html><head></head><body>']
for i in range(n_elements):
html.append(f'''
<div class="article">
<h2>Heading {i}</h2>
<div>
<div>
<p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
</div>
</div>
<img src="image{i}.jpg" alt="Image {i}">
<ul>
<li>List item {i}.1</li>
<li>List item {i}.2</li>
</ul>
</div>
''')
html.append('</body></html>')
return ''.join(html)
def test_scraping():
# Initialize both scrapers
original_scraper = WebScrapingStrategy()
selected_scraper = LXMLWebScrapingStrategy()
# Generate test HTML
print("Generating HTML...")
html = generate_large_html(5000)
print(f"HTML Size: {len(html)/1024:.2f} KB")
# Time the scraping
print("\nStarting scrape...")
start_time = time.time()
kwargs = {
"url": "http://example.com",
"html": html,
"word_count_threshold": 5,
"keep_data_attributes": True
}
t1 = time.perf_counter()
result_selected = selected_scraper.scrap(**kwargs)
t2 = time.perf_counter()
result_original = original_scraper.scrap(**kwargs)
t3 = time.perf_counter()
elapsed = t3 - start_time
print(f"\nScraping completed in {elapsed:.2f} seconds")
timing_stats.report()
# Print stats of LXML output
print("\Turbo Output:")
print(f"\nExtracted links: {len(result_selected.links.internal) + len(result_selected.links.external)}")
print(f"Extracted images: {len(result_selected.media.images)}")
print(f"Clean HTML size: {len(result_selected.cleaned_html)/1024:.2f} KB")
print(f"Scraping time: {t2 - t1:.2f} seconds")
# Print stats of original output
print("\nOriginal Output:")
print(f"\nExtracted links: {len(result_original.links.internal) + len(result_original.links.external)}")
print(f"Extracted images: {len(result_original.media.images)}")
print(f"Clean HTML size: {len(result_original.cleaned_html)/1024:.2f} KB")
print(f"Scraping time: {t3 - t1:.2f} seconds")
if __name__ == "__main__":
test_scraping()