import asyncio
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import quote

from pydantic import BaseModel

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    CrawlerHub,
    CrawlResult,
    DefaultMarkdownGenerator,
    PruningContentFilter,
)

__current_dir = Path(__file__).parent

# Crawl4ai Hello Web
async def little_hello_web():
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org"
        )
        print(result.markdown.raw_markdown[:500])

async def hello_web():
    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                )
            ),
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        print(result.markdown.fit_markdown[:500])
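
# Note: raw_markdown (used in little_hello_web) is the full page converted to markdown,
# while fit_markdown is the pruned version produced by the PruningContentFilter configured
# above. To try either step on its own:
#
#   asyncio.run(little_hello_web())
#   asyncio.run(hello_web())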

# Naive Approach Using Large Language Models
async def extract_using_llm():
    print("Extracting using Large Language Models")
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()
    try:
        class Sitelink(BaseModel):
            title: str
            link: str

        class GoogleSearchResult(BaseModel):
            title: str
            link: str
            snippet: str
            sitelinks: Optional[List[Sitelink]] = None

        llm_extraction_strategy = LLMExtractionStrategy(
            provider="openai/gpt-4o",
            schema=GoogleSearchResult.model_json_schema(),
            instruction="""I want to extract the title, link, snippet, and sitelinks from a Google search result. I shared here the content of div#search from the search result page. We are just interested in organic search results.
            Example:
            {
                "title": "Google",
                "link": "https://www.google.com",
                "snippet": "Google is a search engine.",
                "sitelinks": [
                    {
                        "title": "Gmail",
                        "link": "https://mail.google.com"
                    },
                    {
                        "title": "Google Drive",
                        "link": "https://drive.google.com"
                    }
                ]
            }""",
            # apply_chunking=False,
            chunk_token_threshold=2 ** 12,  # 2^12 = 4096 tokens per chunk
            verbose=True,
            # input_format="html",  # html, markdown, cleaned_html
            input_format="cleaned_html",
        )
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            keep_attrs=["id", "class"],
            keep_data_attributes=True,
            delay_before_return_html=2,
            extraction_strategy=llm_extraction_strategy,
            css_selector="div#search",
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.google.com/search?q=apple%20inc&start=0&num=10",
            config=crawl_config,
        )
        search_result = {}
        if result.success:
            search_result = json.loads(result.extracted_content)
            # Save the search result to a file
            with open(__current_dir / "search_result_using_llm.json", "w") as f:
                f.write(json.dumps(search_result, indent=4))
            print(json.dumps(search_result, indent=4))
    finally:
        await crawler.close()
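
# The LLM-based approach assumes provider credentials are available to crawl4ai (for
# "openai/gpt-4o" this is typically an OPENAI_API_KEY environment variable; some versions
# also accept an explicit api_token argument on LLMExtractionStrategy). It is the simplest
# approach, but every search page costs LLM tokens; the schema-based approach below
# generates the extraction rules once and then runs without an LLM.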

# Schema generation from a single search-result element
async def schema_generator():
    print("Generating schema")
    html = ""
    # Load the sample HTML of one organic search result from file
    with open(__current_dir / "google_search_item.html", "r") as f:
        html = f.read()
    organic_schema = JsonCssExtractionStrategy.generate_schema(
        html=html,
        target_json_example="""{
            "title": "...",
            "link": "...",
            "snippet": "...",
            "date": "1 hour ago",
            "sitelinks": [
                {
                    "title": "...",
                    "link": "..."
                }
            ]
        }""",
        query="""The given HTML is the crawled HTML from the Google search result, which refers to one HTML element representing one organic Google search result. Please find the schema for the organic search item based on the given HTML. I am interested in the title, link, snippet text, sitelinks, and date.""",
    )
    print(json.dumps(organic_schema, indent=4))
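
# For reference, generate_schema() returns a plain dict of CSS selectors, roughly shaped
# like the illustrative (not actual) example below; the real selectors depend on the HTML
# passed in:
#
#   {
#       "name": "Organic Search Results",
#       "baseSelector": "div.g",
#       "fields": [
#           {"name": "title", "selector": "h3", "type": "text"},
#           {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
#       ]
#   }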

# Golden Standard
async def build_schema(html: str, force: bool = False) -> Dict[str, Any]:
    print("Building schema")
    schemas = {}

    # Organic results: load the cached schema unless forced to regenerate
    if (__current_dir / "organic_schema.json").exists() and not force:
        with open(__current_dir / "organic_schema.json", "r") as f:
            schemas["organic"] = json.loads(f.read())
    else:
        # Extract schema from the html
        organic_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "title": "...",
                "link": "...",
                "snippet": "...",
                "date": "1 hour ago",
                "sitelinks": [
                    {
                        "title": "...",
                        "link": "..."
                    }
                ]
            }""",
            query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the organic search item in the given HTML. I am interested in the title, link, snippet text, sitelinks, and date. Usually they are all inside a div#search.""",
        )
        # Save schema to file current_dir/organic_schema.json
        with open(__current_dir / "organic_schema.json", "w") as f:
            f.write(json.dumps(organic_schema, indent=4))
        schemas["organic"] = organic_schema

    # Repeat the same for the Top Stories schema
    if (__current_dir / "top_stories_schema.json").exists() and not force:
        with open(__current_dir / "top_stories_schema.json", "r") as f:
            schemas["top_stories"] = json.loads(f.read())
    else:
        top_stories_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "title": "...",
                "link": "...",
                "source": "Insider Monkey",
                "date": "1 hour ago"
            }""",
            query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the Top Stories item in the given HTML. I am interested in the title, link, source, and date.""",
        )
        with open(__current_dir / "top_stories_schema.json", "w") as f:
            f.write(json.dumps(top_stories_schema, indent=4))
        schemas["top_stories"] = top_stories_schema

    # Repeat the same for the suggested-queries schema
    if (__current_dir / "suggested_queries_schema.json").exists() and not force:
        with open(__current_dir / "suggested_queries_schema.json", "r") as f:
            schemas["suggested_queries"] = json.loads(f.read())
    else:
        suggested_queries_schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example="""{
                "query": "A for Apple"
            }""",
            query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "relatedSearches" at the bottom of the page. I am interested in the queries only.""",
        )
        with open(__current_dir / "suggested_queries_schema.json", "w") as f:
            f.write(json.dumps(suggested_queries_schema, indent=4))
        schemas["suggested_queries"] = suggested_queries_schema

    return schemas
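
# build_schema() is called from search() with the live page HTML, so each schema is
# generated once (one LLM call per schema) and afterwards served from the JSON files next
# to this script. Pass force=True to regenerate the cached schemas from fresh HTML:
#
#   schemas = await build_schema(result.html, force=True)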

async def search(q: str = "apple inc") -> Dict[str, Any]:
    print("Searching for:", q)
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)
    search_result: Dict[str, List[Dict[str, Any]]] = {}
    await crawler.start()
    try:
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            keep_attrs=["id", "class"],
            keep_data_attributes=True,
            delay_before_return_html=2,
        )
        result: CrawlResult = await crawler.arun(
            f"https://www.google.com/search?q={quote(q)}&start=0&num=10",
            config=crawl_config,
        )
        if result.success:
            schemas: Dict[str, Any] = await build_schema(result.html)
            for schema in schemas.values():
                schema_key = schema["name"].lower().replace(" ", "_")
                search_result[schema_key] = JsonCssExtractionStrategy(
                    schema=schema
                ).run(
                    url="",
                    sections=[result.html],
                )
            # Save the search result to a file
            with open(__current_dir / "search_result.json", "w") as f:
                f.write(json.dumps(search_result, indent=4))
            print(json.dumps(search_result, indent=4))
    finally:
        await crawler.close()

    return search_result
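
# Unlike extract_using_llm(), the extraction loop above is pure CSS matching: once the
# schemas exist, JsonCssExtractionStrategy.run() applies them to the fetched HTML locally,
# so repeated searches make no further LLM calls.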

# Example of using CrawlerHub
async def hub_example(query: str = "apple inc"):
    print("Using CrawlerHub")
    crawler_cls = CrawlerHub.get("google_search")
    crawler = crawler_cls()

    # Text search
    text_results = await crawler.run(
        query=query,
        search_type="text",
        schema_cache_path="/Users/unclecode/.crawl4ai",  # NOTE: adjust this cache path for your machine
    )

    # Save the search result to a file
    with open(__current_dir / "search_result_using_hub.json", "w") as f:
        f.write(json.dumps(json.loads(text_results), indent=4))
    print(json.dumps(json.loads(text_results), indent=4))
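
# CrawlerHub wraps the same pipeline (fetch, cached schema, CSS extraction) behind a named
# crawler; the "google_search" crawler returns its results as a JSON string, which is why
# text_results is parsed with json.loads() before being saved.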

async def demo():
    # Step 1: Introduction & Overview
    # await little_hello_web()
    # await hello_web()

    # Step 2: Demo of the end result, using the hub
    # await hub_example()

    # Step 3: Using an LLM for extraction
    # await extract_using_llm()

    # Step 4: Get familiar with schema generation
    # await schema_generator()

    # Step 5: Golden Standard
    # await search()

    # Step 6: Introduction to CrawlerHub
    await hub_example()


if __name__ == "__main__":
    asyncio.run(demo())