Skip to content

Commit 503dbd1

Browse files
committed
feat: merged localscraper into smartscraper
1 parent c898e99 commit 503dbd1

File tree

7 files changed

+48
-191
lines changed

7 files changed

+48
-191
lines changed

scrapegraph-py/examples/localscraper_example.py

-31
This file was deleted.

scrapegraph-py/examples/smartscraper_example.py

+2
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99
# SmartScraper request
1010
response = sgai_client.smartscraper(
1111
website_url="https://example.com",
12+
# website_html="...", # Optional, if you want to pass in HTML content instead of a URL
1213
user_prompt="Extract the main heading, description, and summary of the webpage",
1314
)
1415

16+
1517
# Print the response
1618
print(f"Request ID: {response['request_id']}")
1719
print(f"Result: {response['result']}")

scrapegraph-py/examples/smartscraper_schema_example.py

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class WebpageSchema(BaseModel):
1616
# SmartScraper request with output schema
1717
response = sgai_client.smartscraper(
1818
website_url="https://example.com",
19+
# website_html="...", # Optional, if you want to pass in HTML content instead of a URL
1920
user_prompt="Extract webpage information",
2021
output_schema=WebpageSchema,
2122
)

scrapegraph-py/scrapegraph_py/async_client.py

+8-43
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,6 @@
99
from scrapegraph_py.exceptions import APIError
1010
from scrapegraph_py.logger import sgai_logger as logger
1111
from scrapegraph_py.models.feedback import FeedbackRequest
12-
from scrapegraph_py.models.localscraper import (
13-
GetLocalScraperRequest,
14-
LocalScraperRequest,
15-
)
1612
from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
1713
from scrapegraph_py.models.smartscraper import (
1814
GetSmartScraperRequest,
@@ -165,16 +161,22 @@ async def get_markdownify(self, request_id: str):
165161

166162
async def smartscraper(
167163
self,
168-
website_url: str,
169164
user_prompt: str,
165+
website_url: Optional[str] = None,
166+
website_html: Optional[str] = None,
170167
output_schema: Optional[BaseModel] = None,
171168
):
172169
"""Send a smartscraper request"""
173-
logger.info(f"🔍 Starting smartscraper request for {website_url}")
170+
logger.info("🔍 Starting smartscraper request")
171+
if website_url:
172+
logger.debug(f"🌐 URL: {website_url}")
173+
if website_html:
174+
logger.debug("📄 Using provided HTML content")
174175
logger.debug(f"📝 Prompt: {user_prompt}")
175176

176177
request = SmartScraperRequest(
177178
website_url=website_url,
179+
website_html=website_html,
178180
user_prompt=user_prompt,
179181
output_schema=output_schema,
180182
)
@@ -200,43 +202,6 @@ async def get_smartscraper(self, request_id: str):
200202
logger.info(f"✨ Successfully retrieved result for request {request_id}")
201203
return result
202204

203-
async def localscraper(
204-
self,
205-
user_prompt: str,
206-
website_html: str,
207-
output_schema: Optional[BaseModel] = None,
208-
):
209-
"""Send a localscraper request"""
210-
logger.info("🔍 Starting localscraper request")
211-
logger.debug(f"📝 Prompt: {user_prompt}")
212-
213-
request = LocalScraperRequest(
214-
user_prompt=user_prompt,
215-
website_html=website_html,
216-
output_schema=output_schema,
217-
)
218-
logger.debug("✅ Request validation passed")
219-
220-
result = await self._make_request(
221-
"POST", f"{API_BASE_URL}/localscraper", json=request.model_dump()
222-
)
223-
logger.info("✨ Localscraper request completed successfully")
224-
return result
225-
226-
async def get_localscraper(self, request_id: str):
227-
"""Get the result of a previous localscraper request"""
228-
logger.info(f"🔍 Fetching localscraper result for request {request_id}")
229-
230-
# Validate input using Pydantic model
231-
GetLocalScraperRequest(request_id=request_id)
232-
logger.debug("✅ Request ID validation passed")
233-
234-
result = await self._make_request(
235-
"GET", f"{API_BASE_URL}/localscraper/{request_id}"
236-
)
237-
logger.info(f"✨ Successfully retrieved result for request {request_id}")
238-
return result
239-
240205
async def submit_feedback(
241206
self, request_id: str, rating: int, feedback_text: Optional[str] = None
242207
):

scrapegraph-py/scrapegraph_py/client.py

+8-41
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,6 @@
1010
from scrapegraph_py.exceptions import APIError
1111
from scrapegraph_py.logger import sgai_logger as logger
1212
from scrapegraph_py.models.feedback import FeedbackRequest
13-
from scrapegraph_py.models.localscraper import (
14-
GetLocalScraperRequest,
15-
LocalScraperRequest,
16-
)
1713
from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
1814
from scrapegraph_py.models.smartscraper import (
1915
GetSmartScraperRequest,
@@ -175,16 +171,22 @@ def get_markdownify(self, request_id: str):
175171

176172
def smartscraper(
177173
self,
178-
website_url: str,
179174
user_prompt: str,
175+
website_url: Optional[str] = None,
176+
website_html: Optional[str] = None,
180177
output_schema: Optional[BaseModel] = None,
181178
):
182179
"""Send a smartscraper request"""
183-
logger.info(f"🔍 Starting smartscraper request for {website_url}")
180+
logger.info("🔍 Starting smartscraper request")
181+
if website_url:
182+
logger.debug(f"🌐 URL: {website_url}")
183+
if website_html:
184+
logger.debug("📄 Using provided HTML content")
184185
logger.debug(f"📝 Prompt: {user_prompt}")
185186

186187
request = SmartScraperRequest(
187188
website_url=website_url,
189+
website_html=website_html,
188190
user_prompt=user_prompt,
189191
output_schema=output_schema,
190192
)
@@ -208,41 +210,6 @@ def get_smartscraper(self, request_id: str):
208210
logger.info(f"✨ Successfully retrieved result for request {request_id}")
209211
return result
210212

211-
def localscraper(
212-
self,
213-
user_prompt: str,
214-
website_html: str,
215-
output_schema: Optional[BaseModel] = None,
216-
):
217-
"""Send a localscraper request"""
218-
logger.info("🔍 Starting localscraper request")
219-
logger.debug(f"📝 Prompt: {user_prompt}")
220-
221-
request = LocalScraperRequest(
222-
user_prompt=user_prompt,
223-
website_html=website_html,
224-
output_schema=output_schema,
225-
)
226-
logger.debug("✅ Request validation passed")
227-
228-
result = self._make_request(
229-
"POST", f"{API_BASE_URL}/localscraper", json=request.model_dump()
230-
)
231-
logger.info("✨ Localscraper request completed successfully")
232-
return result
233-
234-
def get_localscraper(self, request_id: str):
235-
"""Get the result of a previous localscraper request"""
236-
logger.info(f"🔍 Fetching localscraper result for request {request_id}")
237-
238-
# Validate input using Pydantic model
239-
GetLocalScraperRequest(request_id=request_id)
240-
logger.debug("✅ Request ID validation passed")
241-
242-
result = self._make_request("GET", f"{API_BASE_URL}/localscraper/{request_id}")
243-
logger.info(f"✨ Successfully retrieved result for request {request_id}")
244-
return result
245-
246213
def submit_feedback(
247214
self, request_id: str, rating: int, feedback_text: Optional[str] = None
248215
):

scrapegraph-py/scrapegraph_py/models/localscraper.py

-67
This file was deleted.

scrapegraph-py/scrapegraph_py/models/smartscraper.py

+29-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Optional, Type
44
from uuid import UUID
55

6+
from bs4 import BeautifulSoup
67
from pydantic import BaseModel, Field, model_validator
78

89

@@ -11,7 +12,14 @@ class SmartScraperRequest(BaseModel):
1112
...,
1213
example="Extract info about the company",
1314
)
14-
website_url: str = Field(..., example="https://scrapegraphai.com/")
15+
website_url: Optional[str] = Field(
16+
default=None, example="https://scrapegraphai.com/"
17+
)
18+
website_html: Optional[str] = Field(
19+
default=None,
20+
example="<html><body><h1>Title</h1><p>Content</p></body></html>",
21+
description="HTML content, maximum size 2MB",
22+
)
1523
output_schema: Optional[Type[BaseModel]] = None
1624

1725
@model_validator(mode="after")
@@ -23,14 +31,26 @@ def validate_user_prompt(self) -> "SmartScraperRequest":
2331
return self
2432

2533
@model_validator(mode="after")
26-
def validate_url(self) -> "SmartScraperRequest":
27-
if self.website_url is None or not self.website_url.strip():
28-
raise ValueError("Website URL cannot be empty")
29-
if not (
30-
self.website_url.startswith("http://")
31-
or self.website_url.startswith("https://")
32-
):
33-
raise ValueError("Invalid URL")
34+
def validate_url_and_html(self) -> "SmartScraperRequest":
35+
if self.website_html is not None:
36+
if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024:
37+
raise ValueError("Website HTML content exceeds maximum size of 2MB")
38+
try:
39+
soup = BeautifulSoup(self.website_html, "html.parser")
40+
if not soup.find():
41+
raise ValueError("Invalid HTML - no parseable content found")
42+
except Exception as e:
43+
raise ValueError(f"Invalid HTML structure: {str(e)}")
44+
elif self.website_url is not None:
45+
if not self.website_url.strip():
46+
raise ValueError("Website URL cannot be empty")
47+
if not (
48+
self.website_url.startswith("http://")
49+
or self.website_url.startswith("https://")
50+
):
51+
raise ValueError("Invalid URL")
52+
else:
53+
raise ValueError("Either website_url or website_html must be provided")
3454
return self
3555

3656
def model_dump(self, *args, **kwargs) -> dict:

0 commit comments

Comments (0)