Skip to content

Commit 381c146

Browse files
authored
Merge pull request #34 from CodeBeaverAI/codebeaver/crewai-29
Codebeaver/crewai 29
2 parents 973eeab + 773d46a commit 381c146

File tree

4 files changed

+234
-0
lines changed

4 files changed

+234
-0
lines changed

codebeaver.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# CodeBeaver workspace configuration: one entry per SDK in this monorepo.
# NOTE(review): `from:` appears to name the test framework each workspace is
# driven by (jest / pytest) — confirm against CodeBeaver docs.
workspaces:
  # JavaScript SDK
  - from: jest
    name: scrapegraph-js
    path: scrapegraph-js
  # Python SDK
  - from: pytest
    name: scrapegraph-py
    path: scrapegraph-py
+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import pytest
2+
from pydantic import BaseModel
3+
from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest
4+
5+
class DummySchema(BaseModel):
    """Minimal Pydantic model used to exercise output_schema conversion in model_dump."""

    test_field: str
8+
9+
def test_output_schema_conversion():
    """
    A LocalScraperRequest built with an output_schema must dump that key as
    the JSON schema of the supplied Pydantic model, not the class itself.
    """
    request = LocalScraperRequest(
        user_prompt="Extract company details",
        website_html="<html><body><div>Content</div></body></html>",
        output_schema=DummySchema,
    )

    payload = request.model_dump()

    # The dumped dict must carry DummySchema's JSON schema under output_schema.
    assert "output_schema" in payload
    assert payload["output_schema"] == DummySchema.model_json_schema()
23+
24+
def test_invalid_website_html_structure():
    """
    website_html with no parseable HTML tags must be rejected, proving the
    HTML-content validator catches plain-text input.
    """
    # No tags at all, so BeautifulSoup.find() should return None.
    plain_text = "Just some random text"

    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
        LocalScraperRequest(
            user_prompt="Extract info about the company",
            website_html=plain_text,
        )
34+
35+
def test_invalid_user_prompt_non_alnum():
    """A user_prompt containing no alphanumeric characters must raise ValueError."""
    valid_html = "<html><body><div>Valid Content</div></body></html>"

    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        LocalScraperRequest(user_prompt="!!!", website_html=valid_html)
45+
46+
def test_get_localscraper_request_invalid_uuid():
    """GetLocalScraperRequest must reject a request_id that is not a valid UUID."""
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetLocalScraperRequest(request_id="not-a-valid-uuid")
54+
55+
def test_website_html_exceeds_maximum_size():
    """
    Structurally valid HTML whose UTF-8 size is one byte past the 2MB cap
    must be rejected with the size-limit ValueError.
    """
    limit_bytes = 2 * 1024 * 1024
    prefix = "<html><body>"
    suffix = "</body></html>"

    # Pad with 'a's so the full document encodes to exactly limit + 1 bytes.
    pad_length = (
        limit_bytes
        - len(prefix.encode("utf-8"))
        - len(suffix.encode("utf-8"))
        + 1
    )
    oversized_html = f"{prefix}{'a' * pad_length}{suffix}"

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)
71+
72+
def test_website_html_exactly_maximum_size():
    """
    HTML whose UTF-8 size is exactly 2MB must be accepted — the size check
    treats the boundary value as valid.
    """
    limit_bytes = 2 * 1024 * 1024
    prefix = "<html><body>"
    suffix = "</body></html>"

    # Pad with 'a's so the full document encodes to exactly limit bytes.
    pad_length = limit_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    boundary_html = prefix + "a" * pad_length + suffix

    request = LocalScraperRequest(
        user_prompt="Extract info with exact size HTML",
        website_html=boundary_html,
    )

    # Sanity check: the accepted content really is 2MB when UTF-8 encoded.
    assert len(request.website_html.encode("utf-8")) == limit_bytes
+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import pytest
2+
from scrapegraph_py.models.markdownify import MarkdownifyRequest, GetMarkdownifyRequest
3+
4+
def test_markdownify_request_invalid_url_scheme():
    """A website_url whose scheme is neither http:// nor https:// must be rejected."""
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url="ftp://example.com")
11+
12+
def test_markdownify_request_empty_url():
    """A whitespace-only website_url must raise the empty-URL ValueError."""
    with pytest.raises(ValueError, match="Website URL cannot be empty"):
        MarkdownifyRequest(website_url=" ")
18+
19+
def test_markdownify_request_valid_url():
    """A well-formed https URL passes validation and is stored unchanged."""
    url = "https://example.com"
    request = MarkdownifyRequest(website_url=url)
    assert request.website_url == url
27+
28+
def test_markdownify_request_untrimmed_url():
    """
    Leading whitespace is not stripped before validation, so a URL that would
    be valid once trimmed still fails the scheme check.
    """
    # Leading space means the value does not start directly with "https://".
    with pytest.raises(ValueError, match="Invalid URL"):
        MarkdownifyRequest(website_url=" https://example.com")
37+
38+
def test_get_markdownify_request_invalid_uuid():
    """A malformed request_id must raise the UUID-validation ValueError."""
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id="invalid_uuid")
44+
45+
def test_get_markdownify_request_valid_uuid():
    """A canonical UUID string passes validation and is stored unchanged."""
    uuid_str = "123e4567-e89b-12d3-a456-426614174000"
    request = GetMarkdownifyRequest(request_id=uuid_str)
    assert request.request_id == uuid_str
52+
53+
def test_get_markdownify_request_untrimmed_uuid():
    """
    Surrounding whitespace is not stripped from request_id, so an otherwise
    valid UUID wrapped in spaces must still be rejected.
    """
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetMarkdownifyRequest(request_id=" 123e4567-e89b-12d3-a456-426614174000 ")
+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import pytest
2+
from pydantic import BaseModel, ValidationError
3+
from scrapegraph_py.models.smartscraper import SmartScraperRequest, GetSmartScraperRequest
4+
5+
class DummySchema(BaseModel):
    """Minimal Pydantic model used to exercise output_schema JSON-schema conversion."""

    a: int = 1
9+
10+
def test_model_dump_with_output_schema_conversion():
    """
    model_dump() on a SmartScraperRequest carrying an output_schema must
    replace that entry with the model's JSON-schema dict.
    """
    request = SmartScraperRequest(
        user_prompt="Extract information about the company",
        website_url="https://scrapegraphai.com/",
        output_schema=DummySchema,
    )

    dumped = request.model_dump()

    # output_schema must be the JSON-schema representation, not the class.
    assert dumped.get("output_schema") == DummySchema.model_json_schema()
25+
26+
def test_model_dump_without_output_schema():
    """
    Without an output_schema, model_dump() must still emit the key with a
    value of None — schema conversion only applies when a schema is given.
    """
    request = SmartScraperRequest(
        user_prompt="Extract some meaningful data",
        website_url="https://scrapegraphai.com/",
    )

    dumped = request.model_dump()

    assert "output_schema" in dumped, "Output schema key should be present even if None"
    assert dumped["output_schema"] is None, "Output schema should be None when not provided"
42+
43+
def test_invalid_get_smartscraper_request_id():
    """GetSmartScraperRequest must reject a request_id that is not a valid UUID."""
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetSmartScraperRequest(request_id="invalid-uuid")
50+
51+
def test_invalid_url_in_smartscraper_request():
    """A website_url with a non-http(s) scheme must raise the URL ValueError."""
    with pytest.raises(ValueError, match="Invalid URL"):
        SmartScraperRequest(user_prompt="Extract data", website_url="ftp://invalid-url")
61+
62+
def test_invalid_user_prompt_empty_and_non_alnum():
    """
    The user_prompt validator must reject both a whitespace-only prompt and a
    prompt with no alphanumeric characters, each with its own error message.
    """
    url = "https://scrapegraphai.com/"

    # Whitespace-only prompt -> "empty" error.
    with pytest.raises(ValueError, match="User prompt cannot be empty"):
        SmartScraperRequest(user_prompt=" ", website_url=url)

    # Punctuation-only prompt -> "valid prompt" error.
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        SmartScraperRequest(user_prompt="!!!", website_url=url)

0 commit comments

Comments
 (0)