import pytest
from pydantic import BaseModel
from scrapegraph_py.models.localscraper import LocalScraperRequest, GetLocalScraperRequest

# Create a dummy output schema to test the conversion in model_dump.
class DummySchema(BaseModel):
    test_field: str

def test_output_schema_conversion():
    """
    Test that when an output_schema is provided in a LocalScraperRequest,
    model_dump returns a dictionary where the output_schema key holds the
    JSON schema of the provided Pydantic model.
    """
    user_prompt = "Extract company details"
    website_html = "<html><body><div>Content</div></body></html>"
    # Create a LocalScraperRequest with a dummy output_schema.
    request = LocalScraperRequest(
        user_prompt=user_prompt,
        website_html=website_html,
        output_schema=DummySchema,
    )
    dumped = request.model_dump()
    # Verify that output_schema is converted properly in the dumped dictionary.
    assert "output_schema" in dumped
    assert dumped["output_schema"] == DummySchema.model_json_schema()

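# A minimal companion check, assuming output_schema is optional on
# LocalScraperRequest and defaults to None; model_dump should then pass
# None through without attempting a schema conversion.
def test_output_schema_none_passthrough():
    """
    Test that when no output_schema is provided, model_dump leaves the
    output_schema value as None rather than converting it.
    """
    request = LocalScraperRequest(
        user_prompt="Extract company details",
        website_html="<html><body><div>Content</div></body></html>",
    )
    dumped = request.model_dump()
    assert dumped.get("output_schema") is None
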
def test_invalid_website_html_structure():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html
    provided has no parseable HTML tags. This ensures the HTML content
    validation catches non-HTML input.
    """
    # This string has no HTML tags, so BeautifulSoup.find() should return None.
    invalid_html = "Just some random text"
    with pytest.raises(ValueError, match="Invalid HTML - no parseable content found"):
        LocalScraperRequest(user_prompt="Extract info about the company", website_html=invalid_html)

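# A minimal sketch, assuming the HTML validator also rejects an empty string;
# only ValueError is asserted because the exact message is not confirmed by
# the tests above.
def test_empty_website_html():
    """
    Test that LocalScraperRequest raises a ValueError when website_html is an
    empty string, since there is no parseable content at all.
    """
    with pytest.raises(ValueError):
        LocalScraperRequest(user_prompt="Extract info", website_html="")
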
def test_invalid_user_prompt_non_alnum():
    """
    Test that LocalScraperRequest raises a ValueError when the user_prompt
    does not contain any alphanumeric characters.
    """
    with pytest.raises(ValueError, match="User prompt must contain a valid prompt"):
        LocalScraperRequest(
            user_prompt="!!!",
            website_html="<html><body><div>Valid Content</div></body></html>",
        )

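# A minimal sketch, assuming whitespace-only prompts fail the same
# no-alphanumeric-characters check as punctuation-only prompts; the exact
# error message is not asserted since it is not confirmed here.
def test_invalid_user_prompt_whitespace_only():
    """
    Test that LocalScraperRequest raises a ValueError when the user_prompt
    consists solely of whitespace characters.
    """
    with pytest.raises(ValueError):
        LocalScraperRequest(
            user_prompt="   ",
            website_html="<html><body><div>Valid Content</div></body></html>",
        )
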
def test_get_localscraper_request_invalid_uuid():
    """
    Test that GetLocalScraperRequest raises a ValueError when an invalid UUID
    is provided. This ensures that the model correctly validates the
    request_id as a proper UUID.
    """
    invalid_uuid = "not-a-valid-uuid"
    with pytest.raises(ValueError, match="request_id must be a valid UUID"):
        GetLocalScraperRequest(request_id=invalid_uuid)

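# A minimal companion check, assuming GetLocalScraperRequest accepts a
# well-formed UUID string and stores it on request_id unchanged.
def test_get_localscraper_request_valid_uuid():
    """
    Test that GetLocalScraperRequest accepts a properly formatted UUID,
    confirming the validator rejects only malformed identifiers.
    """
    valid_uuid = "123e4567-e89b-12d3-a456-426614174000"
    request = GetLocalScraperRequest(request_id=valid_uuid)
    assert request.request_id == valid_uuid
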
def test_website_html_exceeds_maximum_size():
    """
    Test that LocalScraperRequest raises a ValueError when the website_html
    content exceeds the maximum allowed size of 2MB. The generated HTML is
    valid but too large.
    """
    max_size_bytes = 2 * 1024 * 1024
    # Build a valid HTML string that is one byte over the 2MB limit once
    # encoded in UTF-8 (the filler is ASCII, so one character is one byte).
    base_html_prefix = "<html><body>"
    base_html_suffix = "</body></html>"
    repeated_char_length = (
        max_size_bytes
        - len(base_html_prefix.encode("utf-8"))
        - len(base_html_suffix.encode("utf-8"))
        + 1
    )
    oversized_content = "a" * repeated_char_length
    oversized_html = f"{base_html_prefix}{oversized_content}{base_html_suffix}"

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=oversized_html)

def test_website_html_exactly_maximum_size():
    """
    Test that LocalScraperRequest accepts website_html content exactly 2MB in
    size. This ensures that the size validation correctly allows content on
    the boundary.
    """
    user_prompt = "Extract info with exact size HTML"
    prefix = "<html><body>"
    suffix = "</body></html>"
    # Calculate the filler length needed to reach exactly 2MB when combined
    # with the prefix and suffix.
    max_size_bytes = 2 * 1024 * 1024
    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    valid_content = "a" * content_length
    html = prefix + valid_content + suffix

    # Creating the request should succeed at the boundary.
    request = LocalScraperRequest(user_prompt=user_prompt, website_html=html)

    # Verify that the HTML content is exactly 2MB in size when encoded in UTF-8.
    assert len(request.website_html.encode("utf-8")) == max_size_bytes

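# A hedged sketch, assuming the 2MB limit is enforced on UTF-8 byte length
# rather than character count (as the byte-level assertion above suggests):
# swapping one ASCII filler character for the two-byte character "é" keeps
# the character count at the limit but pushes the byte count one over.
def test_website_html_size_counts_bytes_not_characters():
    """
    Test that the size validation measures UTF-8 bytes: HTML that is exactly
    2MB in characters but 2MB + 1 in bytes should raise a ValueError.
    """
    prefix = "<html><body>"
    suffix = "</body></html>"
    max_size_bytes = 2 * 1024 * 1024
    content_length = max_size_bytes - len(prefix.encode("utf-8")) - len(suffix.encode("utf-8"))
    # Same character count as the exactly-2MB test above, one extra byte
    # once encoded.
    content = "é" + "a" * (content_length - 1)
    html = prefix + content + suffix

    with pytest.raises(ValueError, match="Website HTML content exceeds maximum size of 2MB"):
        LocalScraperRequest(user_prompt="Extract info", website_html=html)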