|
1 | 1 | import io
|
| 2 | +import json |
2 | 3 | import logging
|
3 | 4 | import math
|
4 | 5 | import pathlib
|
5 |
| -from unittest.mock import AsyncMock, MagicMock |
| 6 | +from unittest.mock import AsyncMock, MagicMock, Mock |
6 | 7 |
|
7 | 8 | import pymupdf
|
8 | 9 | import pytest
|
|
17 | 18 | DocumentTable,
|
18 | 19 | DocumentTableCell,
|
19 | 20 | )
|
| 21 | +from azure.core.exceptions import HttpResponseError |
20 | 22 | from PIL import Image, ImageChops
|
21 | 23 |
|
22 | 24 | from prepdocslib.mediadescriber import ContentUnderstandingDescriber
|
@@ -308,3 +310,63 @@ async def mock_describe_image(self, image_bytes):
|
308 | 310 | pages[0].text
|
309 | 311 | == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure><figcaption>Figure 1<br>Pie chart</figcaption></figure>\n\n\nThis is text after the figure that's not part of it."
|
310 | 312 | )
|
| 313 | + |
| 314 | + |
@pytest.mark.asyncio
async def test_parse_unsupportedformat(monkeypatch, caplog):
    """Check parsing when the media-description analysis request is rejected.

    ``DocumentIntelligenceClient.begin_analyze_document`` is patched so that a
    call made with ``features == ["ocrHighResolution"]`` raises an
    ``HttpResponseError`` subclass whose response body carries the error code
    ``"InvalidArgument"``; any other call returns a mock poller that yields a
    one-page ``AnalyzeResult``. The test asserts that the "does not support
    media description" error is logged and that the single page of plain
    content is still produced.
    """
    mock_poller = MagicMock()

    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
        """Fail the high-resolution OCR request; hand back the mock poller otherwise."""
        if kwargs.get("features") == ["ocrHighResolution"]:

            class FakeErrorOne:
                def __init__(self):
                    self.error = Mock(message="A fake error", code="FakeErrorOne")

            class FakeHttpResponse(HttpResponseError):
                def __init__(self, response, error, *args, **kwargs):
                    # NOTE(review): azure-core's HttpResponseError.__init__ appears to
                    # re-derive `error` from the response body, so this assignment is
                    # presumably overwritten by the parsed JSON below - confirm.
                    self.error = error
                    # Do not pass `self` as the positional `message` argument, and
                    # unpack *args before keyword arguments so the binding is explicit.
                    super().__init__(*args, response=response, **kwargs)

            message = {
                "error": {
                    "code": "InvalidArgument",
                    "message": "A fake error",
                }
            }
            response = Mock(status_code=500, headers={})
            # text() returns a str, matching what a real HTTP response object exposes.
            response.text = lambda encoding=None: json.dumps(message)
            response.headers["content-type"] = "application/json"
            response.content_type = "application/json"
            raise FakeHttpResponse(response, FakeErrorOne())
        else:
            return mock_poller

    async def mock_poller_result():
        """Return a minimal one-page analysis result with no tables or figures."""
        return AnalyzeResult(
            content="Page content",
            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
    monkeypatch.setattr(mock_poller, "result", mock_poller_result)

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
        use_content_understanding=True,
        content_understanding_endpoint="https://example.com",
    )
    content = io.BytesIO(b"pdf content bytes")
    content.name = "test.docx"
    with caplog.at_level(logging.ERROR):
        pages = [page async for page in parser.parse(content)]
        assert "This document type does not support media description." in caplog.text

    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "Page content"
0 commit comments