Skip to content

Commit ed2179f

Browse files
authored
docs(samples): Updated code samples for 2.1.0 release (#406)
* docs(samples): Added Image Quality Output to Document OCR Processor
* docs(samples): Added `field_mask` to `batch_process` samples
1 parent 39cdf84 commit ed2179f

6 files changed

+45
-8
lines changed

documentai/snippets/batch_process_documents_processor_version_sample.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
# input_mime_type = "application/pdf"
3030
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
3131
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
32+
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.
3233

3334

3435
def batch_process_documents_processor_version(
@@ -40,6 +41,7 @@ def batch_process_documents_processor_version(
4041
input_mime_type: str,
4142
gcs_output_bucket: str,
4243
gcs_output_uri_prefix: str,
44+
field_mask: str = None,
4345
timeout: int = 400,
4446
):
4547

@@ -67,7 +69,7 @@ def batch_process_documents_processor_version(
6769
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"
6870

6971
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
70-
gcs_uri=destination_uri
72+
gcs_uri=destination_uri, field_mask=field_mask
7173
)
7274

7375
# Where to write results

documentai/snippets/batch_process_documents_processor_version_sample_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
2929
input_mime_type = "application/pdf"
3030
gcs_output_uri_prefix = uuid4()
31+
field_mask = "text,pages.pageNumber"
3132
BUCKET_NAME = f"document-ai-python-{uuid4()}"
3233

3334

@@ -56,6 +57,7 @@ def test_batch_process_documents_processor_version(capsys, test_bucket):
5657
input_mime_type=input_mime_type,
5758
gcs_output_bucket=f"gs://{test_bucket}",
5859
gcs_output_uri_prefix=gcs_output_uri_prefix,
60+
field_mask=field_mask,
5961
)
6062
out, _ = capsys.readouterr()
6163

documentai/snippets/batch_process_documents_sample.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
# input_mime_type = "application/pdf"
2929
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
3030
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
31+
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.
3132

3233

3334
def batch_process_documents(
@@ -38,6 +39,7 @@ def batch_process_documents(
3839
input_mime_type: str,
3940
gcs_output_bucket: str,
4041
gcs_output_uri_prefix: str,
42+
field_mask: str = None,
4143
timeout: int = 400,
4244
):
4345

@@ -65,7 +67,7 @@ def batch_process_documents(
6567
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"
6668

6769
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
68-
gcs_uri=destination_uri
70+
gcs_uri=destination_uri, field_mask=field_mask
6971
)
7072

7173
# Where to write results

documentai/snippets/batch_process_documents_sample_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
2828
input_mime_type = "application/pdf"
2929
gcs_output_uri_prefix = uuid4()
30+
field_mask = "text,pages.pageNumber"
3031
BUCKET_NAME = f"document-ai-python-{uuid4()}"
3132

3233

@@ -54,6 +55,7 @@ def test_batch_process_documents(capsys, test_bucket):
5455
input_mime_type=input_mime_type,
5556
gcs_output_bucket=f"gs://{test_bucket}",
5657
gcs_output_uri_prefix=gcs_output_uri_prefix,
58+
field_mask=field_mask,
5759
)
5860
out, _ = capsys.readouterr()
5961

documentai/snippets/process_document_ocr_sample.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,22 @@
2424
# project_id = 'YOUR_PROJECT_ID'
2525
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
2626
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
27+
# processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
2728
# file_path = '/path/to/local/pdf'
2829
# mime_type = 'application/pdf' # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
2930

3031

3132
def process_document_ocr_sample(
32-
project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
33+
project_id: str,
34+
location: str,
35+
processor_id: str,
36+
processor_version: str,
37+
file_path: str,
38+
mime_type: str,
3339
) -> None:
3440
# Online processing request to Document AI
3541
document = process_document(
36-
project_id, location, processor_id, file_path, mime_type
42+
project_id, location, processor_id, processor_version, file_path, mime_type
3743
)
3844

3945
# For a full list of Document object attributes, please reference this page:
@@ -52,19 +58,30 @@ def process_document_ocr_sample(
5258
print_lines(page.lines, text)
5359
print_tokens(page.tokens, text)
5460

61+
# Currently supported in version pretrained-ocr-v1.1-2022-09-12
62+
if page.image_quality_scores:
63+
print_image_quality_scores(page.image_quality_scores)
64+
5565

5666
def process_document(
57-
project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
67+
project_id: str,
68+
location: str,
69+
processor_id: str,
70+
processor_version: str,
71+
file_path: str,
72+
mime_type: str,
5873
) -> documentai.Document:
5974
# You must set the api_endpoint if you use a location other than 'us', e.g.:
6075
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
6176

6277
client = documentai.DocumentProcessorServiceClient(client_options=opts)
6378

64-
# The full resource name of the processor, e.g.:
65-
# projects/project_id/locations/location/processor/processor_id
79+
# The full resource name of the processor version
80+
# e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
6681
# You must create processors before running sample code.
67-
name = client.processor_path(project_id, location, processor_id)
82+
name = client.processor_version_path(
83+
project_id, location, processor_id, processor_version
84+
)
6885

6986
# Read the file into memory
7087
with open(file_path, "rb") as image:
@@ -133,6 +150,16 @@ def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) ->
133150
print(f" Last token break type: {repr(last_token_break_type)}")
134151

135152

153+
def print_image_quality_scores(
154+
image_quality_scores: documentai.Document.Page.ImageQualityScores,
155+
) -> None:
156+
print(f" Quality score: {image_quality_scores.quality_score:.1%}")
157+
print(" Detected defects:")
158+
159+
for detected_defect in image_quality_scores.detected_defects:
160+
print(f" {detected_defect.type_}: {detected_defect.confidence:.1%}")
161+
162+
136163
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
137164
"""
138165
Document AI identifies text in different parts of the document by their

documentai/snippets/process_document_ocr_sample_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
location = "us"
2121
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
2222
processor_id = "52a38e080c1a7296"
23+
processor_version = "rc"
2324
file_path = "resources/handwritten_form.pdf"
2425
mime_type = "application/pdf"
2526

@@ -29,6 +30,7 @@ def test_process_documents(capsys):
2930
project_id=project_id,
3031
location=location,
3132
processor_id=processor_id,
33+
processor_version=processor_version,
3234
file_path=file_path,
3335
mime_type=mime_type,
3436
)

0 commit comments

Comments
 (0)