
Commit bfe4ffc

holtskinner, galz10, and parthea authored
refactor: Updates to Document AI Python Samples (#323)
* Updated OCR Quickstart Sample
  - Added Types to Request Creation
  - Added ClientOptions object for type safety
  - Simplified output code to print full text instead of paragraphs
  - Updated Link to Document Object v1 specification
  - Added mime_type as variable
* Updates to process_document_sample
  - Same Updates as Quickstart Sample
  - Moved Imports to top of quickstart file
* Updated Batch Process Example
  - Added typing
  - Use BatchProcessMetadata instead of Operation ID to get output files from GCS
  - Added MimeType specification
  - Added Alternatives for Directory Processing & Callbacks
  - Minor Changes to process_document/quickstart for unified style with batch
* Updates to OCR Response Handling Sample
  - Separated Online Processing Request into function
  - Added explicit typing for documentai objects
  - Converted `.format()` to f-string
  - Simplified `layout_to_text()`
* Updated Form Processing Sample
  - Updated to `v1` API
  - Separated processing request into function
  - Added explicit typing for Document AI Types
  - Separated `print_table_rows()` into function for modularity
  - Fixed Spelling error "Collumns"
* Updated Specialized Processor Sample
  - Added Extraction of Properties (Nested Entities) and Normalized Values
* Updates to Splitter/Classifier Sample
  - Updated to `v1` API
  - Changed Page Number Printout
    (Splitter Classifiers now output all page numbers within a subdocument, instead of just the first and last)
* Updated Test for process_document_sample
  - Added mime_type
* Updated Document Quality Processor Sample
  - Updated to `v1` API
  - Moved API Call to separate function
  - Updated `.format()` to f-strings
  - Added Handling for Multiple Page Numbers per entity
  - Reused `page_refs_to_string()` from splitter/classifier example
  - Added `mime_type` as parameter
* Updated Batch Processing Directory sample variable from CR comments
* Added Sample Input PDF Files & Output JSON Files
* Fixed Spelling Error in Invoice Parser Output filenames
* Addressed Code Review Comments
  - Changed Copyright Year back to 2020
  - Changed "property" variable to "prop" to avoid naming conflicts
* Updated Client Library Requirements versions
* Addressed Unit Test Failures
* Re-added google-api-core to requirements.txt
* Update samples/snippets/process_document_form_sample.py
  Co-authored-by: Anthonios Partheniou <[email protected]>
* Update samples/snippets/requirements.txt
  Co-authored-by: Anthonios Partheniou <[email protected]>
* Fixed "entirity" spelling error

Co-authored-by: Gal Zahavi <[email protected]>
Co-authored-by: Anthonios Partheniou <[email protected]>
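The client-setup change applied across these samples replaces the eu-only endpoint dictionary with a typed ClientOptions object. A minimal sketch of the pattern, with the location value as a placeholder:

    from google.api_core.client_options import ClientOptions
    from google.cloud import documentai_v1 as documentai

    location = "us"  # or "eu"; regional endpoints all follow the same naming scheme

    # One typed options object replaces the old eu-only special case
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)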
1 parent 35b59e6 commit bfe4ffc

File tree

58 files changed: +462,989 additions, -392 deletions


document_ai/snippets/batch_process_documents_sample.py

Lines changed: 93 additions & 75 deletions
@@ -16,118 +16,136 @@
 # [START documentai_batch_process_document]
 import re
 
+from google.api_core.client_options import ClientOptions
 from google.cloud import documentai_v1 as documentai
 from google.cloud import storage
 
 # TODO(developer): Uncomment these variables before running the sample.
-# project_id= 'YOUR_PROJECT_ID'
-# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
 # processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
-# gcs_input_uri = "YOUR_INPUT_URI"
-# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
-# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
+# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
+# input_mime_type = "application/pdf"
+# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
+# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
 
 
 def batch_process_documents(
-    project_id,
-    location,
-    processor_id,
-    gcs_input_uri,
-    gcs_output_uri,
-    gcs_output_uri_prefix,
+    project_id: str,
+    location: str,
+    processor_id: str,
+    gcs_input_uri: str,
+    input_mime_type: str,
+    gcs_output_bucket: str,
+    gcs_output_uri_prefix: str,
     timeout: int = 300,
 ):
 
     # You must set the api_endpoint if you use a location other than 'us', e.g.:
-    opts = {}
-    if location == "eu":
-        opts = {"api_endpoint": "eu-documentai.googleapis.com"}
+    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
 
     client = documentai.DocumentProcessorServiceClient(client_options=opts)
 
-    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"
-
-    gcs_documents = documentai.GcsDocuments(
-        documents=[{"gcs_uri": gcs_input_uri, "mime_type": "application/pdf"}]
+    gcs_document = documentai.GcsDocument(
+        gcs_uri=gcs_input_uri, mime_type=input_mime_type
     )
 
-    # 'mime_type' can be 'application/pdf', 'image/tiff',
-    # and 'image/gif', or 'application/json'
+    # Load GCS Input URI into a List of document files
+    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
     input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
 
-    # Where to write results
-    output_config = documentai.DocumentOutputConfig(
-        gcs_output_config={"gcs_uri": destination_uri}
+    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
+    #
+    # gcs_input_uri = "gs://bucket/directory/"
+    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
+    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
+    #
+
+    # Cloud Storage URI for the Output Directory
+    destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"
+
+    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
+        gcs_uri=destination_uri
     )
 
-    # Location can be 'us' or 'eu'
-    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
-    request = documentai.types.document_processor_service.BatchProcessRequest(
+    # Where to write results
+    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
+
+    # The full resource name of the processor, e.g.:
+    # projects/project_id/locations/location/processor/processor_id
+    # You must create new processors in the Cloud Console first
+    name = client.processor_path(project_id, location, processor_id)
+
+    request = documentai.BatchProcessRequest(
         name=name,
         input_documents=input_config,
         document_output_config=output_config,
     )
 
+    # BatchProcess returns a Long Running Operation (LRO)
     operation = client.batch_process_documents(request)
 
-    # Wait for the operation to finish
+    # Continually polls the operation until it is complete.
+    # This could take some time for larger files
+    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
+    print(f"Waiting for operation {operation.operation.name} to complete...")
     operation.result(timeout=timeout)
 
-    # Results are written to GCS. Use a regex to find
-    # output files
-    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
-    output_bucket = match.group(1)
-    prefix = match.group(2)
+    # NOTE: Can also use callbacks for asynchronous processing
+    #
+    # def my_callback(future):
+    #   result = future.result()
+    #
+    # operation.add_done_callback(my_callback)
 
-    storage_client = storage.Client()
-    bucket = storage_client.get_bucket(output_bucket)
-    blob_list = list(bucket.list_blobs(prefix=prefix))
-    print("Output files:")
+    # Once the operation is complete,
+    # get output document information from operation metadata
+    metadata = documentai.BatchProcessMetadata(operation.metadata)
 
-    for i, blob in enumerate(blob_list):
-        # If JSON file, download the contents of this blob as a bytes object.
-        if ".json" in blob.name:
-            blob_as_bytes = blob.download_as_bytes()
+    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
+        raise ValueError(f"Batch Process Failed: {metadata.state_message}")
 
-            document = documentai.types.Document.from_json(blob_as_bytes)
-            print(f"Fetched file {i + 1}")
+    storage_client = storage.Client()
+
+    print("Output files:")
+    # One process per Input Document
+    for process in metadata.individual_process_statuses:
+        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
+        # The Cloud Storage API requires the bucket name and URI prefix separately
+        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
+        if not matches:
+            print(
+                "Could not parse output GCS destination:",
+                process.output_gcs_destination,
+            )
+            continue
+
+        output_bucket, output_prefix = matches.groups()
+
+        # Get List of Document Objects from the Output Bucket
+        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)
+
+        # Document AI may output multiple JSON files per source file
+        for blob in output_blobs:
+            # Document AI should only output JSON files to GCS
+            if ".json" not in blob.name:
+                print(
+                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
+                )
+                continue
+
+            # Download JSON File as bytes object and convert to Document Object
+            print(f"Fetching {blob.name}")
+            document = documentai.Document.from_json(
+                blob.download_as_bytes(), ignore_unknown_fields=True
+            )
 
             # For a full list of Document object attributes, please reference this page:
-            # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document
+            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document
 
             # Read the text recognition output from the processor
-            for page in document.pages:
-                for form_field in page.form_fields:
-                    field_name = get_text(form_field.field_name, document)
-                    field_value = get_text(form_field.field_value, document)
-                    print("Extracted key value pair:")
-                    print(f"\t{field_name}, {field_value}")
-                for paragraph in page.paragraphs:
-                    paragraph_text = get_text(paragraph.layout, document)
-                    print(f"Paragraph text:\n{paragraph_text}")
-        else:
-            print(f"Skipping non-supported file type {blob.name}")
-
-
-# Extract shards from the text field
-def get_text(doc_element: dict, document: dict):
-    """
-    Document AI identifies form fields by their offsets
-    in document text. This function converts offsets
-    to text snippets.
-    """
-    response = ""
-    # If a text segment spans several lines, it will
-    # be stored in different text segments.
-    for segment in doc_element.text_anchor.text_segments:
-        start_index = (
-            int(segment.start_index)
-            if segment in doc_element.text_anchor.text_segments
-            else 0
-        )
-        end_index = int(segment.end_index)
-        response += document.text[start_index:end_index]
-    return response
+            print("The document contains the following text:")
+            print(document.text)
 
 
 # [END documentai_batch_process_document]
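For reference, a minimal sketch of calling the refactored sample once the TODO variables are filled in; every project, processor, bucket, and path below is a placeholder, not a real resource:

    batch_process_documents(
        project_id="example-project",      # placeholder
        location="us",
        processor_id="0123456789abcdef",   # placeholder; create in Cloud Console
        gcs_input_uri="gs://example-bucket/input/invoice.pdf",
        input_mime_type="application/pdf",
        gcs_output_bucket="gs://example-bucket",
        gcs_output_uri_prefix="output",
        timeout=450,
    )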

document_ai/snippets/batch_process_documents_sample_bad_input_test.py

Lines changed: 5 additions & 2 deletions
@@ -22,8 +22,10 @@
 project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
 processor_id = "90484cfdedb024f6"
 gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
+input_mime_type = "application/pdf"
 # following bucket contains .csv file which will cause the sample to fail.
 gcs_output_full_uri_with_wrong_type = "gs://documentai-beta-samples"
+gcs_output_uri_prefix = "test"
 BUCKET_NAME = f"document-ai-python-{uuid4()}"
 
 
@@ -34,8 +36,9 @@ def test_batch_process_documents_with_bad_input(capsys):
         location=location,
         processor_id=processor_id,
         gcs_input_uri=gcs_input_uri,
-        gcs_output_uri=gcs_output_full_uri_with_wrong_type,
-        gcs_output_uri_prefix="test",
+        input_mime_type=input_mime_type,
+        gcs_output_bucket=gcs_output_full_uri_with_wrong_type,
+        gcs_output_uri_prefix=gcs_output_uri_prefix,
         timeout=450,
     )
     out, _ = capsys.readouterr()

document_ai/snippets/batch_process_documents_sample_test.py

Lines changed: 6 additions & 4 deletions
@@ -25,6 +25,7 @@
 project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
 processor_id = "90484cfdedb024f6"
 gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
+input_mime_type = "application/pdf"
 gcs_output_uri_prefix = uuid4()
 BUCKET_NAME = f"document-ai-python-{uuid4()}"
 
@@ -50,11 +51,12 @@ def test_batch_process_documents(capsys, test_bucket):
         location=location,
         processor_id=processor_id,
         gcs_input_uri=gcs_input_uri,
-        gcs_output_uri=f"gs://{test_bucket}",
+        input_mime_type=input_mime_type,
+        gcs_output_bucket=f"gs://{test_bucket}",
         gcs_output_uri_prefix=gcs_output_uri_prefix,
     )
     out, _ = capsys.readouterr()
 
-    assert "Extracted" in out
-    assert "Paragraph" in out
-    assert "Invoice" in out
+    assert "operation" in out
+    assert "Fetching" in out
+    assert "text:" in out
