Skip to content

Commit 478dcc1

Browse files
authored
docs(samples): Added Processor Version Samples (#382)
* docs(samples): Added Processor Version Samples To Be Published in documentation: https://cloud.google.com/document-ai/docs/manage-processor - `get_processor_version` - `list_processor_versions` - `set_default_processor_version` * docs(samples): Adjusted Bad Batch Input test to * docs(samples): Added Deploy/Undeploy Samples * docs(samples): Added process & batchProcess examples for processorVersions - Removed Processor Version from basic process and batchProcess examples - Removed Note about must create processors in the Cloud Console - Added note that processor must be created before running sample where missing * docs(samples): Adjusted Enable/Disable Processor Test to avoid Race Conditions * docs(samples): Added Delete Processor Version Sample - Also Fixed Spelling error in Undeploy Comments * docs(samples): Updated non-idempotent unit tests to use mocks - Also replaced test ocr processor id after making a breaking change to the project - Added `field_mask` to process_documents tests
1 parent b9adf36 commit 478dcc1

30 files changed

+935
-36
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_process_documents_processor_version]
17+
import re
18+
19+
from google.api_core.client_options import ClientOptions
20+
from google.cloud import documentai, storage
21+
22+
# TODO(developer): Uncomment these variables before running the sample.
23+
# project_id = 'YOUR_PROJECT_ID'
24+
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
25+
# processor_id = 'YOUR_PROCESSOR_ID' # Example: aeb8cea219b7c272
26+
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Example: pretrained-ocr-v1.0-2020-09-23
27+
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
28+
# input_mime_type = "application/pdf"
29+
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
30+
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
31+
32+
33+
def batch_process_documents_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_uri: str,
    input_mime_type: str,
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    timeout: int = 300,
):
    """Batch-process a Cloud Storage document with a specific processor version.

    Submits a BatchProcess request, waits for the long-running operation,
    then downloads every JSON result written to Cloud Storage and prints the
    text of each resulting document.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, e.g. 'us' or 'eu'.
        processor_id: ID of the processor (create it before running the sample).
        processor_version_id: ID of the processor version to run.
        gcs_input_uri: Input file, formatted as gs://bucket/directory/file.pdf.
        input_mime_type: MIME type of the input file, e.g. "application/pdf".
        gcs_output_bucket: Output bucket, formatted as gs://bucket.
        gcs_output_uri_prefix: Output directory inside the bucket; leading and
            trailing slashes are optional.
        timeout: Seconds to wait for the operation to complete.

    Raises:
        ValueError: If the operation finishes in any state other than SUCCEEDED.
        Exceptions raised by ``operation.result`` (for example when the
        timeout elapses) are not caught here and propagate to the caller.
    """
    # The client is always pointed at the regional endpoint for `location`
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type=input_mime_type
    )

    # Load GCS Input URI into a List of document files
    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)

    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
    #
    # gcs_input_uri = "gs://bucket/directory/"
    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
    #

    # Cloud Storage URI for the Output Directory.
    # Normalize the slashes so a prefix written as "directory/subdirectory/"
    # does not produce "gs://bucket/directory/subdirectory//".
    # str() keeps callers that pass a non-string prefix (e.g. a UUID) working.
    bucket_uri = gcs_output_bucket.rstrip("/")
    output_prefix_clean = str(gcs_output_uri_prefix).strip("/")
    if output_prefix_clean:
        destination_uri = f"{bucket_uri}/{output_prefix_clean}/"
    else:
        destination_uri = f"{bucket_uri}/"

    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=destination_uri
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    # The full resource name of the processor version
    # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Block until the operation is complete.
    # This could take some time for larger files
    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
    print(f"Waiting for operation {operation.operation.name} to complete...")
    operation.result(timeout=timeout)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #   result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    # The Cloud Storage API requires the bucket name and URI prefix separately;
    # compile the splitter once instead of on every loop iteration.
    gcs_uri_pattern = re.compile(r"gs://(.*?)/(.*)")

    print("Output files:")
    # One process per Input Document
    for process in metadata.individual_process_statuses:
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        matches = gcs_uri_pattern.match(process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS.
            # Check the suffix rather than a substring so that names which
            # merely contain ".json" (e.g. "x.json.bak") are skipped too.
            if not blob.name.endswith(".json"):
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document

            # Read the text recognition output from the processor
            print("The document contains the following text:")
            print(document.text)
152+
153+
# [END documentai_batch_process_documents_processor_version]
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
from uuid import uuid4
18+
19+
from google.cloud import storage
20+
from google.cloud.exceptions import NotFound
21+
import pytest
22+
from samples.snippets import batch_process_documents_processor_version_sample
23+
24+
# Fixed inputs for the live batch-process test
location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
processor_version_id = "pretrained-form-parser-v1.0-2020-09-23"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
# The sample annotates this parameter as `str`, so convert the UUID explicitly
gcs_output_uri_prefix = str(uuid4())
# Random suffix keeps the bucket name unique per test run
BUCKET_NAME = f"document-ai-python-{uuid4()}"
32+
33+
34+
@pytest.fixture(scope="module")
def test_bucket():
    """Create the test bucket, yield its name, then remove it and its contents."""
    gcs = storage.Client()
    created_bucket = gcs.create_bucket(BUCKET_NAME)
    yield created_bucket.name

    # Teardown: a bucket must be emptied before it can be deleted
    try:
        leftovers = list(created_bucket.list_blobs())
        for leftover in leftovers:
            leftover.delete()
        created_bucket.delete()
    except NotFound:
        print("Bucket already deleted.")
47+
48+
49+
def test_batch_process_documents_processor_version(capsys, test_bucket):
    """Run the batch sample against the test bucket and check its printed output."""
    sample = batch_process_documents_processor_version_sample

    sample.batch_process_documents_processor_version(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
        gcs_input_uri=gcs_input_uri,
        input_mime_type=input_mime_type,
        gcs_output_bucket=f"gs://{test_bucket}",
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )

    captured = capsys.readouterr()
    printed = captured.out

    # Each marker corresponds to one print statement in the sample
    for marker in ("operation", "Fetching", "text:"):
        assert marker in printed

documentai/snippets/batch_process_documents_sample.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
# project_id = 'YOUR_PROJECT_ID'
2424
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
2525
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
26-
# processor_version = "pretrained" # Optional. Processor version to use
2726
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
2827
# input_mime_type = "application/pdf"
2928
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
@@ -73,17 +72,8 @@ def batch_process_documents(
7372

7473
# The full resource name of the processor, e.g.:
7574
# projects/project_id/locations/location/processors/processor_id
76-
# You must create new processors in the Cloud Console first
7775
name = client.processor_path(project_id, location, processor_id)
7876

79-
# NOTE: Alternatively, specify the processor_version to specify a particular version of the processor to use
80-
# projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processorVersion}
81-
#
82-
# name = client.processor_version_path(
83-
# project_id, location, processor_id, processor_version
84-
# )
85-
#
86-
8777
request = documentai.BatchProcessRequest(
8878
name=name,
8979
input_documents=input_config,

documentai/snippets/batch_process_documents_sample_bad_input_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,4 @@ def test_batch_process_documents_with_bad_input(capsys):
4444
out, _ = capsys.readouterr()
4545
assert "Failed" in out
4646
except Exception as e:
47-
assert "Internal error" in e.message
47+
assert "Failed" in e.message
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
# [START documentai_delete_processor_version]
17+
18+
from google.api_core.client_options import ClientOptions
19+
from google.api_core.exceptions import FailedPrecondition, InvalidArgument
20+
from google.cloud import documentai
21+
22+
# TODO(developer): Uncomment these variables before running the sample.
23+
# project_id = 'YOUR_PROJECT_ID'
24+
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
25+
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
26+
# processor_version_id = 'YOUR_PROCESSOR_VERSION_ID'
27+
28+
29+
def delete_processor_version_sample(
    project_id: str, location: str, processor_id: str, processor_version_id: str
):
    """Delete a processor version and wait for the operation to finish.

    Prints the name of the delete operation. If the request is rejected with
    FailedPrecondition or InvalidArgument, the error message is printed
    instead of being raised.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, e.g. 'us' or 'eu'.
        processor_id: ID of the processor (create it before running the sample).
        processor_version_id: ID of the processor version to delete.
    """
    # The client is pointed at the regional endpoint for `location`
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # The full resource name of the processor version
    # e.g.: projects/project_id/locations/location/processors/processor_id/processorVersions/processor_version_id
    version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    # Delete request will fail if the processor version doesn't exist,
    # or if a request is made on a pretrained processor version
    # or the default processor version
    try:
        # Make DeleteProcessorVersion request
        delete_operation = client.delete_processor_version(name=version_name)
        # Print operation details
        print(delete_operation.operation.name)
        # Wait for operation to complete
        delete_operation.result()
    except (FailedPrecondition, InvalidArgument) as error:
        print(error.message)
56+
57+
58+
# [END documentai_delete_processor_version]
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright 2022 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
18+
import mock
19+
from samples.snippets import delete_processor_version_sample
20+
21+
# Project comes from the environment
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
location = "us"
# Fixed processor and version IDs passed to the sample
processor_id = "aaaaaaaaa"
processor_version_id = "xxxxxxxxxx"
25+
26+
27+
@mock.patch(
    "google.cloud.documentai.DocumentProcessorServiceClient.delete_processor_version"
)
@mock.patch("google.api_core.operation.Operation")
def test_delete_processor_version(
    operation_mock, delete_processor_version_mock, capsys
):
    """Check that the sample issues exactly one delete call, using mocks."""
    # The patched client method hands back the mocked operation
    delete_processor_version_mock.return_value = operation_mock

    sample = delete_processor_version_sample
    sample.delete_processor_version_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
    )

    delete_processor_version_mock.assert_called_once()

    captured = capsys.readouterr()

    assert "operation" in captured.out

0 commit comments

Comments
 (0)