Skip to content

Commit 0db8d69

Browse files
aribrayleahecole
authored and committed
docs(samples): new Doc AI samples for v1beta3 (#44)
* batch_process_sample. changing from async to synchronous * add quick start and process_document samples and tests * add test and sample for batch_process * add test and sample for batch_process * resolve formatting * use os.environ * remove os.path.join * move tests * descriptive variable * specific Exception, formatting * parse all pages in process_document * add more helpful comments * remove unused imports * better exception handling * rename test files * ran linter, removed nested function in batch predict * refactor tests * format imports * format imports * format imports * serialize as Document object * extract get_text helper function * fix file path * delete test bucket * Update samples/snippets/batch_process_documents_sample_v1beta3_test.py Co-authored-by: Leah E. Cole <[email protected]> * Update samples/snippets/batch_process_documents_sample_v1beta3_test.py Co-authored-by: Leah E. Cole <[email protected]> * add more specific assertion in batch_process * add more specific assertion in process_document and quickstart * fix output_uri name * Apply suggestions from code review to resolve exception Co-authored-by: Leah E. Cole <[email protected]> * resolve exception * lint Co-authored-by: Leah E. Cole <[email protected]>
1 parent c5bc2e5 commit 0db8d69

9 files changed

+439
-15
lines changed

documentai/snippets/__init__.py

Whitespace-only changes.
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_process_document]
17+
import re
18+
19+
from google.cloud import documentai_v1beta3 as documentai
20+
from google.cloud import storage
21+
22+
# TODO(developer): Uncomment these variables before running the sample.
23+
# project_id = 'YOUR_PROJECT_ID'
24+
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
25+
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
26+
# gcs_input_uri = "YOUR_INPUT_URI"
27+
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
28+
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
29+
30+
31+
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Run a Document AI batch job and print the parsed results.

    Submits ``gcs_input_uri`` to the given processor, waits for the
    operation to finish, then downloads every blob found under
    ``{gcs_output_uri}/{gcs_output_uri_prefix}/`` and prints the form
    fields and paragraph text of each resulting document.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; can be 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        gcs_input_uri: Cloud Storage URI of the PDF to process.
        gcs_output_uri: Cloud Storage URI of the output bucket.
        gcs_output_uri_prefix: Prefix under which results are written.

    Raises:
        ValueError: If the destination does not look like
            ``gs://bucket/prefix``.
    """
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Results are written to GCS. Split the destination into bucket and
    # object prefix up front so a malformed URI fails here, before the
    # batch job is started, instead of afterwards with an AttributeError.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Expected a destination of the form gs://bucket/prefix, got {destination_uri!r}"
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentProcessorServiceClient()

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Walk the paragraphs of the current page; looping over
            # document.pages here would print page-level text instead.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
98+
99+
100+
# Extract shards from the text field
101+
def get_text(doc_element, document) -> str:
    """Return the text that a document element refers to.

    Document AI identifies form fields by their offsets in the document
    text. This function converts those offsets to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``, for
            example a paragraph layout or a form field name/value.
        document: Object whose ``text`` attribute holds the full text
            that the segment offsets index into.

    Returns:
        The concatenation of ``document.text[start_index:end_index]`` for
        every text segment, in order; an empty string if there are none.
    """
    # If a text segment spans several lines, it will be stored in
    # different text segments. The start offset must be read from each
    # segment itself; a missing or zero value means the start of the text.
    return "".join(
        document.text[int(segment.start_index or 0):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
119+
120+
121+
# [END documentai_batch_process_document]
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
from uuid import uuid4
18+
19+
from google.cloud import storage
20+
from google.cloud.exceptions import NotFound
21+
22+
import pytest
23+
24+
from samples.snippets import batch_process_documents_sample_v1beta3
25+
26+
# Settings shared by the batch-processing test below.
location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
# Kept as a plain string because it is passed to the sample as a URI prefix.
gcs_output_uri_prefix = str(uuid4())
# Random suffix so every run gets its own bucket name.
BUCKET_NAME = f"document-ai-python-{uuid4()}"
32+
33+
34+
@pytest.fixture(scope="module")
def test_bucket():
    """Create the bucket named BUCKET_NAME, yield its name, then remove it.

    On teardown every blob in the bucket is deleted before the bucket
    itself; a NotFound error during cleanup is reported and ignored.
    """
    gcs = storage.Client()
    created = gcs.create_bucket(BUCKET_NAME)
    yield created.name

    try:
        leftovers = list(created.list_blobs())
        for leftover in leftovers:
            leftover.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")
47+
48+
49+
def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample against the test bucket and check its output."""
    output_bucket_uri = f"gs://{test_bucket}"
    batch_process_documents_sample_v1beta3.batch_process_documents(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        gcs_input_uri=gcs_input_uri,
        gcs_output_uri=output_bucket_uri,
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    captured = capsys.readouterr()

    # The sample prints key/value pairs and paragraph text from the invoice.
    for expected in ("Extracted", "Paragraph", "Invoice"):
        assert expected in captured.out

documentai/snippets/noxfile.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,24 +37,22 @@
3737

3838
TEST_CONFIG = {
3939
# You can opt out from the test for specific Python versions.
40-
'ignored_versions': ["2.7"],
41-
40+
"ignored_versions": ["2.7"],
4241
# An envvar key for determining the project id to use. Change it
4342
# to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
4443
# build specific Cloud project. You can also use your own string
4544
# to use your own Cloud project.
46-
'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
45+
"gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
4746
# 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
48-
4947
# A dictionary you want to inject into your test. Don't put any
5048
# secrets here. These values will override predefined values.
51-
'envs': {},
49+
"envs": {},
5250
}
5351

5452

5553
try:
5654
# Ensure we can import noxfile_config in the project's directory.
57-
sys.path.append('.')
55+
sys.path.append(".")
5856
from noxfile_config import TEST_CONFIG_OVERRIDE
5957
except ImportError as e:
6058
print("No user noxfile_config found: detail: {}".format(e))
@@ -69,13 +67,13 @@ def get_pytest_env_vars():
6967
ret = {}
7068

7169
# Override the GCLOUD_PROJECT and the alias.
72-
env_key = TEST_CONFIG['gcloud_project_env']
70+
env_key = TEST_CONFIG["gcloud_project_env"]
7371
# This should error out if not set.
74-
ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
75-
ret['GCLOUD_PROJECT'] = os.environ[env_key] # deprecated
72+
ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
73+
ret["GCLOUD_PROJECT"] = os.environ[env_key] # deprecated
7674

7775
# Apply user supplied envs.
78-
ret.update(TEST_CONFIG['envs'])
76+
ret.update(TEST_CONFIG["envs"])
7977
return ret
8078

8179

@@ -84,7 +82,7 @@ def get_pytest_env_vars():
8482
ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]
8583

8684
# Any default versions that should be ignored.
87-
IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
85+
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
8886

8987
TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
9088

@@ -138,7 +136,7 @@ def lint(session):
138136
args = FLAKE8_COMMON_ARGS + [
139137
"--application-import-names",
140138
",".join(local_names),
141-
"."
139+
".",
142140
]
143141
session.run("flake8", *args)
144142

@@ -147,6 +145,7 @@ def lint(session):
147145
# Black
148146
#
149147

148+
150149
@nox.session
151150
def blacken(session):
152151
session.install("black")
@@ -194,9 +193,9 @@ def py(session):
194193
if session.python in TESTED_VERSIONS:
195194
_session_tests(session)
196195
else:
197-
session.skip("SKIPPED: {} tests are disabled for this sample.".format(
198-
session.python
199-
))
196+
session.skip(
197+
"SKIPPED: {} tests are disabled for this sample.".format(session.python)
198+
)
200199

201200

202201
#
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
from google.cloud import documentai_v1beta3 as documentai
17+
18+
# [START documentai_process_document]
19+
20+
# TODO(developer): Uncomment these variables before running the sample.
21+
# project_id = 'YOUR_PROJECT_ID'
22+
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
23+
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
24+
# file_path = '/path/to/local/pdf'
25+
26+
27+
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; format is 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path to the local PDF file to process.
    """
    # Instantiates a client
    docai_client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Read the file into memory
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    # Configure the process request
    process_request = {
        "name": processor_name,
        "document": {"content": raw_bytes, "mime_type": "application/pdf"},
    }

    # Recognizes text entities in the PDF document
    response = docai_client.process_document(request=process_request)
    parsed = response.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    pages = parsed.pages

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in pages:
        for paragraph in page.paragraphs:
            snippet = get_text(paragraph.layout, parsed)
            print(f"Paragraph text: {snippet}")
65+
66+
67+
# Extract shards from the text field
68+
def get_text(doc_element, document) -> str:
    """Return the text that a document element refers to.

    Document AI identifies form fields by their offsets in the document
    text. This function converts those offsets to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``, for
            example a paragraph layout.
        document: Object whose ``text`` attribute holds the full text
            that the segment offsets index into.

    Returns:
        The concatenation of ``document.text[start_index:end_index]`` for
        every text segment, in order; an empty string if there are none.
    """
    # If a text segment spans several lines, it will be stored in
    # different text segments. The start offset must be read from each
    # segment itself; a missing or zero value means the start of the text.
    return "".join(
        document.text[int(segment.start_index or 0):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
86+
87+
88+
# [END documentai_process_document]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
18+
from samples.snippets import process_document_sample_v1beta3
19+
20+
21+
# Settings shared by the online-processing test below.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
location = "us"
processor_id = "90484cfdedb024f6"
# Relative path to the PDF that the sample reads.
file_path = "resources/invoice.pdf"
25+
26+
27+
def test_process_documents(capsys):
    """Run the online-processing sample on file_path and check its output."""
    process_document_sample_v1beta3.process_document_sample(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )
    captured = capsys.readouterr()

    # The sample prints one "Paragraph text: ..." line per paragraph.
    for expected in ("Paragraph", "Invoice"):
        assert expected in captured.out

0 commit comments

Comments
 (0)