Skip to content

Commit 7b2f8c9

Browse files
munkhuushmglholtskinner
authored andcommitted
samples: migrate v1beta2 doc AI samples (#79)
* samples: migrate v1beta2 doc AI samples * added noxfile * reformatted code * organized imports in right order * lint * finally fixed lint * reorganized folders * imports * added from prefix imports * renamed files * renamed package on tests files * nit
1 parent efb2acc commit 7b2f8c9

15 files changed

+825
-18
lines changed

batch_parse_form_v1beta2.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_form_beta]
17+
import re
18+
19+
from google.cloud import documentai_v1beta2 as documentai
20+
from google.cloud import storage
21+
22+
23+
def batch_parse_form(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the form fields of a document stored in Cloud Storage.

    Sends a batch request containing one document with form extraction
    enabled, waits for the operation to finish, then prints the names of
    the blobs found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://bucket/prefix`` URI where results are written.

    Raises:
        ValueError: If ``destination_uri`` is not a ``gs://`` URI that names
            a bucket.
    """
    # Split the destination into bucket and object prefix *before* starting
    # the batch operation, so a malformed URI fails immediately instead of
    # crashing on a failed regex match after the work has been done.
    # The prefix may be empty when results are written to the bucket root.
    match = re.match(r"gs://([^/]+)/?(.*)", destination_uri)
    if match is None:
        raise ValueError(
            "destination_uri must look like gs://bucket/prefix, got: {!r}".format(
                destination_uri
            )
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (i.e. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key="Emergency Contact", value_types=["NAME"]
        ),
        documentai.types.KeyValuePairHint(key="Referred By"),
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params,
    )

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the destination
    # prefix that was parsed at the top of the function.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")
    for blob in blob_list:
        print(blob.name)
97+
98+
99+
# [END documentai_batch_parse_form_beta]

batch_parse_form_v1beta2_test.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import uuid
17+
18+
from google.cloud import storage
19+
20+
import pytest
21+
22+
from samples.snippets import batch_parse_form_v1beta2
23+
24+
25+
BUCKET = "document-ai-{}".format(uuid.uuid4())
26+
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
27+
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
28+
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"
29+
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)
30+
31+
32+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a throwaway bucket for the test and remove it afterwards."""
    # The bucket receives the batch output written while the test runs.
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so the bucket is removed together with any objects in it.
    temp_bucket.delete(force=True)
41+
42+
43+
def test_batch_parse_form(capsys):
    """Run the form sample and check that it prints the listing header."""
    batch_parse_form_v1beta2.batch_parse_form(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    out, _ = capsys.readouterr()
    # Match the exact header the sample prints (including the colon), the
    # same way the table sample's test does.
    assert "Output files:" in out

batch_parse_table_v1beta2.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_table_beta]
17+
import re
18+
19+
from google.cloud import documentai_v1beta2 as documentai
20+
from google.cloud import storage
21+
22+
23+
def batch_parse_table(
    project_id="YOUR_PROJECT_ID",
    input_uri="gs://cloud-samples-data/documentai/form.pdf",
    destination_uri="gs://your-bucket-id/path/to/save/results/",
):
    """Batch-parse the tables of a document stored in Cloud Storage.

    Sends a batch request containing one document with table extraction
    enabled, waits for the operation to finish, then prints the names of
    the blobs found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://bucket/prefix`` URI where results are written.

    Raises:
        ValueError: If ``destination_uri`` is not a ``gs://`` URI that names
            a bucket.
    """
    # Split the destination into bucket and object prefix *before* starting
    # the batch operation, so a malformed URI fails immediately instead of
    # crashing on a failed regex match after the work has been done.
    # The prefix may be empty when results are written to the bucket root.
    match = re.match(r"gs://([^/]+)/?(.*)", destination_uri)
    if match is None:
        raise ValueError(
            "destination_uri must look like gs://bucket/prefix, got: {!r}".format(
                destination_uri
            )
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type="application/pdf"
    )

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(uri=destination_uri),
        pages_per_shard=1,  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(x=0, y=0),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(x=1, y=0),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(x=1, y=1),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(x=0, y=1),
                ]
            ),
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints
    )

    # Location can be 'us' or 'eu'
    parent = "projects/{}/locations/us".format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params,
    )

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the destination
    # prefix that was parsed at the top of the function.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")
    for blob in blob_list:
        print(blob.name)
105+
106+
107+
# [END documentai_batch_parse_table_beta]

batch_parse_table_v1beta2_test.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import uuid
17+
18+
from google.cloud import storage
19+
20+
import pytest
21+
22+
from samples.snippets import batch_parse_table_v1beta2
23+
24+
25+
BUCKET = "document-ai-{}".format(uuid.uuid4())
26+
OUTPUT_PREFIX = "TEST_OUTPUT_{}".format(uuid.uuid4())
27+
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
28+
INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"
29+
BATCH_OUTPUT_URI = "gs://{}/{}/".format(BUCKET, OUTPUT_PREFIX)
30+
31+
32+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision a throwaway bucket for the test and remove it afterwards."""
    # The bucket receives the batch output written while the test runs.
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so the bucket is removed together with any objects in it.
    temp_bucket.delete(force=True)
41+
42+
43+
def test_batch_parse_table(capsys):
    """Run the table sample and check that it prints the listing header."""
    batch_parse_table_v1beta2.batch_parse_table(
        project_id=PROJECT_ID,
        input_uri=INPUT_URI,
        destination_uri=BATCH_OUTPUT_URI,
    )
    captured = capsys.readouterr()
    assert "Output files:" in captured.out

0 commit comments

Comments
 (0)