Skip to content

Commit 4dd1a49

Browse files
telpirionsofislJustinBeckwith
authored
docs(samples): new Doc AI samples for v1beta3 (#101)
* docs(samples): new Doc AI samples for v1beta3 * feat: add a new version to the library * fix: adds processor ID as literal in tests * fix: removed apiEndpoint from client instantiation Co-authored-by: Sofia Leon <[email protected]> Co-authored-by: sofisl <[email protected]> Co-authored-by: Justin Beckwith <[email protected]>
1 parent 42fdf06 commit 4dd1a49

9 files changed

+435
-34
lines changed

document-ai/.eslintrc.yml

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
---
22
rules:
33
no-console: off
4-
node/no-missing-require: off
5-
node/no-extraneous-require: off
6-
4+
node/no-unsupported-features/node-builtins: off
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
/**
2+
* Copyright 2020 Google LLC
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
'use strict';
17+
18+
const uuid = require('uuid');
19+
20+
/**
 * Sends a Cloud Storage PDF through a Document AI processor as a batch
 * (long-running) request, then downloads every result file found under the
 * output prefix and prints the paragraphs and form fields of its first page.
 *
 * @param {string} projectId Google Cloud project that owns the processor.
 * @param {string} location Processor location; the sample notes 'us' or 'eu'.
 * @param {string} processorId ID of a processor created in the Cloud Console.
 * @param {string} gcsInputUri Cloud Storage URI of the PDF to process.
 * @param {string} gcsOutputUri Output bucket; used both to build the request's
 *     gcsDestination and as the name passed to storage.bucket().
 * @param {string} gcsOutputUriPrefix Object prefix for the results; defaults
 *     to a fresh UUID.
 * @returns {Promise<void>} Settles once every result file has been printed;
 *     rejects if any step fails.
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  location = 'YOUR_PROJECT_LOCATION',
  processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4()
) {
  // [START documentai_batch_process_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID';
  // const gcsInputUri = 'YOUR_SOURCE_PDF';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';

  // Imports the Google Cloud client library
  const {
    DocumentProcessorServiceClient,
  } = require('@google-cloud/documentai').v1beta3;
  const {Storage} = require('@google-cloud/storage');

  // Instantiates Document AI, Storage clients
  const client = new DocumentProcessorServiceClient();
  const storage = new Storage();

  const {default: PQueue} = require('p-queue');

  async function batchProcessDocument() {
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Configure the batch process request.
    const request = {
      name,
      inputConfigs: [
        {
          gcsSource: gcsInputUri,
          mimeType: 'application/pdf',
        },
      ],
      outputConfig: {
        gcsDestination: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
      },
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    // Note: first request to the service takes longer than subsequent
    // requests.
    const [operation] = await client.batchProcessDocuments(request);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: gcsOutputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);

    // Add all asynchronous downloads to queue for execution.
    const queue = new PQueue({concurrency: 15});
    const tasks = files.map((fileInfo, index) => async () => {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // The results stored in the output Storage location
      // are formatted as a document object.
      const document = JSON.parse(file.toString());

      // NOTE(review): the defaults below assume the JSON output may omit
      // empty fields (text, pages, paragraphs, formFields) -- confirm against
      // real processor output. They only replace a TypeError with empty
      // output; populated documents print exactly as before.
      const {text = ''} = document;

      // Extract shards from the text field.
      // Only the first text segment of an anchor is read.
      const getText = textAnchor => {
        if (
          !textAnchor ||
          !textAnchor.textSegments ||
          textAnchor.textSegments.length === 0
        ) {
          return '';
        }

        // First shard in document doesn't have startIndex property
        const startIndex = textAnchor.textSegments[0].startIndex || 0;
        const endIndex = textAnchor.textSegments[0].endIndex;

        return text.substring(startIndex, endIndex);
      };

      // Read the text recognition output from the processor
      console.log('The document contains the following paragraphs:');

      const [page1 = {}] = document.pages || [];
      const {paragraphs = []} = page1;
      for (const paragraph of paragraphs) {
        const paragraphText = getText(paragraph.layout.textAnchor);
        console.log(`Paragraph text:\n${paragraphText}`);
      }

      // Form parsing provides additional output about
      // form-formatted PDFs. You must create a form
      // processor in the Cloud Console to see full field details.
      console.log('\nThe following form key/value pairs were detected:');

      const {formFields = []} = page1;
      for (const field of formFields) {
        const fieldName = getText(field.fieldName.textAnchor);
        const fieldValue = getText(field.fieldValue.textAnchor);

        console.log('Extracted key value pair:');
        console.log(`\t(${fieldName}, ${fieldValue})`);
      }
    });
    await queue.addAll(tasks);
  }
  // [END documentai_batch_process_document]

  // Await the work so that main() only settles when processing is done and
  // any failure propagates to the caller instead of being dropped.
  await batchProcessDocument();
}

// Report failures and exit non-zero rather than leaving an unhandled rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});

document-ai/package.json

+4-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"license": "Apache-2.0",
55
"author": "Google LLC",
66
"engines": {
7-
"node": ">=8"
7+
"node": ">= 10.17.0"
88
},
99
"files": [
1010
"*.js"
@@ -14,7 +14,9 @@
1414
},
1515
"dependencies": {
1616
"@google-cloud/documentai": "^2.1.1",
17-
"@google-cloud/storage": "^5.0.0"
17+
"@google-cloud/storage": "^5.0.0",
18+
"p-queue": "^6.6.2",
19+
"uuid": "^8.3.1"
1820
},
1921
"devDependencies": {
2022
"chai": "^4.2.0",
+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/**
2+
* Copyright 2020, Google, Inc.
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
'use strict';
17+
18+
/**
 * Sends a local PDF to a Document AI processor and prints the paragraphs and
 * form key/value pairs found on the first page of the result.
 *
 * @param {string} projectId Google Cloud project that owns the processor.
 * @param {string} location Processor location; the sample notes 'us' or 'eu'.
 * @param {string} processorId ID of a processor created in the Cloud Console.
 * @param {string} filePath Path of the local PDF to process.
 * @returns {Promise<void>} Settles once the results have been printed.
 */
async function main(projectId, location, processorId, filePath) {
  // [START documentai_process_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
  // const filePath = '/path/to/local/pdf';

  const {
    DocumentProcessorServiceClient,
  } = require('@google-cloud/documentai').v1beta3;

  // Instantiates a client
  const client = new DocumentProcessorServiceClient();

  async function processDocument() {
    // The full resource name of the processor, e.g.:
    // projects/project-id/locations/location/processors/processor-id
    // You must create new processors in the Cloud Console first
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Read the file into memory and base64 encode its bytes.
    const {promises: fsPromises} = require('fs');
    const fileBytes = await fsPromises.readFile(filePath);
    const encodedImage = fileBytes.toString('base64');

    // Recognizes text entities in the PDF document
    const [result] = await client.processDocument({
      name,
      document: {
        content: encodedImage,
        mimeType: 'application/pdf',
      },
    });
    const {document} = result;

    // All of the document text as one big string; anchors index into it.
    const {text} = document;

    // Returns the slice of the document text that an anchor's first
    // segment points at, or '' when the anchor has no segments.
    const anchorText = ({textSegments}) => {
      if (!textSegments || textSegments.length === 0) {
        return '';
      }

      // First shard in document doesn't have startIndex property
      const [{startIndex, endIndex}] = textSegments;
      return text.substring(startIndex || 0, endIndex);
    };

    // Read the text recognition output from the processor
    console.log('The document contains the following paragraphs:');
    const [firstPage] = document.pages;

    for (const {layout} of firstPage.paragraphs) {
      console.log(`Paragraph text:\n${anchorText(layout.textAnchor)}`);
    }

    // Form parsing provides additional output about
    // form-formatted PDFs. You must create a form
    // processor in the Cloud Console to see full field details.
    console.log('\nThe following form key/value pairs were detected:');

    for (const field of firstPage.formFields) {
      const key = anchorText(field.fieldName.textAnchor);
      const value = anchorText(field.fieldValue.textAnchor);

      console.log('Extracted key value pair:');
      console.log(`\t(${key}, ${value})`);
    }
  }
  // [END documentai_process_document]
  await processDocument();
}

// Command-line arguments map positionally onto main's parameters; a failure
// is logged and turned into a non-zero exit code.
const [, , ...cliArgs] = process.argv;
main(...cliArgs).catch(err => {
  console.error(err);
  process.exitCode = 1;
});

document-ai/quickstart.js

+39-23
Original file line numberDiff line numberDiff line change
@@ -15,56 +15,72 @@
1515

1616
'use strict';
1717

18-
async function main(
19-
projectId,
20-
location,
21-
gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
22-
) {
18+
async function main(projectId, location, processorId, filePath) {
2319
// [START documentai_quickstart]
2420
/**
2521
* TODO(developer): Uncomment these variables before running the sample.
2622
*/
2723
// const projectId = 'YOUR_PROJECT_ID';
2824
// const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
29-
// const gcsInputUri = 'YOUR_SOURCE_PDF';
25+
// const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
26+
// const filePath = '/path/to/local/pdf';
3027

3128
const {
32-
DocumentUnderstandingServiceClient,
33-
} = require('@google-cloud/documentai').v1beta2;
34-
const client = new DocumentUnderstandingServiceClient();
29+
DocumentProcessorServiceClient,
30+
} = require('@google-cloud/documentai').v1beta3;
31+
32+
// Instantiates a client
33+
const client = new DocumentProcessorServiceClient();
3534

3635
async function quickstart() {
37-
// Configure the request for processing the PDF
38-
const parent = `projects/${projectId}/locations/${location}`;
36+
// The full resource name of the processor, e.g.:
37+
// projects/project-id/locations/location/processors/processor-id
38+
// You must create new processors in the Cloud Console first
39+
const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
40+
41+
// Read the file into memory.
42+
const fs = require('fs').promises;
43+
const imageFile = await fs.readFile(filePath);
44+
45+
// Convert the image data to a Buffer and base64 encode it.
46+
const encodedImage = Buffer.from(imageFile).toString('base64');
47+
3948
const request = {
40-
parent,
41-
inputConfig: {
42-
gcsSource: {
43-
uri: gcsInputUri,
44-
},
49+
name,
50+
document: {
51+
content: encodedImage,
4552
mimeType: 'application/pdf',
4653
},
4754
};
4855

4956
// Recognizes text entities in the PDF document
5057
const [result] = await client.processDocument(request);
58+
const {document} = result;
5159

5260
// Get all of the document text as one big string
53-
const {text} = result;
61+
const {text} = document;
5462

5563
// Extract shards from the text field
56-
function extractText(textAnchor) {
64+
const getText = textAnchor => {
65+
if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
66+
return '';
67+
}
68+
5769
// First shard in document doesn't have startIndex property
5870
const startIndex = textAnchor.textSegments[0].startIndex || 0;
5971
const endIndex = textAnchor.textSegments[0].endIndex;
6072

6173
return text.substring(startIndex, endIndex);
62-
}
74+
};
75+
76+
// Read the text recognition output from the processor
77+
console.log('The document contains the following paragraphs:');
78+
const [page1] = document.pages;
79+
const {paragraphs} = page1;
6380

64-
for (const entity of result.entities) {
65-
console.log(`\nEntity text: ${extractText(entity.textAnchor)}`);
66-
console.log(`Entity type: ${entity.type}`);
67-
console.log(`Entity mention text: ${entity.mentionText}`);
81+
for (const paragraph of paragraphs) {
82+
const paragraphText = getText(paragraph.layout.textAnchor);
83+
console.log(`Paragraph text:\n${paragraphText}`);
6884
}
6985
}
7086
// [END documentai_quickstart]

document-ai/resources/invoice.pdf

57.6 KB
Binary file not shown.

0 commit comments

Comments
 (0)