|
| 1 | +/** |
| 2 | + * Copyright 2020 Google LLC |
| 3 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | + * you may not use this file except in compliance with the License. |
| 5 | + * You may obtain a copy of the License at |
| 6 | + * |
| 7 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | + * |
| 9 | + * Unless required by applicable law or agreed to in writing, software |
| 10 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | + * See the License for the specific language governing permissions and |
| 13 | + * limitations under the License. |
| 14 | + */ |
| 15 | + |
| 16 | +'use strict'; |
| 17 | + |
| 18 | +const uuid = require('uuid'); |
| 19 | + |
/**
 * Batch-processes a PDF stored in Cloud Storage with a Document AI
 * processor, then downloads the output Document JSON from Storage and
 * prints the detected paragraphs and form key/value pairs.
 *
 * @param {string} projectId GCP project that owns the processor.
 * @param {string} location Processor region, e.g. 'us' or 'eu'.
 * @param {string} processorId ID of an existing Document AI processor.
 * @param {string} gcsInputUri gs:// URI of the source PDF.
 * @param {string} gcsOutputUri Name of the output Storage bucket.
 * @param {string} gcsOutputUriPrefix Object prefix for the output files.
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  location = 'YOUR_PROJECT_LOCATION',
  processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4()
) {
  // [START documentai_batch_process_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID';
  // const gcsInputUri = 'YOUR_SOURCE_PDF';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';

  // Imports the Google Cloud client library
  const {
    DocumentProcessorServiceClient,
  } = require('@google-cloud/documentai').v1beta3;
  const {Storage} = require('@google-cloud/storage');

  // Instantiates Document AI, Storage clients
  const client = new DocumentProcessorServiceClient();
  const storage = new Storage();

  const {default: PQueue} = require('p-queue');

  async function batchProcessDocument() {
    // Fully-qualified resource name of the processor.
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Configure the batch process request.
    const request = {
      name,
      inputConfigs: [
        {
          gcsSource: gcsInputUri,
          mimeType: 'application/pdf',
        },
      ],
      outputConfig: {
        gcsDestination: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
      },
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    // Note: first request to the service takes longer than subsequent
    // requests.
    const [operation] = await client.batchProcessDocuments(request);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: gcsOutputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);

    // Add all asynchronous downloads to queue for execution.
    const queue = new PQueue({concurrency: 15});
    const tasks = files.map((fileInfo, index) => async () => {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // The results stored in the output Storage location
      // are formatted as a document object.
      const document = JSON.parse(file.toString());
      const {text} = document;

      // Extract the text spanned by a text anchor. An anchor may reference
      // several disjoint segments of `text`, so concatenate all of them
      // rather than reading only the first segment.
      // Note: the first shard in a document omits the startIndex property.
      const getText = textAnchor => {
        if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
          return '';
        }
        return textAnchor.textSegments
          .map(segment => text.substring(segment.startIndex || 0, segment.endIndex))
          .join('');
      };

      // Read the text recognition output from the processor
      console.log('The document contains the following paragraphs:');

      const [page1] = document.pages;
      const {paragraphs} = page1;
      for (const paragraph of paragraphs) {
        const paragraphText = getText(paragraph.layout.textAnchor);
        console.log(`Paragraph text:\n${paragraphText}`);
      }

      // Form parsing provides additional output about
      // form-formatted PDFs. You must create a form
      // processor in the Cloud Console to see full field details.
      console.log('\nThe following form key/value pairs were detected:');

      const {formFields} = page1;
      for (const field of formFields) {
        const fieldName = getText(field.fieldName.textAnchor);
        const fieldValue = getText(field.fieldValue.textAnchor);

        console.log('Extracted key value pair:');
        console.log(`\t(${fieldName}, ${fieldValue})`);
      }
    });
    await queue.addAll(tasks);
  }
  // [END documentai_batch_process_document]

  // Await the worker so that failures reject main()'s promise instead of
  // escaping as an unhandled rejection after main() has already resolved.
  await batchProcessDocument();
}

main(...process.argv.slice(2)).catch(err => {
  // Surface sample failures with a non-zero exit code.
  console.error(err);
  process.exitCode = 1;
});
0 commit comments