/**
 * Copyright 2021, Google, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';
async function main(projectId, location, processorId, filePath) {
  // [START documentai_process_ocr_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
  // const filePath = '/path/to/local/pdf';

  const {DocumentProcessorServiceClient} =
    require('@google-cloud/documentai').v1beta3;

  // Instantiates a client
  const client = new DocumentProcessorServiceClient();
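  // Note: processors in the 'eu' location may require the regional endpoint, e.g.
  // new DocumentProcessorServiceClient({apiEndpoint: 'eu-documentai.googleapis.com'}).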

  async function processDocument() {
    // The full resource name of the processor, e.g.:
    // projects/project-id/locations/location/processors/processor-id
    // You must create new processors in the Cloud Console first
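    // (The generated client may also offer a processorPath() helper for building this name.)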
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Read the file into memory.
    const fs = require('fs').promises;
    const imageFile = await fs.readFile(filePath);

    // Convert the image data to a Buffer and base64 encode it.
    const encodedImage = Buffer.from(imageFile).toString('base64');

    const request = {
      name,
      rawDocument: {
        content: encodedImage,
        mimeType: 'application/pdf',
      },
    };
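    // The OCR processor also accepts image MIME types (for example image/tiff
    // or image/png); set mimeType to match the file referenced by filePath.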

    // Recognizes text entities in the PDF document
    const [result] = await client.processDocument(request);
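    // (processDocument handles one file synchronously; for larger workloads the
    // client also exposes batchProcessDocuments for documents in Cloud Storage.)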

    console.log('Document processing complete.');

    // Read the text recognition output from the processor
    // For a full list of Document object attributes,
    // please reference this page: https://googleapis.dev/nodejs/documentai/latest/index.html
    const {document} = result;
    const {text} = document;
    console.log(`Full document text: ${JSON.stringify(text)}`);
    console.log(`There are ${document.pages.length} page(s) in this document.`);
    for (const page of document.pages) {
      console.log(`Page ${page.pageNumber}`);
      printPageDimensions(page.dimension);
      printDetectedLanguages(page.detectedLanguages);
      printParagraphs(page.paragraphs, text);
      printBlocks(page.blocks, text);
      printLines(page.lines, text);
      printTokens(page.tokens, text);
    }
  }

  const printPageDimensions = dimension => {
    console.log(`  Width: ${dimension.width}`);
    console.log(`  Height: ${dimension.height}`);
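    // dimension.unit (for example 'pixels') reports the unit for width and height.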
  };

  const printDetectedLanguages = detectedLanguages => {
    console.log('  Detected languages:');
    for (const lang of detectedLanguages) {
      const code = lang.languageCode;
      const confPercent = lang.confidence * 100;
      console.log(`    ${code} (${confPercent.toFixed(2)}% confidence)`);
    }
  };

  const printParagraphs = (paragraphs, text) => {
    console.log(`  ${paragraphs.length} paragraphs detected:`);
    const firstParagraphText = getText(paragraphs[0].layout.textAnchor, text);
    console.log(
      `    First paragraph text: ${JSON.stringify(firstParagraphText)}`
    );
    const lastParagraphText = getText(
      paragraphs[paragraphs.length - 1].layout.textAnchor,
      text
    );
    console.log(
      `    Last paragraph text: ${JSON.stringify(lastParagraphText)}`
    );
  };

  const printBlocks = (blocks, text) => {
    console.log(`  ${blocks.length} blocks detected:`);
    const firstBlockText = getText(blocks[0].layout.textAnchor, text);
    console.log(`    First block text: ${JSON.stringify(firstBlockText)}`);
    const lastBlockText = getText(
      blocks[blocks.length - 1].layout.textAnchor,
      text
    );
    console.log(`    Last block text: ${JSON.stringify(lastBlockText)}`);
  };

  const printLines = (lines, text) => {
    console.log(`  ${lines.length} lines detected:`);
    const firstLineText = getText(lines[0].layout.textAnchor, text);
    console.log(`    First line text: ${JSON.stringify(firstLineText)}`);
    const lastLineText = getText(
      lines[lines.length - 1].layout.textAnchor,
      text
    );
    console.log(`    Last line text: ${JSON.stringify(lastLineText)}`);
  };

  const printTokens = (tokens, text) => {
    console.log(`  ${tokens.length} tokens detected:`);
    const firstTokenText = getText(tokens[0].layout.textAnchor, text);
    console.log(`    First token text: ${JSON.stringify(firstTokenText)}`);
    const firstTokenBreakType = tokens[0].detectedBreak.type;
    console.log(`    First token break type: ${firstTokenBreakType}`);
    const lastTokenText = getText(
      tokens[tokens.length - 1].layout.textAnchor,
      text
    );
    console.log(`    Last token text: ${JSON.stringify(lastTokenText)}`);
    const lastTokenBreakType = tokens[tokens.length - 1].detectedBreak.type;
    console.log(`    Last token break type: ${lastTokenBreakType}`);
  };

  // Extract shards from the text field
  const getText = (textAnchor, text) => {
    if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
      return '';
    }
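    // Note: a text anchor can contain multiple text segments; this helper only
    // reads the first segment, which is enough for single-shard documents.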
    // First shard in document doesn't have startIndex property
    const startIndex = textAnchor.textSegments[0].startIndex || 0;
    const endIndex = textAnchor.textSegments[0].endIndex;

    return text.substring(startIndex, endIndex);
  };

  // [END documentai_process_ocr_document]
  await processDocument();
}
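// Expected command-line invocation (the script name here is illustrative):
//   node process-document-ocr.js <projectId> <location> <processorId> </path/to/file.pdf>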

main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});