|
| 1 | +// Copyright 2016, Google, Inc. |
| 2 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 3 | +// you may not use this file except in compliance with the License. |
| 4 | +// You may obtain a copy of the License at |
| 5 | +// |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 7 | +// |
| 8 | +// Unless required by applicable law or agreed to in writing, software |
| 9 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +// See the License for the specific language governing permissions and |
| 12 | +// limitations under the License. |
| 13 | + |
| 14 | +'use strict'; |
| 15 | + |
| 16 | +// [START app] |
| 17 | +// [START import_libraries] |
| 18 | +var async = require('async'); |
| 19 | +var fs = require('fs'); |
| 20 | +var path = require('path'); |
| 21 | +var gcloud = require('gcloud'); |
| 22 | +var natural = require('natural'); |
| 23 | +var redis = require('redis'); |
| 24 | +// [END import_libraries] |
| 25 | + |
| 26 | +// [START authenticate] |
| 27 | +// You must set the GOOGLE_APPLICATION_CREDENTIALS and GCLOUD_PROJECT |
| 28 | +// environment variables to run this sample. See: |
| 29 | +// https://github.com/GoogleCloudPlatform/gcloud-node/blob/master/docs/authentication.md |
var projectId = process.env.GCLOUD_PROJECT;

// Swap the imported gcloud factory for a client configured with the project
// id read from the environment.
gcloud = gcloud({ projectId: projectId });

// All text detection calls in this file go through this vision handle.
var vision = gcloud.vision();
| 39 | +// [END authenticate] |
| 40 | + |
/**
 * Opens the two redis connections used by the index.
 *
 * `tokenClient` is bound to redis database 0 and `docsClient` to database 1.
 * The server location comes from the REDIS_PORT / REDIS_HOST environment
 * variables and falls back to 127.0.0.1:6379. Connection errors are only
 * logged to stderr.
 */
function Index() {
  var TOKEN_DB = 0;
  var DOCS_DB = 1;
  var redisPort = process.env.REDIS_PORT || '6379';
  var redisHost = process.env.REDIS_HOST || '127.0.0.1';

  // Both connections report errors the same way.
  var logRedisError = function (err) {
    console.error('ERR:REDIS: ' + err);
  };

  // Create a client for one database and attach the error logger to it.
  var connect = function (database) {
    var client = redis.createClient(redisPort, redisHost, {
      db: database
    });
    return client.on('error', logRedisError);
  };

  this.tokenClient = connect(TOKEN_DB);
  this.docsClient = connect(DOCS_DB);
}
| 59 | + |
/**
 * Asks both redis connections to close, token connection first.
 */
Index.prototype.quit = function () {
  var connections = [this.tokenClient, this.docsClient];
  connections.forEach(function (connection) {
    connection.quit();
  });
};
| 64 | + |
/**
 * Indexes one document: every token is added to a redis set keyed by the
 * token (members are filenames), and the document text is recorded under the
 * filename so the file can later be recognised as processed.
 *
 * @param {string} filename Key identifying the document.
 * @param {string} document Text to tokenize and index.
 * @param {function} callback Passed straight to async.parallel; receives
 *     (err, results) once every redis write has completed.
 */
Index.prototype.add = function (filename, document, callback) {
  var self = this;
  var PUNCTUATION = ['.', ',', ':', ''];
  var tokenizer = new natural.WordTokenizer();
  var tokens = tokenizer.tokenize(document);

  // TODO: Remove stop words

  var tasks = tokens.filter(function (token) {
    return PUNCTUATION.indexOf(token) === -1;
  }).map(function (token) {
    // Index.prototype.lookup lowercases each query word, so the key has to be
    // lowercased here too or mixed-case tokens could never be found.
    var key = token.toLowerCase();
    return function (cb) {
      self.tokenClient.sadd(key, filename, cb);
    };
  });

  tasks.push(function (cb) {
    // The document record belongs in the docs database: that is where
    // documentIsProcessed reads it back and where setContainsNoText writes.
    self.docsClient.set(filename, document, cb);
  });

  async.parallel(tasks, callback);
};
| 87 | + |
/**
 * Fetches, for each word, the members of the redis set stored under the
 * lowercased word.
 *
 * @param {string[]} words Words to look up.
 * @param {function} callback Handed to async.parallel; receives
 *     (err, results) with one entry per word, in the same order.
 */
Index.prototype.lookup = function (words, callback) {
  var index = this;
  var queries = [];

  words.forEach(function (word) {
    var key = word.toLowerCase();
    queries.push(function (done) {
      index.tokenClient.smembers(key, done);
    });
  });

  async.parallel(queries, callback);
};
| 98 | + |
/**
 * Reports whether a record for `filename` exists in the docs database.
 *
 * A non-empty stored value and an empty-string value both count as processed
 * (each with its own log line); any other value counts as not processed.
 *
 * @param {string} filename Key to check.
 * @param {function} callback Receives (err) on a redis error, otherwise
 *     (null, processed) where `processed` is a boolean.
 */
Index.prototype.documentIsProcessed = function (filename, callback) {
  this.docsClient.GET(filename, function (err, value) {
    if (err) {
      return callback(err);
    }

    // Pick the log line first; no log line means nothing was stored.
    var notice = null;
    if (value) {
      notice = filename + ' already added to index.';
    } else if (value === '') {
      notice = filename + ' was already checked, and contains no text.';
    }

    if (notice === null) {
      callback(null, false);
      return;
    }
    console.log(notice);
    callback(null, true);
  });
};
| 115 | + |
/**
 * Stores an empty string under `filename` in the docs database, which is the
 * value documentIsProcessed reports as "checked, contains no text".
 *
 * @param {string} filename Key to mark.
 * @param {function} callback Passed to the redis `set` call.
 */
Index.prototype.setContainsNoText = function (filename, callback) {
  var NO_TEXT = '';
  var docs = this.docsClient;
  docs.set(filename, NO_TEXT, callback);
};
| 119 | + |
/**
 * Looks the given words up in a freshly opened index, prints the hits for
 * each word and closes the index again.
 *
 * @param {string[]} words Words to search for.
 * @param {function} callback Receives (err) on failure, otherwise
 *     (null, hits) where hits[i] holds the matches for words[i].
 */
function lookup(words, callback) {
  var index = new Index();

  // Print one line per word, pairing each word with its hit list by position.
  var printHits = function (hits) {
    words.forEach(function (word, position) {
      console.log('hits for \"' + word + '\":', hits[position].join(', '));
    });
  };

  index.lookup(words, function (err, hits) {
    // The connections are no longer needed whether or not the lookup worked.
    index.quit();
    if (err) {
      return callback(err);
    }
    printHits(hits);
    callback(null, hits);
  });
}
| 133 | + |
| 134 | +// [START extract_descrs] |
/**
 * Builds a single document string from a list of text detections.
 *
 * Each detection contributes its `desc` value; detections without one are
 * skipped. The pieces are joined with a space so that neighbouring
 * descriptions stay separate words when the document is tokenized later
 * (plain concatenation would fuse "Hello" and "World" into "HelloWorld").
 *
 * @param {Object[]} texts Detections, each optionally carrying a `desc`.
 * @returns {string} The space-separated descriptions ('' if there are none).
 */
function extractDescription(texts) {
  return texts.map(function (text) {
    return text.desc || '';
  }).filter(function (description) {
    return description !== '';
  }).join(' ');
}
| 142 | + |
/**
 * Records the outcome of text detection for one file in the index.
 *
 * When `texts` has a non-zero length its combined description is added to
 * the index; otherwise the file is marked as containing no text.
 *
 * @param {string} filename File the detections belong to.
 * @param {Object} index Object providing `add` and `setContainsNoText`.
 * @param {Object[]} texts Detections for the file.
 * @param {function} callback Forwarded to whichever index method is called.
 */
function extractDescriptions(filename, index, texts, callback) {
  if (!texts.length) {
    console.log(filename + ' had no discernable text.');
    index.setContainsNoText(filename, callback);
    return;
  }
  var document = extractDescription(texts);
  index.add(filename, document, callback);
}
| 151 | +// [END extract_descrs] |
| 152 | + |
| 153 | +// [START get_text] |
/**
 * Runs text detection over `inputFiles` and records every usable response
 * in the index.
 *
 * Responses carrying an `error` property are logged and skipped. Every other
 * file is handed to extractDescriptions and gets an entry in the result map:
 * 1 when its response is an array, 0 otherwise.
 *
 * @param {Object} index Index passed through to extractDescriptions.
 * @param {string[]} inputFiles Files to send to the detection call.
 * @param {function} callback Receives (err) on failure, otherwise
 *     (null, textResponse) with the filename -> 0/1 map.
 */
function getTextFromFiles(index, inputFiles, callback) {
  var options = { verbose: true };

  // Make a call to the Vision API to detect text
  vision.detectText(inputFiles, options, function (err, detections) {
    if (err) {
      return callback(err);
    }

    var textResponse = {};
    var indexingTasks = [];

    inputFiles.forEach(function (filename, position) {
      // detections[position] is the response for inputFiles[position].
      var response = detections[position];
      if (response.error) {
        console.log('API Error for ' + filename, response.error);
        return;
      }
      textResponse[filename] = Array.isArray(response) ? 1 : 0;
      indexingTasks.push(function (done) {
        extractDescriptions(filename, index, response, done);
      });
    });

    async.parallel(indexingTasks, function (taskErr) {
      if (taskErr) {
        return callback(taskErr);
      }
      callback(null, textResponse);
    });
  });
}
| 186 | + |
| 187 | +// Run the example |
/**
 * Indexes the text found in the files of `inputDir`.
 *
 * Steps, run as an async waterfall: list the directory, drop entries that
 * are directories, drop files the index already has a record for, then send
 * whatever is left to getTextFromFiles. The index connections are closed
 * before `callback` is invoked.
 *
 * @param {string} inputDir Directory to scan (not recursive).
 * @param {function} callback Receives (err, result); `result` is whatever
 *     getTextFromFiles produced, or undefined when nothing needed processing.
 */
function main(inputDir, callback) {
  var index = new Index();

  // Tasks that have nothing to forward complete with no value, which leaves
  // undefined slots in the collected results; this strips them out.
  var dropEmpty = function (entries) {
    return entries.filter(function (entry) {
      return entry;
    });
  };

  // Scan the specified directory for files
  var listDirectory = function (next) {
    fs.readdir(inputDir, next);
  };

  // Separate directories from files
  var keepFilesOnly = function (files, next) {
    var statTasks = files.map(function (file) {
      var filename = path.join(inputDir, file);
      return function (done) {
        fs.stat(filename, function (err, stats) {
          if (err) {
            return done(err);
          }
          if (!stats.isDirectory()) {
            return done(null, filename);
          }
          done();
        });
      };
    });
    async.parallel(statTasks, next);
  };

  // Figure out which files have already been processed
  var keepUnprocessed = function (allImageFiles, next) {
    var checkTasks = dropEmpty(allImageFiles).map(function (filename) {
      return function (done) {
        index.documentIsProcessed(filename, function (err, processed) {
          if (err) {
            return done(err);
          }
          if (!processed) {
            // Forward this filename on for further processing
            return done(null, filename);
          }
          done();
        });
      };
    });
    async.parallel(checkTasks, next);
  };

  // Analyze any remaining unprocessed files
  var analyzeRemaining = function (candidates, next) {
    var pending = dropEmpty(candidates);
    if (pending.length) {
      return getTextFromFiles(index, pending, next);
    }
    console.log('All files processed!');
    next();
  };

  async.waterfall([
    listDirectory,
    keepFilesOnly,
    keepUnprocessed,
    analyzeRemaining
  ], function (err, result) {
    index.quit();
    callback(err, result);
  });
}
| 249 | +// [END get_text] |
| 250 | + |
| 251 | +// [START run_application] |
// Command-line entry point: only runs when this file is executed directly.
if (module === require.main) {
  var generalError = 'Usage: node textDetection <command> <arg> ...\n\n' +
    '\tCommands: analyze, lookup';

  // Print a usage message and terminate with a failure exit code.
  var exitWithUsage = function (message) {
    console.log(message);
    process.exit(1);
  };

  // Everything after "node <script>": the command followed by its arguments.
  var args = process.argv.slice(2);
  if (args.length < 1) {
    exitWithUsage(generalError);
  }

  var command = args.shift();
  switch (command) {
    case 'analyze':
      if (!args.length) {
        exitWithUsage('Usage: node textDetection analyze <dir>');
      }
      main(args[0], console.log);
      break;
    case 'lookup':
      if (!args.length) {
        exitWithUsage('Usage: node textDetection lookup <word> ...');
      }
      lookup(args, console.log);
      break;
    default:
      exitWithUsage(generalError);
  }
}
| 278 | +// [END run_application] |
| 279 | +// [END app] |
| 280 | + |
| 281 | +exports.Index = Index; |
| 282 | +exports.lookup = lookup; |
| 283 | +exports.getTextFromFiles = getTextFromFiles; |
| 284 | +exports.main = main; |
0 commit comments