|
| 1 | +// Copyright 2016, Google, Inc. |
| 2 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 3 | +// you may not use this file except in compliance with the License. |
| 4 | +// You may obtain a copy of the License at |
| 5 | +// |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 7 | +// |
| 8 | +// Unless required by applicable law or agreed to in writing, software |
| 9 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +// See the License for the specific language governing permissions and |
| 12 | +// limitations under the License. |
| 13 | + |
| 14 | +'use strict'; |
| 15 | + |
| 16 | +// [START app] |
| 17 | +// [START import_libraries] |
| 18 | +var async = require('async'); |
| 19 | +var fs = require('fs'); |
| 20 | +var path = require('path'); |
| 21 | +var gcloud = require('gcloud')(); |
| 22 | +var natural = require('natural'); |
| 23 | +var redis = require('redis'); |
| 24 | +// Get a reference to the vision component |
| 25 | +var vision = gcloud.vision(); |
| 26 | +// [END import_libraries] |
| 27 | + |
// Search index backed by two redis databases on one server.
//
// The server address is read from the REDIS_HOST / REDIS_PORT environment
// variables and falls back to 127.0.0.1:6379. Database 0 is exposed as
// `tokenClient` and database 1 as `docsClient`.
function Index() {
  var redisPort = process.env.REDIS_PORT || '6379';
  var redisHost = process.env.REDIS_HOST || '127.0.0.1';

  // Shared 'error' listener: report the failure on stderr.
  function reportRedisError(err) {
    console.error('ERR:REDIS: ' + err);
  }

  // Opens a client bound to the given database number.
  function connect(dbNumber) {
    return redis.createClient(redisPort, redisHost, {
      db: dbNumber
    }).on('error', reportRedisError);
  }

  this.tokenClient = connect(0);
  this.docsClient = connect(1);
}
| 46 | + |
// Closes both redis connections held by this index, token client first.
Index.prototype.quit = function () {
  var clients = [this.tokenClient, this.docsClient];
  clients.forEach(function (client) {
    client.quit();
  });
};
| 51 | + |
// Indexes the text of one document.
//
// Every distinct, lowercased token of `document` gets `filename` added to its
// redis set (token database), and the raw text is stored under `filename` in
// the docs database.
//
// filename: key identifying the document (the image path).
// document: text to tokenize and store.
// callback: passed to async.parallel; called with (err, results) once every
//           redis command has finished.
Index.prototype.add = function (filename, document, callback) {
  var self = this;
  var PUNCTUATION = ['.', ',', ':', ''];
  var tokenizer = new natural.WordTokenizer();
  // Null-prototype map so a token such as "__proto__" is tracked like any other.
  var seen = Object.create(null);

  // TODO: Remove stop words

  var tasks = tokenizer.tokenize(document).map(function (token) {
    // lookup() lowercases its search words, so the keys must be lowercase too.
    return token.toLowerCase();
  }).filter(function (token) {
    if (PUNCTUATION.indexOf(token) !== -1 || seen[token]) {
      return false;
    }
    // SADD is idempotent; one command per distinct token is enough.
    seen[token] = true;
    return true;
  }).map(function (token) {
    return function (cb) {
      self.tokenClient.sadd(token, filename, cb);
    };
  });

  // Store the text where documentIsProcessed() looks for it (the docs
  // database); writing it to the token database would never mark the file as
  // processed and could clash with a token key of the same name.
  tasks.push(function (cb) {
    self.docsClient.set(filename, document, cb);
  });

  async.parallel(tasks, callback);
};
| 74 | + |
// Fetches, for each word, the members of the redis set stored under the
// lowercased word in the token database.
//
// words:    array of search terms.
// callback: passed to async.parallel; called with (err, hits).
Index.prototype.lookup = function (words, callback) {
  var queries = [];
  words.forEach(function (word) {
    // Normalise eagerly, before any query is issued.
    var key = word.toLowerCase();
    queries.push(function (done) {
      this.tokenClient.smembers(key, done);
    }.bind(this));
  }, this);
  async.parallel(queries, callback);
};
| 85 | + |
// Reports whether `filename` already has an entry in the docs database.
//
// A non-empty stored value means the file was indexed; an empty string means
// it was checked before and had no text. Both count as processed.
//
// callback: called with (err) on a redis failure, otherwise (null, boolean).
Index.prototype.documentIsProcessed = function (filename, callback) {
  function onValue(err, value) {
    if (err) {
      return callback(err);
    }
    var note = null;
    if (value) {
      note = ' already added to index.';
    } else if (value === '') {
      note = ' was already checked, and contains no text.';
    }
    var processed = note !== null;
    if (processed) {
      console.log(filename + note);
    }
    callback(null, processed);
  }

  this.docsClient.GET(filename, onValue);
};
| 102 | + |
// Marks `filename` as checked-but-empty by storing an empty string under it in
// the docs database; `callback` receives the redis reply.
Index.prototype.setContainsNoText = function (filename, callback) {
  var emptyDocument = '';
  this.docsClient.set(filename, emptyDocument, callback);
};
| 106 | + |
// Looks up each word in a fresh Index, prints the matching filenames per word
// and hands the raw hits to `callback` as (null, hits). The index connections
// are closed before anything else happens, on success and on failure alike.
function lookup(words, callback) {
  var index = new Index();

  function printHits(word, position, hits) {
    console.log('hits for \"' + word + '\":', hits[position].join(', '));
  }

  function onHits(err, hits) {
    index.quit();
    if (err) {
      return callback(err);
    }
    words.forEach(function (word, i) {
      printHits(word, i, hits);
    });
    callback(null, hits);
  }

  index.lookup(words, onHits);
}
| 120 | + |
| 121 | +// [START extract_descrs] |
// Builds one searchable document from a list of text detections.
//
// texts: array of objects whose `desc` property holds the detected text;
//        entries with a missing or empty `desc` are ignored.
// Returns the non-empty descriptions joined by single spaces ('' when there
// are none).
//
// The separator matters: gluing descriptions together directly would fuse
// neighbouring detections ("STOP" + "AHEAD" -> "STOPAHEAD") into tokens that
// never appeared in the image.
// NOTE(review): presumably the first entry is the full text and later entries
// are individual words - confirm against the Vision API response format.
function extractDescription(texts) {
  return texts.map(function (text) {
    return text.desc || '';
  }).filter(function (desc) {
    return desc !== '';
  }).join(' ');
}
| 129 | + |
// Records the outcome of text detection for one file.
//
// With at least one detection, the combined description text is added to the
// index under `filename`; otherwise the file is flagged as containing no text.
// `callback` is handed to whichever index method ends up being called.
function extractDescriptions(filename, index, texts, callback) {
  if (!texts.length) {
    console.log(filename + ' had no discernable text.');
    index.setContainsNoText(filename, callback);
    return;
  }
  index.add(filename, extractDescription(texts), callback);
}
| 138 | +// [END extract_descrs] |
| 139 | + |
| 140 | +// [START get_text] |
// Runs Vision text detection over `inputFiles` and indexes the results.
//
// index:      Index instance that receives the detected text.
// inputFiles: array of image paths, sent to the API in a single call.
// callback:   called with (err) when the API call or indexing fails,
//             otherwise (null, textResponse) where textResponse maps each
//             handled filename to 1 (detections came back as an array) or 0
//             (they did not). Files with an API error or with no detection
//             entry at all are logged and left out, so they stay unprocessed.
function getTextFromFiles(index, inputFiles, callback) {
  var options = { verbose: true };

  // Make a call to the Vision API to detect text
  vision.detectText(inputFiles, options, function (err, detections) {
    if (err) {
      return callback(err);
    }
    var textResponse = {};
    var tasks = [];
    inputFiles.forEach(function (filename, i) {
      // Guard against a missing or short detections list: dereferencing an
      // absent entry would throw inside this callback and `callback` would
      // never be invoked.
      // NOTE(review): some gcloud releases may hand back an unwrapped result
      // when a single image is passed - confirm against the installed version.
      var response = detections ? detections[i] : undefined;
      if (!response) {
        console.log('No detection result for ' + filename);
        return;
      }
      if (response.error) {
        console.log('API Error for ' + filename, response.error);
        return;
      }
      textResponse[filename] = Array.isArray(response) ? 1 : 0;
      tasks.push(function (cb) {
        extractDescriptions(filename, index, response, cb);
      });
    });
    async.parallel(tasks, function (err) {
      if (err) {
        return callback(err);
      }
      callback(null, textResponse);
    });
  });
}
| 173 | + |
| 174 | +// Run the example |
// Indexes the text of every not-yet-processed file directly inside `inputDir`
// (sub-directories are skipped, not descended into).
//
// inputDir: directory to scan.
// callback: called with (err, result) after the index connections have been
//           closed; `result` is whatever getTextFromFiles reported, and no
//           result is passed along when every file was already processed.
function main(inputDir, callback) {
  var index = new Index();

  // The parallel steps below leave an empty slot for each entry they drop;
  // this predicate keeps only the slots that still carry a path.
  function hasPath(filename) {
    return filename;
  }

  // Step 1: scan the specified directory for entries.
  function listEntries(next) {
    fs.readdir(inputDir, next);
  }

  // Step 2: separate directories from files.
  function keepFiles(entries, next) {
    var checks = entries.map(function (entry) {
      var filename = path.join(inputDir, entry);
      return function (done) {
        fs.stat(filename, function (err, stats) {
          if (err) {
            return done(err);
          }
          if (stats.isDirectory()) {
            return done();
          }
          done(null, filename);
        });
      };
    });
    async.parallel(checks, next);
  }

  // Step 3: figure out which files have already been processed.
  function dropProcessed(allImageFiles, next) {
    var checks = allImageFiles.filter(hasPath).map(function (filename) {
      return function (done) {
        index.documentIsProcessed(filename, function (err, processed) {
          if (err) {
            return done(err);
          }
          if (processed) {
            return done();
          }
          // Forward this filename on for further processing
          done(null, filename);
        });
      };
    });
    async.parallel(checks, next);
  }

  // Step 4: analyze any remaining unprocessed files.
  function analyzeRemaining(imageFilesToProcess, next) {
    var pending = imageFilesToProcess.filter(hasPath);
    if (!pending.length) {
      console.log('All files processed!');
      return next();
    }
    return getTextFromFiles(index, pending, next);
  }

  // Always release the redis connections before reporting back.
  function finish(err, result) {
    index.quit();
    callback(err, result);
  }

  async.waterfall([listEntries, keepFiles, dropProcessed, analyzeRemaining], finish);
}
| 236 | +// [END get_text] |
| 237 | + |
| 238 | +// [START run_application] |
// Command-line entry point; runs only when this file is executed directly.
//
//   node textDetection analyze <dir>       index the images in <dir>
//   node textDetection lookup <word> ...   print the files matching each word
//
// Any other invocation prints a usage message and exits with status 1.
if (module === require.main) {
  var generalError = 'Usage: node textDetection <command> <arg> ...\n\n' +
    '\tCommands: analyze, lookup';
  // Prints a usage message, then terminates the process with status 1.
  var exitWithUsage = function (message) {
    console.log(message);
    process.exit(1);
  };

  if (process.argv.length < 3) {
    exitWithUsage(generalError);
  }
  var args = process.argv.slice(2);
  var command = args.shift();
  switch (command) {
    case 'analyze':
      if (!args.length) {
        exitWithUsage('Usage: node textDetection analyze <dir>');
      }
      main(args[0], console.log);
      break;
    case 'lookup':
      if (!args.length) {
        exitWithUsage('Usage: node textDetection lookup <word> ...');
      }
      lookup(args, console.log);
      break;
    default:
      exitWithUsage(generalError);
  }
}
| 265 | +// [END run_application] |
| 266 | +// [END app] |
| 267 | + |
// Public API of this module.
module.exports.Index = Index;
module.exports.lookup = lookup;
module.exports.getTextFromFiles = getTextFromFiles;
module.exports.main = main;
0 commit comments