|
24 | 24 | python detect.py web-uri http://wheresgus.com/dog.JPG
|
25 | 25 | python detect.py web-geo ./resources/city.jpg
|
26 | 26 | python detect.py faces-uri gs://your-bucket/file.jpg
|
| 27 | +python detect.py ocr-uri gs://python-docs-samples-tests/HodgeConj.pdf \ |
| 28 | +gs://BUCKET_NAME/PREFIX/ |
27 | 29 |
|
28 | 30 | For more information, the documentation at
|
29 | 31 | https://cloud.google.com/vision/docs.
|
30 | 32 | """
|
31 | 33 |
|
32 | 34 | import argparse
|
33 | 35 | import io
|
| 36 | +import re |
34 | 37 |
|
| 38 | +from google.cloud import storage |
35 | 39 | from google.cloud import vision
|
| 40 | +from google.protobuf import json_format |
36 | 41 |
|
37 | 42 |
|
38 | 43 | # [START def_detect_faces]
|
@@ -636,6 +641,76 @@ def detect_document_uri(uri):
|
636 | 641 | # [END def_detect_document_uri]
|
637 | 642 |
|
638 | 643 |
|
| 644 | +# [START vision_async_detect_document_ocr] |
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=180):
    """OCR with PDF/TIFF as source files on GCS"""
    # NOTE: the docstring above is also passed to argparse as the help text
    # of the `ocr-uri` sub-command, so it is kept as a one-liner and the
    # details are documented here instead.
    #
    # Args:
    #   gcs_source_uri: gs:// URI of the PDF/TIFF file to run OCR on.
    #   gcs_destination_uri: gs://BUCKET/PREFIX under which the JSON
    #       output files are written.
    #   mime_type: 'application/pdf' (default) or 'image/tiff'.
    #   batch_size: how many pages are grouped into each JSON output file.
    #   timeout: seconds to wait for the operation to finish.
    #
    # Raises:
    #   ValueError: gcs_destination_uri is not of the form
    #       gs://BUCKET/PREFIX.
    #   IndexError: the operation finished but no output file was found
    #       under the destination prefix.

    # Validate the destination up front so that a malformed URI fails
    # immediately instead of after the operation has been started and
    # waited for.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET/PREFIX, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # The bucket is passed positionally: the name of this parameter differs
    # between google-cloud-storage releases, so a keyword would not be
    # portable.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix. Names ending in '/' are skipped
    # because they are typically "folder" placeholder objects rather than
    # JSON output files.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise IndexError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # Each output file groups `batch_size` pages, so with the default
    # batch_size=2 the first response contains the first two pages of
    # the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
| 711 | +# [END vision_async_detect_document_ocr] |
| 712 | + |
| 713 | + |
639 | 714 | def run_local(args):
|
640 | 715 | if args.command == 'faces':
|
641 | 716 | detect_faces(args.path)
|
@@ -684,6 +759,8 @@ def run_uri(args):
|
684 | 759 | detect_document_uri(args.uri)
|
685 | 760 | elif args.command == 'web-geo-uri':
|
686 | 761 | web_entities_include_geo_results_uri(args.uri)
|
| 762 | + elif args.command == 'ocr-uri': |
| 763 | + async_detect_document(args.uri, args.destination_uri) |
687 | 764 |
|
688 | 765 |
|
689 | 766 | if __name__ == '__main__':
|
@@ -785,9 +862,14 @@ def run_uri(args):
|
785 | 862 | 'document-uri', help=detect_document_uri.__doc__)
|
786 | 863 | document_uri_parser.add_argument('uri')
|
787 | 864 |
|
| 865 | + ocr_uri_parser = subparsers.add_parser( |
| 866 | + 'ocr-uri', help=async_detect_document.__doc__) |
| 867 | + ocr_uri_parser.add_argument('uri') |
| 868 | + ocr_uri_parser.add_argument('destination_uri') |
| 869 | + |
788 | 870 | args = parser.parse_args()
|
789 | 871 |
|
790 |
| - if ('uri' in args.command): |
| 872 | + if 'uri' in args.command: |
791 | 873 | run_uri(args)
|
792 | 874 | else:
|
793 | 875 | run_local(args)
|
0 commit comments