#!/usr/bin/env python

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates face detection, face emotion analysis
and speech transcription using the Google Cloud Video Intelligence API.

Usage Examples:
    python beta_snippets.py boxes \
        gs://python-docs-samples-tests/video/googlework_short.mp4

    python beta_snippets.py \
        emotions gs://python-docs-samples-tests/video/googlework_short.mp4

    python beta_snippets.py \
        transcription gs://python-docs-samples-tests/video/googlework_short.mp4
"""

import argparse

from google.cloud import videointelligence_v1p1beta1 as videointelligence
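
# Note: these snippets assume the Cloud Video Intelligence client library is
# installed; it is published on PyPI as `google-cloud-videointelligence`
# (for example: pip install google-cloud-videointelligence).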


# [START video_face_bounding_boxes]
def face_bounding_boxes(gcs_uri):
    """Detects faces' bounding boxes."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.FACE_DETECTION]

    config = videointelligence.types.FaceConfig(
        include_bounding_boxes=True)
    context = videointelligence.types.VideoContext(
        face_detection_config=config)

    operation = video_client.annotate_video(
        gcs_uri, features=features, video_context=context)
    print('\nProcessing video for face annotations:')

    result = operation.result(timeout=900)
    print('\nFinished processing.')

    # There is only one result because a single video was processed.
    faces = result.annotation_results[0].face_detection_annotations
    for i, face in enumerate(faces):
        print('Face {}'.format(i))

        # Each face_detection_annotation has only one segment.
        segment = face.segments[0]
        start_time = (segment.segment.start_time_offset.seconds +
                      segment.segment.start_time_offset.nanos / 1e9)
        end_time = (segment.segment.end_time_offset.seconds +
                    segment.segment.end_time_offset.nanos / 1e9)
        positions = '{}s to {}s'.format(start_time, end_time)
        print('\tSegment: {}\n'.format(positions))

        # Each detected face may appear in many frames of the video.
        # Here we process only the first frame.
        frame = face.frames[0]

        time_offset = (frame.time_offset.seconds +
                       frame.time_offset.nanos / 1e9)
        box = frame.attributes[0].normalized_bounding_box

        print('First frame time offset: {}s\n'.format(time_offset))

        print('First frame normalized bounding box:')
        print('\tleft  : {}'.format(box.left))
        print('\ttop   : {}'.format(box.top))
        print('\tright : {}'.format(box.right))
        print('\tbottom: {}'.format(box.bottom))
        print('\n')
# [END video_face_bounding_boxes]
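

# The normalized bounding box printed above gives `left`, `top`, `right` and
# `bottom` as fractions of the frame dimensions in [0, 1]. Below is a minimal
# sketch (not part of the original sample) of converting such a box to pixel
# coordinates, assuming the frame width and height are known:
def normalized_box_to_pixels(box, frame_width, frame_height):
    """Returns (left, top, right, bottom) in pixels for a normalized box."""
    return (int(box.left * frame_width),
            int(box.top * frame_height),
            int(box.right * frame_width),
            int(box.bottom * frame_height))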


# [START video_face_emotions]
def face_emotions(gcs_uri):
    """Analyzes faces' emotions over frames."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.enums.Feature.FACE_DETECTION]

    config = videointelligence.types.FaceConfig(
        include_emotions=True)
    context = videointelligence.types.VideoContext(
        face_detection_config=config)

    operation = video_client.annotate_video(
        gcs_uri, features=features, video_context=context)
    print('\nProcessing video for face annotations:')

    result = operation.result(timeout=600)
    print('\nFinished processing.')

    # There is only one result because a single video was processed.
    faces = result.annotation_results[0].face_detection_annotations
    for i, face in enumerate(faces):
        for j, frame in enumerate(face.frames):
            time_offset = (frame.time_offset.seconds +
                           frame.time_offset.nanos / 1e9)
            emotions = frame.attributes[0].emotions

            print('Face {}, frame {}, time_offset {}\n'.format(
                i, j, time_offset))

            # from videointelligence.enums
            emotion_labels = (
                'EMOTION_UNSPECIFIED', 'AMUSEMENT', 'ANGER',
                'CONCENTRATION', 'CONTENTMENT', 'DESIRE',
                'DISAPPOINTMENT', 'DISGUST', 'ELATION',
                'EMBARRASSMENT', 'INTEREST', 'PRIDE', 'SADNESS',
                'SURPRISE')

            for emotion in emotions:
                emotion_index = emotion.emotion
                emotion_label = emotion_labels[emotion_index]
                emotion_score = emotion.score

                print('emotion: {} (confidence score: {})'.format(
                    emotion_label, emotion_score))

            print('\n')

    print('\n')
# [END video_face_emotions]
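

# A minimal sketch (not part of the original sample): given the `emotions`
# list for a frame and the `emotion_labels` tuple used above, report only the
# single highest-scoring emotion instead of printing every one.
def top_emotion(emotions, emotion_labels):
    """Returns (label, score) for the highest-scoring emotion, or None."""
    if not emotions:
        return None
    best = max(emotions, key=lambda emotion: emotion.score)
    return emotion_labels[best.emotion], best.score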


# [START video_speech_transcription]
def speech_transcription(input_uri):
    """Transcribes speech from a video stored on GCS."""
    video_client = videointelligence.VideoIntelligenceServiceClient()

    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US')
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        input_uri, features=features,
        video_context=video_context)

    print('\nProcessing video for speech transcription.')

    result = operation.result(timeout=180)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    speech_transcription = annotation_results.speech_transcriptions[0]
    alternative = speech_transcription.alternatives[0]

    print('Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}\n'.format(alternative.confidence))

    print('Word level information:')
    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        print('\t{}s - {}s: {}'.format(
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9,
            word))
# [END video_speech_transcription]
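

# speech_transcription() above prints only the first transcription and its
# first alternative. A minimal sketch (not part of the original sample) that
# walks every alternative of every transcription in an annotation result:
def print_all_transcriptions(annotation_results):
    """Prints every alternative of every speech transcription."""
    for transcription in annotation_results.speech_transcriptions:
        for alternative in transcription.alternatives:
            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}\n'.format(alternative.confidence))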


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    analyze_faces_parser = subparsers.add_parser(
        'boxes', help=face_bounding_boxes.__doc__)
    analyze_faces_parser.add_argument('gcs_uri')

    analyze_emotions_parser = subparsers.add_parser(
        'emotions', help=face_emotions.__doc__)
    analyze_emotions_parser.add_argument('gcs_uri')

    speech_transcription_parser = subparsers.add_parser(
        'transcription', help=speech_transcription.__doc__)
    speech_transcription_parser.add_argument('gcs_uri')

    args = parser.parse_args()

    if args.command == 'boxes':
        face_bounding_boxes(args.gcs_uri)
    elif args.command == 'emotions':
        face_emotions(args.gcs_uri)
    elif args.command == 'transcription':
        speech_transcription(args.gcs_uri)