@@ -229,6 +229,55 @@ def analyze_shots(path):
229
229
# [END video_analyze_shots]
230
230
231
231
232
def speech_transcription(path):
    # [START video_speech_transcription]
    """Transcribe speech from a video stored on GCS."""
    # NOTE: the docstring above doubles as the help text of the
    # 'transcribe' sub-command, so its wording is user-visible.
    from google.cloud import videointelligence

    client = videointelligence.VideoIntelligenceServiceClient()

    # Ask for US-English transcription with automatic punctuation.
    speech_config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    context = videointelligence.types.VideoContext(
        speech_transcription_config=speech_config)

    # Only the speech transcription feature is requested for this video.
    operation = client.annotate_video(
        path,
        features=[videointelligence.enums.Feature.SPEECH_TRANSCRIPTION],
        video_context=context)

    print('\n Processing video for speech transcription.')

    # Wait for the annotation operation to finish (timeout=600).
    response = operation.result(timeout=600)

    # A single video was submitted, so only the first annotation result
    # is relevant.
    video_annotations = response.annotation_results[0]

    for transcription in video_annotations.speech_transcriptions:
        # Every alternative is one candidate transcription carrying its own
        # confidence score; how many are returned is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        for candidate in transcription.alternatives:
            print('Alternative level information:')

            print('Transcript: {}'.format(candidate.transcript))
            print('Confidence: {}\n '.format(candidate.confidence))

            print('Word level information:')
            for entry in candidate.words:
                spoken = entry.word
                begin = entry.start_time
                finish = entry.end_time
                # Offsets are split into seconds and nanos; merge them
                # into fractional seconds for display.
                begin_s = begin.seconds + begin.nanos * 1e-9
                finish_s = finish.seconds + finish.nanos * 1e-9
                print('\t {}s - {}s: {}'.format(
                    begin_s, finish_s, spoken))
    # [END video_speech_transcription]
279
+
280
+
232
281
if __name__ == '__main__' :
233
282
parser = argparse .ArgumentParser (
234
283
description = __doc__ ,
@@ -246,6 +295,9 @@ def analyze_shots(path):
246
295
analyze_shots_parser = subparsers .add_parser (
247
296
'shots' , help = analyze_shots .__doc__ )
248
297
analyze_shots_parser .add_argument ('path' )
298
+ transcribe_speech_parser = subparsers .add_parser (
299
+ 'transcribe' , help = speech_transcription .__doc__ )
300
+ transcribe_speech_parser .add_argument ('path' )
249
301
250
302
args = parser .parse_args ()
251
303
@@ -257,3 +309,5 @@ def analyze_shots(path):
257
309
analyze_shots (args .path )
258
310
if args .command == 'explicit_content' :
259
311
analyze_explicit_content (args .path )
312
+ if args .command == 'transcribe' :
313
+ speech_transcription (args .path )
0 commit comments