feat: video speech transcription [(#1849)](#1849)

anguillanneuf · danoscarmike · commit f7e916d65879 · 2020-09-30T13:21:13.000-07:00
diff --git a/videointelligence/samples/analyze/analyze.py b/videointelligence/samples/analyze/analyze.py
@@ -229,6 +229,55 @@ def analyze_shots(path):
     # [END video_analyze_shots]
 
 
+def speech_transcription(path):
+    # [START video_speech_transcription]
+    """Transcribe speech from a video stored on GCS."""
+    from google.cloud import videointelligence
+
+    video_client = videointelligence.VideoIntelligenceServiceClient()
+    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]
+    # Request US-English transcription with automatic punctuation enabled.
+    config = videointelligence.types.SpeechTranscriptionConfig(
+        language_code='en-US',
+        enable_automatic_punctuation=True)
+    video_context = videointelligence.types.VideoContext(
+        speech_transcription_config=config)
+    # Start the annotation request for the video located at `path`.
+    operation = video_client.annotate_video(
+        path, features=features,
+        video_context=video_context)
+
+    print('\nProcessing video for speech transcription.')
+    # Wait for the operation to finish; result() is called with timeout=600.
+    result = operation.result(timeout=600)
+
+    # A single video was submitted, so only the first entry of
+    # annotation_results is read.
+    annotation_results = result.annotation_results[0]
+    for speech_transcription in annotation_results.speech_transcriptions:
+        # NOTE: this loop variable shadows the enclosing function's name.
+        # The number of alternatives for each transcription is limited by
+        # SpeechTranscriptionConfig.max_alternatives (not set in the config
+        # above). Each alternative is a different possible transcription
+        # and has its own confidence score.
+        for alternative in speech_transcription.alternatives:
+            print('Alternative level information:')
+
+            print('Transcript: {}'.format(alternative.transcript))
+            print('Confidence: {}\n'.format(alternative.confidence))
+            # Offsets below are printed as seconds + nanos * 1e-9.
+            print('Word level information:')
+            for word_info in alternative.words:
+                word = word_info.word
+                start_time = word_info.start_time
+                end_time = word_info.end_time
+                print('\t{}s - {}s: {}'.format(
+                    start_time.seconds + start_time.nanos * 1e-9,
+                    end_time.seconds + end_time.nanos * 1e-9,
+                    word))
+    # [END video_speech_transcription]
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description=__doc__,
@@ -246,6 +295,9 @@ def analyze_shots(path):
     analyze_shots_parser = subparsers.add_parser(
         'shots', help=analyze_shots.__doc__)
     analyze_shots_parser.add_argument('path')
+    transcribe_speech_parser = subparsers.add_parser(
+        'transcribe', help=speech_transcription.__doc__)
+    transcribe_speech_parser.add_argument('path')
 
     args = parser.parse_args()
 
@@ -257,3 +309,5 @@ def analyze_shots(path):
         analyze_shots(args.path)
     if args.command == 'explicit_content':
         analyze_explicit_content(args.path)
+    if args.command == 'transcribe':
+        speech_transcription(args.path)
diff --git a/videointelligence/samples/analyze/analyze_test.py b/videointelligence/samples/analyze/analyze_test.py
@@ -38,3 +38,11 @@ def test_analyze_explicit_content(capsys):
     analyze.analyze_explicit_content('gs://demomaker/cat.mp4')
     out, _ = capsys.readouterr()
     assert 'pornography' in out
+
+
+@pytest.mark.slow
+def test_speech_transcription(capsys):
+    analyze.speech_transcription(  # called directly, nothing is mocked
+        'gs://python-docs-samples-tests/video/googlework_short.mp4')
+    out, _ = capsys.readouterr()  # stdout captured via the capsys fixture
+    assert 'cultural' in out  # the printed output must contain this word