Skip to content

Commit f7e916d

Browse files
anguillanneuf danoscarmike
authored and committed
feat: video speech transcription [(#1849)](#1849)
1 parent 3cc1b0c commit f7e916d

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

videointelligence/samples/analyze/analyze.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,55 @@ def analyze_shots(path):
229229
# [END video_analyze_shots]
230230

231231

232+
def speech_transcription(path):
    # [START video_speech_transcription]
    """Transcribe speech from a video stored on GCS."""
    # NOTE: the one-line docstring above is also passed to argparse as the
    # help text of the 'transcribe' subcommand, so keep it short.
    from google.cloud import videointelligence

    client = videointelligence.VideoIntelligenceServiceClient()

    # Ask for US-English transcription with automatic punctuation enabled.
    speech_config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    context = videointelligence.types.VideoContext(
        speech_transcription_config=speech_config)

    operation = client.annotate_video(
        path,
        features=[videointelligence.enums.Feature.SPEECH_TRANSCRIPTION],
        video_context=context)

    print('\nProcessing video for speech transcription.')

    # Wait for the operation's result, passing a timeout of 600.
    response = operation.result(timeout=600)

    # A single video was submitted, so only the first annotation result
    # is read.
    first_result = response.annotation_results[0]

    for transcription in first_result.speech_transcriptions:
        # SpeechTranscriptionConfig.max_alternatives caps how many
        # alternatives a transcription carries. Every alternative is one
        # candidate transcript with a confidence score of its own.
        for alternative in transcription.alternatives:
            print('Alternative level information:')

            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}\n'.format(alternative.confidence))

            print('Word level information:')
            for entry in alternative.words:
                begin = entry.start_time
                finish = entry.end_time
                # Fold each seconds/nanos pair into fractional seconds.
                begin_seconds = begin.seconds + begin.nanos * 1e-9
                finish_seconds = finish.seconds + finish.nanos * 1e-9
                print('\t{}s - {}s: {}'.format(
                    begin_seconds, finish_seconds, entry.word))
    # [END video_speech_transcription]
279+
280+
232281
if __name__ == '__main__':
233282
parser = argparse.ArgumentParser(
234283
description=__doc__,
@@ -246,6 +295,9 @@ def analyze_shots(path):
246295
analyze_shots_parser = subparsers.add_parser(
247296
'shots', help=analyze_shots.__doc__)
248297
analyze_shots_parser.add_argument('path')
298+
transcribe_speech_parser = subparsers.add_parser(
299+
'transcribe', help=speech_transcription.__doc__)
300+
transcribe_speech_parser.add_argument('path')
249301

250302
args = parser.parse_args()
251303

@@ -257,3 +309,5 @@ def analyze_shots(path):
257309
analyze_shots(args.path)
258310
if args.command == 'explicit_content':
259311
analyze_explicit_content(args.path)
312+
if args.command == 'transcribe':
313+
speech_transcription(args.path)

videointelligence/samples/analyze/analyze_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,11 @@ def test_analyze_explicit_content(capsys):
3838
analyze.analyze_explicit_content('gs://demomaker/cat.mp4')
3939
out, _ = capsys.readouterr()
4040
assert 'pornography' in out
41+
42+
43+
@pytest.mark.slow
def test_speech_transcription(capsys):
    # Run the transcription sample on a short clip and check that the
    # word 'cultural' appears in what it printed to stdout.
    video_uri = 'gs://python-docs-samples-tests/video/googlework_short.mp4'
    analyze.speech_transcription(video_uri)
    stdout = capsys.readouterr()[0]
    assert 'cultural' in stdout

0 commit comments

Comments
 (0)