Skip to content

feat: video speech transcription #1849

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions video/cloud-client/analyze/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,55 @@ def analyze_shots(path):
# [END video_analyze_shots]


# The one-line docstring below is also used as the help text of the
# `transcribe` sub-command, so its wording is kept unchanged.
def speech_transcription(path):
    # [START video_speech_transcription]
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence

    def _as_seconds(offset):
        # Fold the whole-second and nanosecond parts into one float.
        return offset.seconds + offset.nanos * 1e-9

    client = videointelligence.VideoIntelligenceServiceClient()
    requested_features = [
        videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    # US English, with punctuation inserted into the transcript.
    speech_config = videointelligence.types.SpeechTranscriptionConfig(
        language_code='en-US',
        enable_automatic_punctuation=True)
    context = videointelligence.types.VideoContext(
        speech_transcription_config=speech_config)

    operation = client.annotate_video(
        path,
        features=requested_features,
        video_context=context)

    print('\nProcessing video for speech transcription.')

    # Wait for the annotation operation to complete (timeout=600).
    response = operation.result(timeout=600)

    # A single video was submitted, so only the first annotation result
    # is of interest.
    first_result = response.annotation_results[0]
    for transcription in first_result.speech_transcriptions:

        # Each alternative is one candidate transcription carrying its own
        # confidence score; how many are returned is capped by
        # SpeechTranscriptionConfig.max_alternatives.
        for candidate in transcription.alternatives:
            print('Alternative level information:')

            print('Transcript: {}'.format(candidate.transcript))
            print('Confidence: {}\n'.format(candidate.confidence))

            print('Word level information:')
            for entry in candidate.words:
                spoken_word = entry.word
                began_at = _as_seconds(entry.start_time)
                ended_at = _as_seconds(entry.end_time)
                print('\t{}s - {}s: {}'.format(
                    began_at, ended_at, spoken_word))
    # [END video_speech_transcription]


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
Expand All @@ -246,6 +295,9 @@ def analyze_shots(path):
analyze_shots_parser = subparsers.add_parser(
'shots', help=analyze_shots.__doc__)
analyze_shots_parser.add_argument('path')
transcribe_speech_parser = subparsers.add_parser(
'transcribe', help=speech_transcription.__doc__)
transcribe_speech_parser.add_argument('path')

args = parser.parse_args()

Expand All @@ -257,3 +309,5 @@ def analyze_shots(path):
analyze_shots(args.path)
if args.command == 'explicit_content':
analyze_explicit_content(args.path)
if args.command == 'transcribe':
speech_transcription(args.path)
8 changes: 8 additions & 0 deletions video/cloud-client/analyze/analyze_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,11 @@ def test_analyze_explicit_content(capsys):
analyze.analyze_explicit_content('gs://demomaker/cat.mp4')
out, _ = capsys.readouterr()
assert 'pornography' in out


@pytest.mark.slow
def test_speech_transcription(capsys):
    # The sample prints the transcript to stdout; 'cultural' is a word the
    # test expects to find in the output for this clip.
    video_uri = 'gs://python-docs-samples-tests/video/googlework_short.mp4'
    analyze.speech_transcription(video_uri)
    captured_stdout, _ = capsys.readouterr()
    assert 'cultural' in captured_stdout