Skip to content

Commit dd4911e

Browse files
happyhuman authored and busunkim96 committed
Diarization Output Modified [(#1586)](GoogleCloudPlatform/python-docs-samples#1586)
* Printing the last paragraph only.
* Python 3 print.
* Removing sample rate setting.
* Adding the missing output parameter in the example.
* Changes based on the comments.
* Removed filenames as input parameters.
* Removed unused args.
* Updated README file.
* Updated the inline comment.
* Modified code to make it more readable.
* Simplified the response object processing.
* Fixing the long-line issue.
1 parent f2914f6 commit dd4911e

File tree

3 files changed

+60
-79
lines changed

3 files changed

+60
-79
lines changed

packages/google-cloud-speech/samples/snippets/README.rst

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -221,25 +221,22 @@ To run this sample:
221221
222222
$ python beta_snippets.py
223223
224-
usage: beta_snippets.py [-h] command path first second
224+
usage: beta_snippets.py [-h] command
225225
226226
Google Cloud Speech API sample that demonstrates enhanced models
227227
and recognition metadata.
228228
229229
Example usage:
230-
python beta_snippets.py enhanced-model resources/commercial_mono.wav
231-
python beta_snippets.py metadata resources/commercial_mono.wav
232-
python beta_snippets.py punctuation resources/commercial_mono.wav
233-
python beta_snippets.py diarization resources/commercial_mono.wav
234-
python beta_snippets.py multi-channel resources/commercial_mono.wav
235-
python beta_snippets.py multi-language resources/multi.wav en-US es
236-
python beta_snippets.py word-level-conf resources/commercial_mono.wav
230+
python beta_snippets.py enhanced-model
231+
python beta_snippets.py metadata
232+
python beta_snippets.py punctuation
233+
python beta_snippets.py diarization
234+
python beta_snippets.py multi-channel
235+
python beta_snippets.py multi-language
236+
python beta_snippets.py word-level-conf
237237
238238
positional arguments:
239239
command
240-
path File for audio file to be recognized
241-
first First language in audio file to be recognized
242-
second Second language in audio file to be recognized
243240
244241
optional arguments:
245242
-h, --help show this help message and exit

packages/google-cloud-speech/samples/snippets/beta_snippets.py

Lines changed: 44 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,26 @@
1818
and recognition metadata.
1919
2020
Example usage:
21-
python beta_snippets.py enhanced-model resources/commercial_mono.wav
22-
python beta_snippets.py metadata resources/commercial_mono.wav
23-
python beta_snippets.py punctuation resources/commercial_mono.wav
24-
python beta_snippets.py diarization resources/commercial_mono.wav
25-
python beta_snippets.py multi-channel resources/commercial_mono.wav
26-
python beta_snippets.py multi-language resources/multi.wav en-US es
27-
python beta_snippets.py word-level-conf resources/commercial_mono.wav
21+
python beta_snippets.py enhanced-model
22+
python beta_snippets.py metadata
23+
python beta_snippets.py punctuation
24+
python beta_snippets.py diarization
25+
python beta_snippets.py multi-channel
26+
python beta_snippets.py multi-language
27+
python beta_snippets.py word-level-conf
2828
"""
2929

3030
import argparse
3131
import io
3232

3333

34-
def transcribe_file_with_enhanced_model(speech_file):
34+
def transcribe_file_with_enhanced_model():
3535
"""Transcribe the given audio file using an enhanced model."""
3636
# [START speech_transcribe_file_with_enhanced_model]
3737
from google.cloud import speech_v1p1beta1 as speech
3838
client = speech.SpeechClient()
3939

40-
# TODO(developer): Uncomment and set to a path to your audio file.
41-
# speech_file = 'path/to/file.wav'
40+
speech_file = 'resources/commercial_mono.wav'
4241

4342
with io.open(speech_file, 'rb') as audio_file:
4443
content = audio_file.read()
@@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
6463
# [END speech_transcribe_file_with_enhanced_model]
6564

6665

67-
def transcribe_file_with_metadata(speech_file):
66+
def transcribe_file_with_metadata():
6867
"""Send a request that includes recognition metadata."""
6968
# [START speech_transcribe_file_with_metadata]
7069
from google.cloud import speech_v1p1beta1 as speech
7170
client = speech.SpeechClient()
7271

73-
# TODO(developer): Uncomment and set to a path to your audio file.
74-
# speech_file = 'path/to/file.wav'
72+
speech_file = 'resources/commercial_mono.wav'
7573

7674
with io.open(speech_file, 'rb') as audio_file:
7775
content = audio_file.read()
@@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
110108
# [END speech_transcribe_file_with_metadata]
111109

112110

113-
def transcribe_file_with_auto_punctuation(speech_file):
111+
def transcribe_file_with_auto_punctuation():
114112
"""Transcribe the given audio file with auto punctuation enabled."""
115113
# [START speech_transcribe_file_with_auto_punctuation]
116114
from google.cloud import speech_v1p1beta1 as speech
117115
client = speech.SpeechClient()
118116

119-
# TODO(developer): Uncomment and set to a path to your audio file.
120-
# speech_file = 'path/to/file.wav'
117+
speech_file = 'resources/commercial_mono.wav'
121118

122119
with io.open(speech_file, 'rb') as audio_file:
123120
content = audio_file.read()
@@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
140137
# [END speech_transcribe_file_with_auto_punctuation]
141138

142139

143-
def transcribe_file_with_diarization(speech_file):
140+
def transcribe_file_with_diarization():
144141
"""Transcribe the given audio file synchronously with diarization."""
145142
# [START speech_transcribe_diarization]
146143
from google.cloud import speech_v1p1beta1 as speech
147144
client = speech.SpeechClient()
148145

149-
# TODO(developer): Uncomment and set to a path to your audio file.
150-
# speech_file = 'path/to/file.wav'
146+
speech_file = 'resources/commercial_mono.wav'
151147

152148
with open(speech_file, 'rb') as audio_file:
153149
content = audio_file.read()
@@ -156,33 +152,37 @@ def transcribe_file_with_diarization(speech_file):
156152

157153
config = speech.types.RecognitionConfig(
158154
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
159-
sample_rate_hertz=16000,
155+
sample_rate_hertz=8000,
160156
language_code='en-US',
161157
enable_speaker_diarization=True,
162158
diarization_speaker_count=2)
163159

164160
print('Waiting for operation to complete...')
165161
response = client.recognize(config, audio)
166162

167-
for i, result in enumerate(response.results):
168-
alternative = result.alternatives[0]
169-
print('-' * 20)
170-
print('First alternative of result {}: {}'
171-
.format(i, alternative.transcript))
172-
print('Speaker Tag for the first word: {}'
173-
.format(alternative.words[0].speaker_tag))
163+
# The transcript within each result is separate and sequential per result.
164+
# However, the words list within an alternative includes all the words
165+
# from all the results thus far. Thus, to get all the words with speaker
166+
# tags, you only have to take the words list from the last result:
167+
result = response.results[-1]
168+
169+
words_info = result.alternatives[0].words
170+
171+
# Printing out the output:
172+
for word_info in words_info:
173+
print("word: '{}', speaker_tag: {}".format(word_info.word,
174+
word_info.speaker_tag))
174175
# [END speech_transcribe_diarization]
175176

176177

177-
def transcribe_file_with_multichannel(speech_file):
178+
def transcribe_file_with_multichannel():
178179
"""Transcribe the given audio file synchronously with
179180
multi channel."""
180181
# [START speech_transcribe_multichannel]
181182
from google.cloud import speech_v1p1beta1 as speech
182183
client = speech.SpeechClient()
183184

184-
# TODO(developer): Uncomment and set to a path to your audio file.
185-
# speech_file = 'path/to/file.wav'
185+
speech_file = 'resources/Google_Gnome.wav'
186186

187187
with open(speech_file, 'rb') as audio_file:
188188
content = audio_file.read()
@@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file):
207207
# [END speech_transcribe_multichannel]
208208

209209

210-
def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
210+
def transcribe_file_with_multilanguage():
211211
"""Transcribe the given audio file synchronously with
212212
multi language."""
213213
# [START speech_transcribe_multilanguage]
214214
from google.cloud import speech_v1p1beta1 as speech
215215
client = speech.SpeechClient()
216216

217-
# TODO(developer): Uncomment and set to a path to your audio file.
218-
# speech_file = 'path/to/file.wav'
219-
# first_lang = first language code, e,g, 'en-US'
220-
# second_lang = first language code, e,g, 'es'
217+
speech_file = 'resources/multi.wav'
218+
first_lang = 'en-US'
219+
second_lang = 'es'
221220

222221
with open(speech_file, 'rb') as audio_file:
223222
content = audio_file.read()
@@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
226225

227226
config = speech.types.RecognitionConfig(
228227
encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
228+
sample_rate_hertz=44100,
229229
audio_channel_count=2,
230230
language_code=first_lang,
231231
alternative_language_codes=[second_lang])
@@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
241241
# [END speech_transcribe_multilanguage]
242242

243243

244-
def transcribe_file_with_word_level_confidence(speech_file):
244+
def transcribe_file_with_word_level_confidence():
245245
"""Transcribe the given audio file synchronously with
246246
word level confidence."""
247247
# [START speech_transcribe_word_level_confidence]
248248
from google.cloud import speech_v1p1beta1 as speech
249249
client = speech.SpeechClient()
250250

251-
# TODO(developer): Uncomment and set to a path to your audio file.
252-
# speech_file = 'path/to/file.wav'
251+
speech_file = 'resources/Google_Gnome.wav'
253252

254253
with open(speech_file, 'rb') as audio_file:
255254
content = audio_file.read()
@@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
279278
description=__doc__,
280279
formatter_class=argparse.RawDescriptionHelpFormatter)
281280
parser.add_argument('command')
282-
parser.add_argument(
283-
'path', help='File for audio file to be recognized')
284-
parser.add_argument(
285-
'first', help='First language in audio file to be recognized',
286-
nargs='?')
287-
parser.add_argument(
288-
'second', help='Second language in audio file to be recognized',
289-
nargs='?')
290281

291282
args = parser.parse_args()
292283

293284
if args.command == 'enhanced-model':
294-
transcribe_file_with_enhanced_model(args.path)
285+
transcribe_file_with_enhanced_model()
295286
elif args.command == 'metadata':
296-
transcribe_file_with_metadata(args.path)
287+
transcribe_file_with_metadata()
297288
elif args.command == 'punctuation':
298-
transcribe_file_with_auto_punctuation(args.path)
289+
transcribe_file_with_auto_punctuation()
299290
elif args.command == 'diarization':
300-
transcribe_file_with_diarization(args.path)
291+
transcribe_file_with_diarization()
301292
elif args.command == 'multi-channel':
302-
transcribe_file_with_multichannel(args.path)
293+
transcribe_file_with_multichannel()
303294
elif args.command == 'multi-language':
304-
transcribe_file_with_multilanguage(args.path, args.first, args.second)
295+
transcribe_file_with_multilanguage()
305296
elif args.command == 'word-level-conf':
306-
transcribe_file_with_word_level_confidence(args.path)
297+
transcribe_file_with_word_level_confidence()

packages/google-cloud-speech/samples/snippets/beta_snippets_test.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,56 +26,49 @@
2626

2727

2828
def test_transcribe_file_with_enhanced_model(capsys):
29-
transcribe_file_with_enhanced_model(
30-
os.path.join(RESOURCES, 'commercial_mono.wav'))
29+
transcribe_file_with_enhanced_model()
3130
out, _ = capsys.readouterr()
3231

3332
assert 'Chrome' in out
3433

3534

3635
def test_transcribe_file_with_metadata(capsys):
37-
transcribe_file_with_metadata(
38-
os.path.join(RESOURCES, 'commercial_mono.wav'))
36+
transcribe_file_with_metadata()
3937
out, _ = capsys.readouterr()
4038

4139
assert 'Chrome' in out
4240

4341

4442
def test_transcribe_file_with_auto_punctuation(capsys):
45-
transcribe_file_with_auto_punctuation(
46-
os.path.join(RESOURCES, 'commercial_mono.wav'))
43+
transcribe_file_with_auto_punctuation()
4744
out, _ = capsys.readouterr()
4845

4946
assert 'Okay. Sure.' in out
5047

5148

5249
def test_transcribe_diarization(capsys):
53-
transcribe_file_with_diarization(
54-
os.path.join(RESOURCES, 'Google_Gnome.wav'))
50+
transcribe_file_with_diarization()
5551
out, err = capsys.readouterr()
5652

57-
assert 'OK Google stream stranger things from Netflix to my TV' in out
53+
assert "word: 'here', speaker_tag: 1" in out
5854

5955

6056
def test_transcribe_multichannel_file(capsys):
61-
transcribe_file_with_multichannel(
62-
os.path.join(RESOURCES, 'Google_Gnome.wav'))
57+
transcribe_file_with_multichannel()
6358
out, err = capsys.readouterr()
6459

6560
assert 'OK Google stream stranger things from Netflix to my TV' in out
6661

6762

6863
def test_transcribe_multilanguage_file(capsys):
69-
transcribe_file_with_multilanguage(
70-
os.path.join(RESOURCES, 'multi.wav'), 'en-US', 'es')
64+
transcribe_file_with_multilanguage()
7165
out, err = capsys.readouterr()
7266

7367
assert 'how are you doing estoy bien e tu' in out
7468

7569

7670
def test_transcribe_word_level_confidence(capsys):
77-
transcribe_file_with_word_level_confidence(
78-
os.path.join(RESOURCES, 'Google_Gnome.wav'))
71+
transcribe_file_with_word_level_confidence()
7972
out, err = capsys.readouterr()
8073

8174
assert 'OK Google stream stranger things from Netflix to my TV' in out

0 commit comments

Comments (0)