18
18
and recognition metadata.
19
19
20
20
Example usage:
21
- python beta_snippets.py enhanced-model resources/commercial_mono.wav
22
- python beta_snippets.py metadata resources/commercial_mono.wav
23
- python beta_snippets.py punctuation resources/commercial_mono.wav
24
- python beta_snippets.py diarization resources/commercial_mono.wav
25
- python beta_snippets.py multi-channel resources/commercial_mono.wav
26
- python beta_snippets.py multi-language resources/multi.wav en-US es
27
- python beta_snippets.py word-level-conf resources/commercial_mono.wav
21
+ python beta_snippets.py enhanced-model
22
+ python beta_snippets.py metadata
23
+ python beta_snippets.py punctuation
24
+ python beta_snippets.py diarization
25
+ python beta_snippets.py multi-channel
26
+ python beta_snippets.py multi-language
27
+ python beta_snippets.py word-level-conf
28
28
"""
29
29
30
30
import argparse
31
31
import io
32
32
33
33
34
- def transcribe_file_with_enhanced_model (speech_file ):
34
+ def transcribe_file_with_enhanced_model ():
35
35
"""Transcribe the given audio file using an enhanced model."""
36
36
# [START speech_transcribe_file_with_enhanced_model]
37
37
from google .cloud import speech_v1p1beta1 as speech
38
38
client = speech .SpeechClient ()
39
39
40
- # TODO(developer): Uncomment and set to a path to your audio file.
41
- # speech_file = 'path/to/file.wav'
40
+ speech_file = 'resources/commercial_mono.wav'
42
41
43
42
with io .open (speech_file , 'rb' ) as audio_file :
44
43
content = audio_file .read ()
@@ -64,14 +63,13 @@ def transcribe_file_with_enhanced_model(speech_file):
64
63
# [END speech_transcribe_file_with_enhanced_model]
65
64
66
65
67
- def transcribe_file_with_metadata (speech_file ):
66
+ def transcribe_file_with_metadata ():
68
67
"""Send a request that includes recognition metadata."""
69
68
# [START speech_transcribe_file_with_metadata]
70
69
from google .cloud import speech_v1p1beta1 as speech
71
70
client = speech .SpeechClient ()
72
71
73
- # TODO(developer): Uncomment and set to a path to your audio file.
74
- # speech_file = 'path/to/file.wav'
72
+ speech_file = 'resources/commercial_mono.wav'
75
73
76
74
with io .open (speech_file , 'rb' ) as audio_file :
77
75
content = audio_file .read ()
@@ -110,14 +108,13 @@ def transcribe_file_with_metadata(speech_file):
110
108
# [END speech_transcribe_file_with_metadata]
111
109
112
110
113
- def transcribe_file_with_auto_punctuation (speech_file ):
111
+ def transcribe_file_with_auto_punctuation ():
114
112
"""Transcribe the given audio file with auto punctuation enabled."""
115
113
# [START speech_transcribe_file_with_auto_punctuation]
116
114
from google .cloud import speech_v1p1beta1 as speech
117
115
client = speech .SpeechClient ()
118
116
119
- # TODO(developer): Uncomment and set to a path to your audio file.
120
- # speech_file = 'path/to/file.wav'
117
+ speech_file = 'resources/commercial_mono.wav'
121
118
122
119
with io .open (speech_file , 'rb' ) as audio_file :
123
120
content = audio_file .read ()
@@ -140,14 +137,13 @@ def transcribe_file_with_auto_punctuation(speech_file):
140
137
# [END speech_transcribe_file_with_auto_punctuation]
141
138
142
139
143
- def transcribe_file_with_diarization (speech_file ):
140
+ def transcribe_file_with_diarization ():
144
141
"""Transcribe the given audio file synchronously with diarization."""
145
142
# [START speech_transcribe_diarization]
146
143
from google .cloud import speech_v1p1beta1 as speech
147
144
client = speech .SpeechClient ()
148
145
149
- # TODO(developer): Uncomment and set to a path to your audio file.
150
- # speech_file = 'path/to/file.wav'
146
+ speech_file = 'resources/commercial_mono.wav'
151
147
152
148
with open (speech_file , 'rb' ) as audio_file :
153
149
content = audio_file .read ()
@@ -156,33 +152,37 @@ def transcribe_file_with_diarization(speech_file):
156
152
157
153
config = speech .types .RecognitionConfig (
158
154
encoding = speech .enums .RecognitionConfig .AudioEncoding .LINEAR16 ,
159
- sample_rate_hertz = 16000 ,
155
+ sample_rate_hertz = 8000 ,
160
156
language_code = 'en-US' ,
161
157
enable_speaker_diarization = True ,
162
158
diarization_speaker_count = 2 )
163
159
164
160
print ('Waiting for operation to complete...' )
165
161
response = client .recognize (config , audio )
166
162
167
- for i , result in enumerate (response .results ):
168
- alternative = result .alternatives [0 ]
169
- print ('-' * 20 )
170
- print ('First alternative of result {}: {}'
171
- .format (i , alternative .transcript ))
172
- print ('Speaker Tag for the first word: {}'
173
- .format (alternative .words [0 ].speaker_tag ))
163
+ # The transcript within each result is separate and sequential per result.
164
+ # However, the words list within an alternative includes all the words
165
+ # from all the results thus far. Thus, to get all the words with speaker
166
+ # tags, you only have to take the words list from the last result:
167
+ result = response .results [- 1 ]
168
+
169
+ words_info = result .alternatives [0 ].words
170
+
171
+ # Printing out the output:
172
+ for word_info in words_info :
173
+ print ("word: '{}', speaker_tag: {}" .format (word_info .word ,
174
+ word_info .speaker_tag ))
174
175
# [END speech_transcribe_diarization]
175
176
176
177
177
- def transcribe_file_with_multichannel (speech_file ):
178
+ def transcribe_file_with_multichannel ():
178
179
"""Transcribe the given audio file synchronously with
179
180
multi channel."""
180
181
# [START speech_transcribe_multichannel]
181
182
from google .cloud import speech_v1p1beta1 as speech
182
183
client = speech .SpeechClient ()
183
184
184
- # TODO(developer): Uncomment and set to a path to your audio file.
185
- # speech_file = 'path/to/file.wav'
185
+ speech_file = 'resources/Google_Gnome.wav'
186
186
187
187
with open (speech_file , 'rb' ) as audio_file :
188
188
content = audio_file .read ()
@@ -207,17 +207,16 @@ def transcribe_file_with_multichannel(speech_file):
207
207
# [END speech_transcribe_multichannel]
208
208
209
209
210
- def transcribe_file_with_multilanguage (speech_file , first_lang , second_lang ):
210
+ def transcribe_file_with_multilanguage ():
211
211
"""Transcribe the given audio file synchronously with
212
212
multi language."""
213
213
# [START speech_transcribe_multilanguage]
214
214
from google .cloud import speech_v1p1beta1 as speech
215
215
client = speech .SpeechClient ()
216
216
217
- # TODO(developer): Uncomment and set to a path to your audio file.
218
- # speech_file = 'path/to/file.wav'
219
- # first_lang = first language code, e,g, 'en-US'
220
- # second_lang = first language code, e,g, 'es'
217
+ speech_file = 'resources/multi.wav'
218
+ first_lang = 'en-US'
219
+ second_lang = 'es'
221
220
222
221
with open (speech_file , 'rb' ) as audio_file :
223
222
content = audio_file .read ()
@@ -226,6 +225,7 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
226
225
227
226
config = speech .types .RecognitionConfig (
228
227
encoding = speech .enums .RecognitionConfig .AudioEncoding .LINEAR16 ,
228
+ sample_rate_hertz = 44100 ,
229
229
audio_channel_count = 2 ,
230
230
language_code = first_lang ,
231
231
alternative_language_codes = [second_lang ])
@@ -241,15 +241,14 @@ def transcribe_file_with_multilanguage(speech_file, first_lang, second_lang):
241
241
# [END speech_transcribe_multilanguage]
242
242
243
243
244
- def transcribe_file_with_word_level_confidence (speech_file ):
244
+ def transcribe_file_with_word_level_confidence ():
245
245
"""Transcribe the given audio file synchronously with
246
246
word level confidence."""
247
247
# [START speech_transcribe_word_level_confidence]
248
248
from google .cloud import speech_v1p1beta1 as speech
249
249
client = speech .SpeechClient ()
250
250
251
- # TODO(developer): Uncomment and set to a path to your audio file.
252
- # speech_file = 'path/to/file.wav'
251
+ speech_file = 'resources/Google_Gnome.wav'
253
252
254
253
with open (speech_file , 'rb' ) as audio_file :
255
254
content = audio_file .read ()
@@ -279,28 +278,20 @@ def transcribe_file_with_word_level_confidence(speech_file):
279
278
description = __doc__ ,
280
279
formatter_class = argparse .RawDescriptionHelpFormatter )
281
280
parser .add_argument ('command' )
282
- parser .add_argument (
283
- 'path' , help = 'File for audio file to be recognized' )
284
- parser .add_argument (
285
- 'first' , help = 'First language in audio file to be recognized' ,
286
- nargs = '?' )
287
- parser .add_argument (
288
- 'second' , help = 'Second language in audio file to be recognized' ,
289
- nargs = '?' )
290
281
291
282
args = parser .parse_args ()
292
283
293
284
if args .command == 'enhanced-model' :
294
- transcribe_file_with_enhanced_model (args . path )
285
+ transcribe_file_with_enhanced_model ()
295
286
elif args .command == 'metadata' :
296
- transcribe_file_with_metadata (args . path )
287
+ transcribe_file_with_metadata ()
297
288
elif args .command == 'punctuation' :
298
- transcribe_file_with_auto_punctuation (args . path )
289
+ transcribe_file_with_auto_punctuation ()
299
290
elif args .command == 'diarization' :
300
- transcribe_file_with_diarization (args . path )
291
+ transcribe_file_with_diarization ()
301
292
elif args .command == 'multi-channel' :
302
- transcribe_file_with_multichannel (args . path )
293
+ transcribe_file_with_multichannel ()
303
294
elif args .command == 'multi-language' :
304
- transcribe_file_with_multilanguage (args . path , args . first , args . second )
295
+ transcribe_file_with_multilanguage ()
305
296
elif args .command == 'word-level-conf' :
306
- transcribe_file_with_word_level_confidence (args . path )
297
+ transcribe_file_with_word_level_confidence ()
0 commit comments