Skip to content

Commit 7bb425d

Browse files
authored
Speech continuous (#1940)
* Draft 1: indefinitely-long streaming transcription
* Clean up & refactor of indefinite speech transcription
* Make sure chunks_per_second is a whole number.
* Update for google-cloud-python client lib.
* Update sample to not error out, but make a new request every ~60ish seconds
* Update transcribe_streaming_mic.py
* Clean up unnecessary code, since we no longer wait for it to error out
* Update based on feedback
1 parent fd7944a commit 7bb425d

File tree

2 files changed

+225
-0
lines changed

2 files changed

+225
-0
lines changed

speech/cloud-client/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
google-cloud-speech==0.36.0
2+
pyaudio==0.2.11
3+
six==1.12.0
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2018 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Google Cloud Speech API sample application using the streaming API.
18+
19+
NOTE: This module requires the additional dependency `pyaudio`. To install
20+
using pip:
21+
22+
pip install pyaudio
23+
24+
Example usage:
25+
python transcribe_streaming_indefinite.py
26+
"""
27+
28+
# [START speech_transcribe_infinite_streaming]
29+
from __future__ import division
30+
31+
import time
32+
import re
33+
import sys
34+
35+
from google.cloud import speech
36+
37+
import pyaudio
38+
from six.moves import queue
39+
40+
# Audio recording parameters
# Age, in milliseconds, after which the audio generator ends the current
# request so that a new one can be started.
STREAMING_LIMIT = 55000
# Samples per second, used for both the microphone and the recognizer.
SAMPLE_RATE = 16000
# Frames handed over per audio callback: a tenth of a second of audio.
CHUNK_SIZE = SAMPLE_RATE // 10  # 100ms
44+
45+
46+
def get_current_time():
    """Return the current wall-clock time as a whole number of milliseconds."""
    now_ms = time.time() * 1000
    return int(round(now_ms))
48+
49+
50+
def duration_to_secs(duration):
    """Return *duration* (``seconds`` + ``nanos`` attributes) as float seconds."""
    nanos_per_second = 1e9
    fractional_secs = duration.nanos / nanos_per_second
    return duration.seconds + fractional_secs
52+
53+
54+
class ResumableMicrophoneStream(object):
    """Opens a recording stream as a generator yielding the audio chunks.

    Use it as a context manager: entering opens the microphone through
    PyAudio, leaving stops it. While open, PyAudio delivers audio to
    ``_fill_buffer``, which queues it; ``generator()`` drains that queue.

    ``generator()`` ends on its own once more than ``STREAMING_LIMIT``
    milliseconds have passed since ``start_time``, so the caller can start a
    fresh request and call ``generator()`` again on the same open stream.

    Args:
        rate: Sample rate passed to PyAudio when opening the input stream.
        chunk_size: Frames per PyAudio buffer, i.e. per queued chunk.

    Attributes:
        closed: True while the microphone is not open. Setting it to True
            from outside makes ``generator()`` stop at its next loop check.
        start_time: Millisecond timestamp from which the current streaming
            window is measured.
    """

    def __init__(self, rate, chunk_size):
        self._rate = rate
        self._chunk_size = chunk_size
        self._num_channels = 1
        self._max_replay_secs = 5

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True
        self.start_time = get_current_time()

        # 2 bytes in 16 bit samples
        self._bytes_per_sample = 2 * self._num_channels
        self._bytes_per_second = self._rate * self._bytes_per_sample

        self._bytes_per_chunk = (self._chunk_size * self._bytes_per_sample)
        self._chunks_per_second = (
            self._bytes_per_second // self._bytes_per_chunk)

    def __enter__(self):
        """Open the microphone and start buffering audio; return self."""
        self._audio_interface = pyaudio.PyAudio()
        try:
            self._audio_stream = self._audio_interface.open(
                format=pyaudio.paInt16,
                channels=self._num_channels,
                rate=self._rate,
                input=True,
                frames_per_buffer=self._chunk_size,
                # Run the audio stream asynchronously to fill the buffer
                # object. This is necessary so that the input device's
                # buffer doesn't overflow while the calling thread makes
                # network requests, etc.
                stream_callback=self._fill_buffer,
            )
        except Exception:
            # Don't leak the PortAudio handle when the device can't be opened.
            self._audio_interface.terminate()
            raise

        # Only report the stream as open once the device really is, and
        # measure the first streaming window from the moment recording
        # starts rather than from object construction.
        self.closed = False
        self.start_time = get_current_time()

        return self

    def __exit__(self, type, value, traceback):
        """Stop recording and release the audio device."""
        self.closed = True
        try:
            self._audio_stream.stop_stream()
            self._audio_stream.close()
        finally:
            # Signal the generator to terminate so that the client's
            # streaming_recognize method will not block the process
            # termination. Done in ``finally`` so that a failing
            # stop/close can neither leave a consumer blocked on the queue
            # nor leak the audio interface.
            self._buff.put(None)
            self._audio_interface.terminate()

    def _fill_buffer(self, in_data, *args, **kwargs):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Yield buffered audio as byte strings until the stream should end.

        Each yielded value is every chunk currently queued, concatenated.
        Iteration stops when ``closed`` is set, when a ``None`` sentinel is
        read from the queue, or when the streaming window has been open for
        more than ``STREAMING_LIMIT`` milliseconds (in which case
        ``start_time`` is reset for the next call).
        """
        while not self.closed:
            if get_current_time() - self.start_time > STREAMING_LIMIT:
                # Restart the clock so the next generator() call gets a
                # full window.
                self.start_time = get_current_time()
                break
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                except queue.Empty:
                    break
                if chunk is None:
                    # End of stream: chunks gathered in this pass are not
                    # yielded.
                    return
                data.append(chunk)

            yield b''.join(data)
131+
132+
133+
def listen_print_loop(responses, stream):
    """Iterates through server responses and prints them.

    The responses passed is a generator that will block until a response
    is provided by the server.

    Each response may contain multiple results, and each result may contain
    multiple alternatives; for details, see https://goo.gl/tjCPAU. Here we
    print only the transcription for the top alternative of the top result.
    Responses without results, or whose first result has no alternatives,
    are skipped.

    In this case, responses are provided for interim results as well. If the
    response is an interim one, print a carriage return at the end of it, to
    allow the next result to overwrite it, until the response is a final one.
    For the final one, print a newline to preserve the finalized
    transcription.

    Args:
        responses: Iterable of response objects exposing ``results``.
        stream: Object whose ``closed`` attribute is set to True when a
            final transcript contains the word "exit" or "quit".
    """
    # Compiled once; matches either keyword as a whole word, any case.
    exit_pattern = re.compile(r'\b(exit|quit)\b', re.I)

    num_chars_printed = 0
    for response in responses:
        if not (response.results and response.results[0].alternatives):
            continue

        # The `results` list is consecutive. For streaming, we only care about
        # the first result being considered, since once it's `is_final`, it
        # moves on to considering the next utterance.
        result = response.results[0]

        # Display the transcription of the top alternative.
        transcript = result.alternatives[0].transcript

        # Display interim results, but with a carriage return at the end of the
        # line, so subsequent lines will overwrite them.
        #
        # If the previous result was longer than this one, we need to print
        # some extra spaces to overwrite the previous result
        overwrite_chars = ' ' * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()

            num_chars_printed = len(transcript)
            continue

        print(transcript + overwrite_chars)

        # Exit recognition if any of the transcribed phrases could be
        # one of our keywords.
        if exit_pattern.search(transcript):
            print('Exiting..')
            stream.closed = True
            break

        num_chars_printed = 0
190+
191+
192+
def main():
    """Transcribe microphone audio continuously and print the results.

    Keeps the microphone open for the whole run. Each pass of the loop feeds
    a fresh audio generator into a new ``streaming_recognize`` call and
    prints its responses; the loop ends once the stream is marked closed.
    """
    speech_client = speech.SpeechClient()

    recognition_config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code='en-US',
        max_alternatives=1,
        enable_word_time_offsets=True)
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    microphone = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)

    print('Say "Quit" or "Exit" to terminate the program.')

    with microphone as stream:
        while not stream.closed:
            # One request message per block of buffered audio.
            request_iter = (
                speech.types.StreamingRecognizeRequest(audio_content=audio)
                for audio in stream.generator())

            response_iter = speech_client.streaming_recognize(
                streaming_config, request_iter)

            # Now, put the transcription responses to use.
            listen_print_loop(response_iter, stream)
219+
220+
221+
# Run the sample only when this file is executed as a script.
if __name__ == '__main__':
    main()
223+
# [END speech_transcribe_infinite_streaming]

0 commit comments

Comments
 (0)