|
24 | 24 | from google.cloud.speech.connection import Connection
|
25 | 25 | from google.cloud.speech.encoding import Encoding
|
26 | 26 | from google.cloud.speech.operation import Operation
|
27 |
| -from google.cloud.speech.streaming.request import _make_request_stream |
28 | 27 | from google.cloud.speech.sample import Sample
|
29 |
| -from google.cloud.speech.streaming.response import StreamingSpeechResponse |
| 28 | +from google.cloud.speech.streaming_response import StreamingSpeechResponse |
30 | 29 |
|
31 | 30 | try:
|
32 | 31 | from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi
|
| 32 | + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( |
| 33 | + RecognitionConfig) |
| 34 | + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( |
| 35 | + StreamingRecognitionConfig) |
| 36 | + from google.cloud.grpc.speech.v1beta1.cloud_speech_pb2 import ( |
| 37 | + StreamingRecognizeRequest) |
33 | 38 | except ImportError: # pragma: NO COVER
|
34 | 39 | _HAVE_GAX = False
|
35 | 40 | else:
|
@@ -284,16 +289,9 @@ def stream_recognize(self, sample, language_code=None,
|
284 | 289 | with the is_final=false flag). If false or
|
285 | 290 | omitted, only is_final=true result(s) are
|
286 | 291 | returned.
|
287 |
| -
|
288 |
| - :rtype: :class:`~streaming.StreamingResponseContainer` |
289 |
| - :returns: An instance of ``StreamingReponseContainer``. |
290 |
| -
|
291 | 292 | """
|
292 | 293 | if not _USE_GAX:
|
293 |
| - raise EnvironmentError('GRPC is required to use this API.') |
294 |
| - |
295 |
| - if sample.stream.closed: |
296 |
| - raise ValueError('Stream is closed.') |
| 294 | + raise EnvironmentError('gRPC is required to use this API.') |
297 | 295 |
|
298 | 296 | requests = _make_request_stream(sample, language_code=language_code,
|
299 | 297 | max_alternatives=max_alternatives,
|
@@ -379,3 +377,160 @@ def _build_request_data(sample, language_code=None, max_alternatives=None,
|
379 | 377 | }
|
380 | 378 |
|
381 | 379 | return data
|
| 380 | + |
| 381 | + |
def _make_request_stream(sample, language_code=None, max_alternatives=None,
                         profanity_filter=None, speech_context=None,
                         single_utterance=None, interim_results=None):
    """Yield a stream of ``StreamingRecognizeRequest`` messages for a sample.

    The first request carries only the streaming configuration (the API
    requires that it contain no audio); each subsequent request carries one
    chunk of audio read from ``sample.stream``.

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          a BCP-47 language tag, e.g. ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned; the server may
                             return fewer. Valid values are 0-30; a value
                             of 0 or 1 returns at most one. Defaults to 1.

    :type profanity_filter: bool
    :param profanity_filter: (Optional) If true, the server attempts to
                             mask profanities, replacing all but the
                             initial character of each filtered word with
                             asterisks, e.g. ``'f***'``. If false or
                             omitted, profanities are not filtered.

    :type speech_context: list
    :param speech_context: (Optional) A list of up to 50 words and phrases
                           used as "hints" so the recognizer is more likely
                           to recognize them; may also add new words to the
                           recognizer's vocabulary.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, recognition
                             continues (even through pauses) until the
                             client closes the output stream or the time
                             limit is reached, and multiple results with
                             ``is_final=True`` may be returned. If true,
                             the recognizer detects a single utterance,
                             returns an ``END_OF_UTTERANCE`` event, and
                             ceases recognition with at most one
                             ``is_final=True`` result.

    :type interim_results: bool
    :param interim_results: (Optional) If true, tentative hypotheses
                            (marked ``is_final=False``) may be returned as
                            they become available; otherwise only
                            ``is_final=True`` result(s) are returned.
    """
    # The configuration request MUST be yielded first and carry no audio.
    yield _make_streaming_config(
        sample, language_code=language_code, max_alternatives=max_alternatives,
        profanity_filter=profanity_filter, speech_context=speech_context,
        single_utterance=single_utterance, interim_results=interim_results)

    # Stream the audio in chunks; chunk_size is tuned to roughly 100ms of
    # audio per request.
    chunk = sample.stream.read(sample.chunk_size)
    while chunk:
        yield StreamingRecognizeRequest(audio_content=chunk)
        chunk = sample.stream.read(sample.chunk_size)
| 457 | + |
| 458 | + |
def _make_streaming_config(sample, language_code,
                           max_alternatives, profanity_filter,
                           speech_context, single_utterance,
                           interim_results):
    """Build the configuration-only request that opens a streaming call.

    Wraps the recognition settings from ``sample`` and the given options
    into a ``StreamingRecognizeRequest`` carrying a
    ``StreamingRecognitionConfig`` (and no audio).

    :type sample: :class:`~google.cloud.speech.sample.Sample`
    :param sample: Instance of ``Sample`` containing audio information.

    :type language_code: str
    :param language_code: (Optional) The language of the supplied audio as
                          a BCP-47 language tag, e.g. ``'en-GB'``.
                          If omitted, defaults to ``'en-US'``.

    :type max_alternatives: int
    :param max_alternatives: (Optional) Maximum number of recognition
                             hypotheses to be returned; the server may
                             return fewer. Valid values are 0-30; a value
                             of 0 or 1 returns at most one. Defaults to 1.

    :type profanity_filter: bool
    :param profanity_filter: (Optional) If true, the server attempts to
                             mask profanities, replacing all but the
                             initial character of each filtered word with
                             asterisks, e.g. ``'f***'``. If false or
                             omitted, profanities are not filtered.

    :type speech_context: list
    :param speech_context: (Optional) A list of up to 50 words and phrases
                           used as "hints" so the recognizer is more likely
                           to recognize them; may also add new words to the
                           recognizer's vocabulary.

    :type single_utterance: bool
    :param single_utterance: (Optional) If false or omitted, recognition
                             continues (even through pauses) until the
                             client closes the output stream or the time
                             limit is reached, and multiple results with
                             ``is_final=True`` may be returned. If true,
                             the recognizer detects a single utterance,
                             returns an ``END_OF_UTTERANCE`` event, and
                             ceases recognition with at most one
                             ``is_final=True`` result.

    :type interim_results: bool
    :param interim_results: (Optional) If true, tentative hypotheses
                            (marked ``is_final=False``) may be returned as
                            they become available; otherwise only
                            ``is_final=True`` result(s) are returned.

    :rtype: :class:`~StreamingRecognizeRequest`
    :returns: A ``StreamingRecognizeRequest`` whose ``streaming_config``
              field is populated from the given settings.
    """
    recognition = RecognitionConfig(
        encoding=sample.encoding, sample_rate=sample.sample_rate,
        language_code=language_code, max_alternatives=max_alternatives,
        profanity_filter=profanity_filter, speech_context=speech_context)

    return StreamingRecognizeRequest(
        streaming_config=StreamingRecognitionConfig(
            config=recognition, single_utterance=single_utterance,
            interim_results=interim_results))
0 commit comments