@@ -24,6 +24,10 @@
 import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
 import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
+import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
 import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
 import com.google.cloud.speech.v1p1beta1.SpeechClient;
 import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
@@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
           "\tjava %s \"<command>\" \"<path-to-image>\"\n"
               + "Commands:\n"
               + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
-              + "\t| auto-punctuation | stream-punctuation\n"
+              + "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
               + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
               + "for a Cloud Storage resource (gs://...)\n",
           Recognize.class.getCanonicalName());
@@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
       }
     } else if (command.equals("stream-punctuation")) {
       streamingTranscribeWithAutomaticPunctuation(path);
+    } else if (command.equals("enhanced-model")) {
+      transcribeFileWithEnhancedModel(path);
+    } else if (command.equals("metadata")) {
+      transcribeFileWithMetadata(path);
     }
   }
 
@@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
     }
   }
   // [END speech_stream_recognize_punctuation]
+
+  // [START speech_transcribe_file_with_enhanced_model]
+  /**
+   * Transcribe the given audio file using an enhanced model.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Wrap the contents of the local audio file read above
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Configure the request to use an enhanced model
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          // Enhanced models are only available to projects that
+          // opt in for audio data collection.
+          .setUseEnhanced(true)
+          // A model must be specified to use an enhanced model.
+          .setModel("phone_call")
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_enhanced_model]
+
+  // [START speech_transcribe_file_with_metadata]
+  /**
+   * Transcribe the given audio file and include recognition metadata in the request.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeFileWithMetadata(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Wrap the contents of the local audio file read above
+      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
+          .setContent(ByteString.copyFrom(content))
+          .build();
+
+      // Construct a recognition metadata object.
+      // Most metadata fields are specified as enums nested in RecognitionMetadata,
+      // for example InteractionType, MicrophoneDistance, and RecordingDeviceType.
+      RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
+          .setInteractionType(InteractionType.DISCUSSION)
+          .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+          .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+          .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free-form strings
+          // and some are integers, for instance the 6-digit NAICS code
+          // https://www.naics.com/search/
+          .setIndustryNaicsCodeOfAudio(519190)
+          .build();
+
+      // Configure the request, attaching the recognition metadata
+      RecognitionConfig config = RecognitionConfig.newBuilder()
+          .setEncoding(AudioEncoding.LINEAR16)
+          .setLanguageCode("en-US")
+          .setSampleRateHertz(8000)
+          .setMetadata(metadata) // Add the metadata to the config
+          .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript: %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_file_with_metadata]
 }
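
Usage note: both new samples read a local file and issue a synchronous recognize request for LINEAR16 audio at 8000 Hz, and they are wired into the existing CLI via the new enhanced-model and metadata commands in main. As a minimal sketch, the methods could also be called directly; the demo class name below is hypothetical, the audio path is just the placeholder from the usage string, and the class is assumed to sit in the same package as Recognize:

// Hypothetical demo class, assumed to be in the same package as Recognize.
// ./resources/audio.raw is a placeholder path to 8 kHz LINEAR16 audio.
public class RecognizeEnhancedDemo {
  public static void main(String[] args) throws Exception {
    String audioPath = "./resources/audio.raw";
    // Uses the enhanced "phone_call" model.
    Recognize.transcribeFileWithEnhancedModel(audioPath);
    // Sends the same audio with RecognitionMetadata attached to the request.
    Recognize.transcribeFileWithMetadata(audioPath);
  }
}

As the in-code comment notes, enhanced models are only available to projects that have opted in for audio data collection, so the enhanced-model sample requires such a project.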