Skip to content

Commit de00f10

Browse files
nnegrey authored and kurtisvg committed
Add samples for enhanced models and metadata (#1093)
1 parent 310e0ab commit de00f10

File tree

5 files changed

+131
-2
lines changed

5 files changed

+131
-2
lines changed

speech/cloud-client/README.md

+12
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,15 @@ Performing streaming speech transcription and punctuation on an audio file
9191
```
9292
mvn exec:java -DRecognize -Dexec.args="stream-punctuation ./resources/audio.raw"
9393
```
94+
95+
## Enhanced Model
96+
Transcribe an audio file using an enhanced model
97+
```
98+
mvn exec:java -DRecognize -Dexec.args="enhanced-model ./resources/commercial_mono.wav"
99+
```
100+
101+
## Recognition Metadata
102+
Transcribe an audio file with recognition metadata
103+
```
104+
mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
105+
```

speech/cloud-client/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
<dependency>
4141
<groupId>com.google.cloud</groupId>
4242
<artifactId>google-cloud-speech</artifactId>
43-
<version>0.42.0-alpha</version>
43+
<version>0.46.0-alpha</version>
4444
</dependency>
4545
<!-- [END dependencies] -->
4646

Binary file not shown.

speech/cloud-client/src/main/java/com/example/speech/Recognize.java

+102-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
2525
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
2626
import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding;
27+
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata;
28+
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType;
29+
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance;
30+
import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType;
2731
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
2832
import com.google.cloud.speech.v1p1beta1.SpeechClient;
2933
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
@@ -53,7 +57,7 @@ public static void main(String... args) throws Exception {
5357
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
5458
+ "Commands:\n"
5559
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
56-
+ "\t| auto-punctuation | stream-punctuation\n"
60+
+ "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
5761
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
5862
+ "for a Cloud Storage resource (gs://...)\n",
5963
Recognize.class.getCanonicalName());
@@ -97,6 +101,10 @@ public static void main(String... args) throws Exception {
97101
}
98102
} else if (command.equals("stream-punctuation")) {
99103
streamingTranscribeWithAutomaticPunctuation(path);
104+
} else if (command.equals("enhanced-model")) {
105+
transcribeFileWithEnhancedModel(path);
106+
} else if (command.equals("metadata")) {
107+
transcribeFileWithMetadata(path);
100108
}
101109
}
102110

@@ -678,4 +686,97 @@ public SettableFuture<List<T>> future() {
678686
}
679687
}
680688
// [END speech_stream_recognize_punctuation]
689+
690+
// [START speech_transcribe_file_with_enhanced_model]
691+
/**
692+
* Transcribe the given audio file using an enhanced model.
693+
*
694+
* @param fileName the path to an audio file.
695+
*/
696+
public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
697+
Path path = Paths.get(fileName);
698+
byte[] content = Files.readAllBytes(path);
699+
700+
try (SpeechClient speechClient = SpeechClient.create()) {
701+
// Wrap the audio bytes read from the local file in a RecognitionAudio
702+
RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
703+
.setContent(ByteString.copyFrom(content))
704+
.build();
705+
706+
// Configure request to enable enhanced models
707+
RecognitionConfig config = RecognitionConfig.newBuilder()
708+
.setEncoding(AudioEncoding.LINEAR16)
709+
.setLanguageCode("en-US")
710+
.setSampleRateHertz(8000)
711+
// Enhanced models are only available to projects that
712+
// opt in for audio data collection.
713+
.setUseEnhanced(true)
714+
// A model must be specified to use enhanced model.
715+
.setModel("phone_call")
716+
.build();
717+
718+
// Perform the transcription request
719+
RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
720+
721+
// Print out the results
722+
for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
723+
// There can be several alternative transcripts for a given chunk of speech. Just use the
724+
// first (most likely) one here.
725+
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
726+
System.out.format("Transcript: %s\n\n", alternative.getTranscript());
727+
}
728+
}
729+
}
730+
// [END speech_transcribe_file_with_enhanced_model]
731+
732+
// [START speech_transcribe_file_with_metadata]
733+
/**
734+
* Transcribe the given audio file and include recognition metadata in the request.
735+
*
736+
* @param fileName the path to an audio file.
737+
*/
738+
public static void transcribeFileWithMetadata(String fileName) throws Exception {
739+
Path path = Paths.get(fileName);
740+
byte[] content = Files.readAllBytes(path);
741+
742+
try (SpeechClient speechClient = SpeechClient.create()) {
743+
// Wrap the audio bytes read from the local file in a RecognitionAudio
744+
RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
745+
.setContent(ByteString.copyFrom(content))
746+
.build();
747+
748+
// Construct a recognition metadata object.
749+
// Most metadata fields are specified as enums that can be found
750+
// as nested types of RecognitionMetadata (e.g. InteractionType, MicrophoneDistance)
751+
RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
752+
.setInteractionType(InteractionType.DISCUSSION)
753+
.setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
754+
.setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
755+
.setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings
756+
// And some are integers, for instance the 6 digit NAICS code
757+
// https://www.naics.com/search/
758+
.setIndustryNaicsCodeOfAudio(519190)
759+
.build();
760+
761+
// Configure request to include the recognition metadata
762+
RecognitionConfig config = RecognitionConfig.newBuilder()
763+
.setEncoding(AudioEncoding.LINEAR16)
764+
.setLanguageCode("en-US")
765+
.setSampleRateHertz(8000)
766+
.setMetadata(metadata) // Add the metadata to the config
767+
.build();
768+
769+
// Perform the transcription request
770+
RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
771+
772+
// Print out the results
773+
for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
774+
// There can be several alternative transcripts for a given chunk of speech. Just use the
775+
// first (most likely) one here.
776+
SpeechRecognitionAlternative alternative = result.getAlternatives(0);
777+
System.out.format("Transcript: %s\n\n", alternative.getTranscript());
778+
}
779+
}
780+
}
781+
// [END speech_transcribe_file_with_metadata]
681782
}

speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java

+16
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ public class RecognizeIT {
4646
private String videoFileName = "./resources/Google_Gnome.wav";
4747
private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav";
4848

49+
private String recognitionAudioFile = "./resources/commercial_mono.wav";
50+
4951
@Before
5052
public void setUp() {
5153
bout = new ByteArrayOutputStream();
@@ -145,4 +147,18 @@ public void testStreamAutoPunctuation() throws Exception {
145147
String got = bout.toString();
146148
assertThat(got).contains("How old is the Brooklyn Bridge?");
147149
}
150+
151+
@Test
152+
public void testEnhancedModel() throws Exception {
153+
Recognize.transcribeFileWithEnhancedModel(recognitionAudioFile);
154+
String got = bout.toString();
155+
assertThat(got).contains("Chrome");
156+
}
157+
158+
@Test
159+
public void testMetadata() throws Exception {
160+
Recognize.transcribeFileWithMetadata(recognitionAudioFile);
161+
String got = bout.toString();
162+
assertThat(got).contains("Chrome");
163+
}
148164
}

0 commit comments

Comments
 (0)