Transcription of a file in Cloud Storage with diarization

Recognize multiple speakers in an audio file stored in Cloud Storage.

Code sample

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.
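The sample below uses the google-cloud-speech client library (the com.google.cloud:google-cloud-speech Maven artifact); see the client libraries page for current installation instructions and supported versions.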

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
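For a local development environment, this typically means running gcloud auth application-default login once so that the client library can pick up your credentials automatically when the sample runs.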

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.speech.v1.LongRunningRecognizeMetadata;
import com.google.cloud.speech.v1.LongRunningRecognizeResponse;
import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.SpeakerDiarizationConfig;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.WordInfo;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

public class TranscribeDiarizationGcs {

  static void transcribeDiarizationGcs()
      throws IOException, ExecutionException, InterruptedException {
    // TODO(developer): Replace these variables before running the sample.
    String gcsUri = "gs://cloud-samples-data/speech/commercial_mono.wav";
    transcribeDiarizationGcs(gcsUri);
  }

  // Transcribe the given Cloud Storage file using speaker diarization
  public static void transcribeDiarizationGcs(String gcsUri)
      throws IOException, ExecutionException, InterruptedException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {
      SpeakerDiarizationConfig speakerDiarizationConfig =
          SpeakerDiarizationConfig.newBuilder()
              .setEnableSpeakerDiarization(true)
              .setMinSpeakerCount(2)
              .setMaxSpeakerCount(2)
              .build();

      // Configure the request to enable speaker diarization
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setLanguageCode("en-US")
              .setSampleRateHertz(8000)
              .setDiarizationConfig(speakerDiarizationConfig)
              .build();

      // Set the remote path for the audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();

      // Use a non-blocking call for getting the file transcription
      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> future =
          speechClient.longRunningRecognizeAsync(config, audio);
      System.out.println("Waiting for response...");

      // Speaker tags are only included in the last result object, which has only one alternative.
      LongRunningRecognizeResponse response = future.get();
      SpeechRecognitionAlternative alternative =
          response.getResults(response.getResultsCount() - 1).getAlternatives(0);

      // The alternative is made up of WordInfo objects that contain the speaker_tag.
      WordInfo wordInfo = alternative.getWords(0);
      int currentSpeakerTag = wordInfo.getSpeakerTag();

      // For each word, collect all the words associated with one speaker; once the speaker
      // changes, add a new line with the new speaker and their spoken words.
      StringBuilder speakerWords =
          new StringBuilder(
              String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
      for (int i = 1; i < alternative.getWordsCount(); i++) {
        wordInfo = alternative.getWords(i);
        if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
          speakerWords.append(" ");
          speakerWords.append(wordInfo.getWord());
        } else {
          speakerWords.append(
              String.format("\nSpeaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
          currentSpeakerTag = wordInfo.getSpeakerTag();
        }
      }
      System.out.println(speakerWords.toString());
    }
  }
}
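To run the sample against your own recording, one option is to add a simple entry point to the class that passes your Cloud Storage URI to the public method above. This is a minimal sketch, not part of the official sample; the bucket and object names are placeholders you would replace:

  public static void main(String[] args) throws Exception {
    // TODO(developer): Point this at your own audio file in Cloud Storage.
    String gcsUri = "gs://your-bucket/your-audio-file.wav";
    TranscribeDiarizationGcs.transcribeDiarizationGcs(gcsUri);
  }

Note that the RecognitionConfig in the sample hard-codes LINEAR16 encoding at 8000 Hz, so adjust setEncoding and setSampleRateHertz if your audio uses a different format or sample rate.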
 

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.
