Transcription of a local file with diarization

Recognize multiple speakers in a local audio file.

Code sample

Go

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Go API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"fmt"
	"io"
	"os"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	"cloud.google.com/go/speech/apiv1/speechpb"
)

// transcribe_diarization transcribes a local audio file using speaker diarization.
func transcribe_diarization(w io.Writer) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	defer client.Close()

	diarizationConfig := &speechpb.SpeakerDiarizationConfig{
		EnableSpeakerDiarization: true,
		MinSpeakerCount:          2,
		MaxSpeakerCount:          2,
	}

	recognitionConfig := &speechpb.RecognitionConfig{
		Encoding:          speechpb.RecognitionConfig_LINEAR16,
		SampleRateHertz:   8000,
		LanguageCode:      "en-US",
		DiarizationConfig: diarizationConfig,
	}

	// Get the contents of the local audio file.
	content, err := os.ReadFile("../resources/commercial_mono.wav")
	if err != nil {
		return fmt.Errorf("error reading file: %w", err)
	}
	audio := &speechpb.RecognitionAudio{
		AudioSource: &speechpb.RecognitionAudio_Content{Content: content},
	}

	longRunningRecognizeRequest := &speechpb.LongRunningRecognizeRequest{
		Config: recognitionConfig,
		Audio:  audio,
	}

	operation, err := client.LongRunningRecognize(ctx, longRunningRecognizeRequest)
	if err != nil {
		return fmt.Errorf("error running recognize: %w", err)
	}

	response, err := operation.Wait(ctx)
	if err != nil {
		return err
	}

	// Speaker tags are only included in the last result object, which has only
	// one alternative.
	alternative := response.Results[len(response.Results)-1].Alternatives[0]

	wordInfo := alternative.GetWords()[0]
	currentSpeakerTag := wordInfo.GetSpeakerTag()

	var speakerWords strings.Builder
	speakerWords.WriteString(fmt.Sprintf("Speaker %d: %s", wordInfo.GetSpeakerTag(), wordInfo.GetWord()))

	// For each word, get all the words associated with one speaker. Once the
	// speaker changes, add a new line with the new speaker and their spoken words.
	for i := 1; i < len(alternative.Words); i++ {
		wordInfo := alternative.Words[i]
		if currentSpeakerTag == wordInfo.GetSpeakerTag() {
			speakerWords.WriteString(" ")
			speakerWords.WriteString(wordInfo.GetWord())
		} else {
			speakerWords.WriteString(fmt.Sprintf("\nSpeaker %d: %s", wordInfo.GetSpeakerTag(), wordInfo.GetWord()))
			currentSpeakerTag = wordInfo.GetSpeakerTag()
		}
	}
	fmt.Fprint(w, speakerWords.String())
	return nil
}
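For context, here is a minimal sketch of how the Go function above might be invoked. The main wrapper, package layout, and error handling are illustrative assumptions, not part of the original sample:

package main

import (
	"log"
	"os"
)

func main() {
	// Assumes transcribe_diarization from the sample above is in the same package.
	// Writes the diarized transcript to standard output.
	if err := transcribe_diarization(os.Stdout); err != nil {
		log.Fatal(err)
	}
}

Each output line groups consecutive words that share a speaker tag, so a two-speaker recording yields alternating "Speaker 1: ..." and "Speaker 2: ..." lines.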

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

/**
 * Transcribe the given audio file using speaker diarization.
 *
 * @param fileName the path to an audio file.
 */
public static void transcribeDiarization(String fileName) throws Exception {
  Path path = Paths.get(fileName);
  byte[] content = Files.readAllBytes(path);

  try (SpeechClient speechClient = SpeechClient.create()) {
    // Get the contents of the local audio file
    RecognitionAudio recognitionAudio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

    SpeakerDiarizationConfig speakerDiarizationConfig =
        SpeakerDiarizationConfig.newBuilder()
            .setEnableSpeakerDiarization(true)
            .setMinSpeakerCount(2)
            .setMaxSpeakerCount(2)
            .build();

    // Configure request to enable speaker diarization
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setDiarizationConfig(speakerDiarizationConfig)
            .build();

    // Perform the transcription request
    RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);

    // Speaker tags are only included in the last result object, which has only one alternative.
    SpeechRecognitionAlternative alternative =
        recognizeResponse.getResults(recognizeResponse.getResultsCount() - 1).getAlternatives(0);

    // The alternative is made up of WordInfo objects that contain the speaker_tag.
    WordInfo wordInfo = alternative.getWords(0);
    int currentSpeakerTag = wordInfo.getSpeakerTag();

    // For each word, get all the words associated with one speaker. Once the speaker
    // changes, add a new line with the new speaker and their spoken words.
    StringBuilder speakerWords =
        new StringBuilder(
            String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

    for (int i = 1; i < alternative.getWordsCount(); i++) {
      wordInfo = alternative.getWords(i);
      if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
        speakerWords.append(" ");
        speakerWords.append(wordInfo.getWord());
      } else {
        speakerWords.append(
            String.format("\nSpeaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
        currentSpeakerTag = wordInfo.getSpeakerTag();
      }
    }

    System.out.println(speakerWords.toString());
  }
}

Node.js

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Node.js API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech').v1p1beta1;

// Creates a client
const client = new speech.SpeechClient();

// The sample body is wrapped in an async function so the `await` calls are valid.
async function transcribeDiarization() {
  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    enableSpeakerDiarization: true,
    minSpeakerCount: 2,
    maxSpeakerCount: 2,
    model: 'phone_call',
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  console.log('Speaker Diarization:');
  const result = response.results[response.results.length - 1];
  const wordsInfo = result.alternatives[0].words;
  // Note: The transcript within each result is separate and sequential per result.
  // However, the words list within an alternative includes all the words
  // from all the results thus far. Thus, to get all the words with speaker
  // tags, you only have to take the words list from the last result:
  wordsInfo.forEach(a =>
    console.log(`word: ${a.word}, speakerTag: ${a.speakerTag}`)
  );
}

transcribeDiarization().catch(console.error);

Python

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Python API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

from google.cloud import speech_v1p1beta1 as speech


# The sample body is wrapped in a function so the trailing `return` is valid.
def transcribe_diarization():
    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        diarization_config=diarization_config,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")

    return result

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.
