Transcribe a streaming feed from a local file

Transcribe a local audio file by streaming it to Speech-to-Text, simulating a live audio feed such as input from a microphone.
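Most of the samples below expect raw LINEAR16 audio: 16-bit signed little-endian samples, typically at a 16,000 Hz sample rate. If your audio is in another format, one way to convert it (not part of the samples themselves) is with ffmpeg, for example: `ffmpeg -i input.wav -f s16le -ar 16000 -ac 1 audio.raw`.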

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample

Go

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Go API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
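If you have the Google Cloud CLI installed, one common way to create Application Default Credentials for local development is to run `gcloud auth application-default login`.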

package main

import (
	"context"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"

	speech "cloud.google.com/go/speech/apiv1"
	"cloud.google.com/go/speech/apiv1/speechpb"
)

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s <AUDIOFILE>\n", filepath.Base(os.Args[0]))
		fmt.Fprintf(os.Stderr, "<AUDIOFILE> must be a path to a local audio file. Audio file must be a 16-bit signed little-endian encoded with a sample rate of 16000.\n")
	}
	flag.Parse()
	if len(flag.Args()) != 1 {
		log.Fatal("Please pass path to your local audio file as a command line argument")
	}
	audioFile := flag.Arg(0)

	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		log.Fatal(err)
	}
	stream, err := client.StreamingRecognize(ctx)
	if err != nil {
		log.Fatal(err)
	}
	// Send the initial configuration message.
	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				Config: &speechpb.RecognitionConfig{
					Encoding:        speechpb.RecognitionConfig_LINEAR16,
					SampleRateHertz: 16000,
					LanguageCode:    "en-US",
				},
			},
		},
	}); err != nil {
		log.Fatal(err)
	}

	f, err := os.Open(audioFile)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	go func() {
		buf := make([]byte, 1024)
		for {
			n, err := f.Read(buf)
			if n > 0 {
				if err := stream.Send(&speechpb.StreamingRecognizeRequest{
					StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
						AudioContent: buf[:n],
					},
				}); err != nil {
					log.Printf("Could not send audio: %v", err)
				}
			}
			if err == io.EOF {
				// Nothing else to pipe, close the stream.
				if err := stream.CloseSend(); err != nil {
					log.Fatalf("Could not close stream: %v", err)
				}
				return
			}
			if err != nil {
				log.Printf("Could not read from %s: %v", audioFile, err)
				continue
			}
		}
	}()

	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatalf("Cannot stream results: %v", err)
		}
		if err := resp.Error; err != nil {
			log.Fatalf("Could not recognize: %v", err)
		}
		for _, result := range resp.Results {
			fmt.Printf("Result: %+v\n", result)
		}
	}
}
 
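Note how the sample splits the work: a goroutine reads the file in 1,024-byte chunks and sends each chunk on the stream, while the main loop concurrently receives and prints recognition results. Because StreamingRecognize is a bidirectional stream, sending audio and receiving results proceed at the same time.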

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

/**
 * Performs streaming speech recognition on raw PCM audio data.
 *
 * @param fileName the path to a PCM audio file to transcribe.
 */
public static void streamingRecognizeFile(String fileName) throws Exception, IOException {
  Path path = Paths.get(fileName);
  byte[] data = Files.readAllBytes(path);

  // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
  try (SpeechClient speech = SpeechClient.create()) {

    // Configure request with local raw PCM audio
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .setModel("default")
            .build();
    StreamingRecognitionConfig config =
        StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();

    class ResponseApiStreamingObserver<T> implements ApiStreamObserver<T> {
      private final SettableFuture<List<T>> future = SettableFuture.create();
      private final List<T> messages = new java.util.ArrayList<T>();

      @Override
      public void onNext(T message) {
        messages.add(message);
      }

      @Override
      public void onError(Throwable t) {
        future.setException(t);
      }

      @Override
      public void onCompleted() {
        future.set(messages);
      }

      // Returns the SettableFuture object to get received messages / exceptions.
      public SettableFuture<List<T>> future() {
        return future;
      }
    }

    ResponseApiStreamingObserver<StreamingRecognizeResponse> responseObserver =
        new ResponseApiStreamingObserver<>();

    BidiStreamingCallable<StreamingRecognizeRequest, StreamingRecognizeResponse> callable =
        speech.streamingRecognizeCallable();

    ApiStreamObserver<StreamingRecognizeRequest> requestObserver =
        callable.bidiStreamingCall(responseObserver);

    // The first request must **only** contain the audio configuration:
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());

    // Subsequent requests must **only** contain the audio data.
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(data))
            .build());

    // Mark transmission as completed after sending the data.
    requestObserver.onCompleted();

    List<StreamingRecognizeResponse> responses = responseObserver.future().get();

    for (StreamingRecognizeResponse response : responses) {
      // For streaming recognize, the results list has one is_final result (if available) followed
      // by a number of in-progress results (if interim_results is true) for subsequent utterances.
      // Just print the first result here.
      StreamingRecognitionResult result = response.getResultsList().get(0);
      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript : %s\n", alternative.getTranscript());
    }
  }
}
 

Node.js

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Node.js API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
  },
  interimResults: false, // If you want interim results, set this to true
};

// Stream the audio to the Google Cloud Speech API
const recognizeStream = client
  .streamingRecognize(request)
  .on('error', console.error)
  .on('data', data => {
    console.log(
      `Transcription: ${data.results[0].alternatives[0].transcript}`
    );
  });

// Stream an audio file from disk to the Speech API, e.g. "./resources/audio.raw"
fs.createReadStream(filename).pipe(recognizeStream);
 

PHP

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\StreamingRecognitionConfig;
use Google\Cloud\Speech\V1\StreamingRecognizeRequest;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/**
 * @param string $audioFile path to an audio file
 */
function streaming_recognize(string $audioFile)
{
    // change these variables if necessary
    $encoding = AudioEncoding::LINEAR16;
    $sampleRateHertz = 32000;
    $languageCode = 'en-US';

    $speechClient = new SpeechClient();
    try {
        $config = (new RecognitionConfig())
            ->setEncoding($encoding)
            ->setSampleRateHertz($sampleRateHertz)
            ->setLanguageCode($languageCode);

        $strmConfig = new StreamingRecognitionConfig();
        $strmConfig->setConfig($config);

        $strmReq = new StreamingRecognizeRequest();
        $strmReq->setStreamingConfig($strmConfig);

        $strm = $speechClient->streamingRecognize();
        $strm->write($strmReq);

        $strmReq = new StreamingRecognizeRequest();
        $content = file_get_contents($audioFile);
        $strmReq->setAudioContent($content);
        $strm->write($strmReq);

        foreach ($strm->closeWriteAndReadAll() as $response) {
            foreach ($response->getResults() as $result) {
                foreach ($result->getAlternatives() as $alt) {
                    printf("Transcription: %s\n", $alt->getTranscript());
                }
            }
        }
    } finally {
        $speechClient->close();
    }
}
 

Python

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Python API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

from google.cloud import speech


def transcribe_streaming(stream_file: str) -> None:
    """Streams transcription of the given audio file using Google Cloud Speech-to-Text API.

    Args:
        stream_file (str): Path to the local audio file to be transcribed.
            Example: "resources/audio.raw"
    """
    client = speech.SpeechClient()

    with open(stream_file, "rb") as audio_file:
        audio_content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [audio_content]

    requests = (
        speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    streaming_config = speech.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(
        config=streaming_config,
        requests=requests,
    )

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print(f"Finished: {result.is_final}")
            print(f"Stability: {result.stability}")
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print(f"Confidence: {alternative.confidence}")
                print(f"Transcript: {alternative.transcript}")
 
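As the in-code comment notes, a real streaming client should feed streaming_recognize a generator that yields chunks of audio rather than a one-element list holding the whole file. A minimal sketch of such a generator (an illustration, not part of the original sample; the chunk size is an arbitrary choice):

  from google.cloud import speech

  def audio_chunks(path: str, chunk_size: int = 32_000):
      """Yield successive fixed-size chunks of raw audio bytes from a local file."""
      with open(path, "rb") as audio_file:
          while chunk := audio_file.read(chunk_size):
              yield chunk

  # One StreamingRecognizeRequest per chunk, instead of a single request
  # carrying the entire file.
  requests = (
      speech.StreamingRecognizeRequest(audio_content=chunk)
      for chunk in audio_chunks("resources/audio.raw")
  )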

Ruby

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

# audio_file_path = "Path to file on which to perform speech recognition"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech version: :v1

audio_content = File.binread audio_file_path
bytes_total   = audio_content.size
bytes_sent    = 0
chunk_size    = 32_000

input_stream  = Gapic::StreamInput.new
output_stream = speech.streaming_recognize input_stream

config = {
  config: {
    encoding:                 :LINEAR16,
    sample_rate_hertz:        16_000,
    language_code:            "en-US",
    enable_word_time_offsets: true
  }
}
input_stream.push streaming_config: config

# Simulated streaming from a microphone
# Stream bytes...
while bytes_sent < bytes_total
  input_stream.push audio_content: audio_content[bytes_sent, chunk_size]
  bytes_sent += chunk_size
  sleep 1
end

puts "Stopped passing"

input_stream.close

results = output_stream

results.each do |result|
  puts "Transcript: #{result}"
end
 

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.
