Transcribe a local file using a transcription model

Transcribe a local audio file using a trained transcription model.
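In each of the samples below, the model is chosen by setting the model field of the recognition config (for example, "video"); the remaining fields describe the audio itself: its encoding, sample rate, and language.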

Explore further

For detailed documentation that includes this code sample, see Select a transcription model in the Speech-to-Text documentation.

Code sample

Go

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Go API reference documentation.
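
With Go modules, installing the library typically amounts to running the following from your module directory (confirm the exact module path on the client libraries page):

  go get cloud.google.com/go/speech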

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
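
For a local development environment, a common way to create Application Default Credentials is to install the Google Cloud CLI and run:

  gcloud auth application-default login

The client libraries in all of the samples below then pick up these credentials automatically.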

func modelSelection(w io.Writer) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	defer client.Close()

	data, err := os.ReadFile("../testdata/Google_Gnome.wav")
	if err != nil {
		return fmt.Errorf("ReadFile: %w", err)
	}

	req := &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
			Model:           "video",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	}

	resp, err := client.Recognize(ctx, req)
	if err != nil {
		return fmt.Errorf("recognize: %w", err)
	}

	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}
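
The function above is only the published snippet; it needs a surrounding file with the matching imports. The following is a minimal sketch of that scaffolding, assuming modelSelection is pasted into the same file and that a recent release of the cloud.google.com/go/speech module is used (older releases import the generated speechpb types from google.golang.org/genproto/googleapis/cloud/speech/v1 instead):

package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	"cloud.google.com/go/speech/apiv1/speechpb"
)

// modelSelection (defined above) transcribes ../testdata/Google_Gnome.wav
// with the "video" model and writes the results to w.

func main() {
	// Print the transcription results to stdout.
	if err := modelSelection(os.Stdout); err != nil {
		log.Fatal(err)
	}
}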
 

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.
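
If you use Maven or Gradle, the library is typically pulled in through the com.google.cloud:google-cloud-speech artifact (ideally via the Google Cloud libraries BOM); check the client libraries page for the current coordinates.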

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

/**
 * Performs transcription of the given audio file synchronously with the selected model.
 *
 * @param fileName the path to an audio file to transcribe
 */
public static void transcribeModelSelection(String fileName) throws Exception {
  Path path = Paths.get(fileName);
  byte[] content = Files.readAllBytes(path);

  try (SpeechClient speech = SpeechClient.create()) {
    // Configure request with video media type
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            // encoding may either be omitted or must match the value in the file header
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            // sample rate hertz may either be omitted or must match the value in the file
            // header
            .setSampleRateHertz(16000)
            .setModel("video")
            .build();

    RecognitionAudio recognitionAudio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

    RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio);
    // Just print the first result here.
    SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0);
    // There can be several alternative transcripts for a given chunk of speech. Just use the
    // first (most likely) one here.
    SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
    System.out.printf("Transcript : %s\n", alternative.getTranscript());
  }
}
 

Node.js

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Node.js API reference documentation.
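
The library is published on npm, so installation is typically:

  npm install @google-cloud/speech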

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

// Imports the Google Cloud client library for Beta API
/**
 * TODO(developer): Update client library import to use new
 * version of API when desired features become available
 */
const speech = require('@google-cloud/speech').v1p1beta1;
const fs = require('fs');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const model = 'Model to use, e.g. phone_call, video, default';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
  model: model,
};
const audio = {
  content: fs.readFileSync(filename).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);
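
Note that this snippet uses await, so it is intended to run inside an async function (for example, an async main() that you call at the end of the script) or in an environment that supports top-level await.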
 

PHP

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries.
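
The library is typically installed with Composer, for example composer require google/cloud-speech (check the client libraries page for the current package name).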

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/**
 * @param string $audioFile path to an audio file
 * @param string $model the transcription model to use, e.g. "video"
 */
function transcribe_model_selection(string $audioFile, string $model)
{
    // change these variables if necessary
    $encoding = AudioEncoding::LINEAR16;
    $sampleRateHertz = 32000;
    $languageCode = 'en-US';

    // get contents of a file into a string
    $content = file_get_contents($audioFile);

    // set string as audio content
    $audio = (new RecognitionAudio())
        ->setContent($content);

    // set config
    $config = (new RecognitionConfig())
        ->setEncoding($encoding)
        ->setSampleRateHertz($sampleRateHertz)
        ->setLanguageCode($languageCode)
        ->setModel($model);

    // create the speech client
    $client = new SpeechClient();

    // make the API call
    $response = $client->recognize($config, $audio);
    $results = $response->getResults();

    // print results
    foreach ($results as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
    }

    $client->close();
}
 

Python

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Python API reference documentation.
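
The library is typically installed with pip:

  pip install google-cloud-speech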

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

from google.cloud import speech

# Instantiates a client
client = speech.SpeechClient()

# Reads a file as bytes
with open("resources/Google_Gnome.wav", "rb") as f:
    audio_content = f.read()

audio = speech.RecognitionAudio(content=audio_content)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    model="video",  # Chosen model
)

response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print("-" * 20)
    print(f"First alternative of result {i}")
    print(f"Transcript: {alternative.transcript}")
 

Ruby

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries.
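
The library is typically installed as a gem, for example gem install google-cloud-speech (or by adding it to your Gemfile).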

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

# file_path = "path/to/audio.wav"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech version: :v1

config = {
  encoding:          :LINEAR16,
  sample_rate_hertz: 16_000,
  language_code:     "en-US",
  model:             model
}
file  = File.binread file_path
audio = { content: file }

operation = speech.long_running_recognize config: config, audio: audio

puts "Operation started"

operation.wait_until_done!

raise operation.results.message if operation.error?

results = operation.response.results

results.each_with_index do |result, i|
  alternative = result.alternatives.first
  puts "-" * 20
  puts "First alternative of result #{i}"
  puts "Transcript: #{alternative.transcript}"
end
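
This sample assumes that file_path and model are already set: the commented line at the top shows the expected form of file_path, and model would be a string such as "video", matching the other samples on this page.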
 

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.
