Recognize speech by using enhanced models

This page describes how to request an enhanced speech recognition model when you send a transcription request to Speech-to-Text.

There are currently two enhanced models: phone calland video. These models have been optimized to more accurately transcribe audio data from these specific sources. See the supported languages page to see if the enhanced models are available for your language.

Google creates and improves enhanced models based on data collected through data logging. While opting in to data logging is not required in order to use enhanced models, if you do opt in you can help Google improve these models and also enjoy a discount on your usage.

To use the enhanced recognition models set the following fields in RecognitionConfig :

  1. Set useEnhanced to true .
  2. Pass either the phone_call or video string in the model field.

Speech-to-Text supports enhanced models for all speech recognition methods: speech:recognize speech:longrunningrecognize , and Streaming .

The following code samples demonstrate how to request to use an enhanced model for a transcription request.

Protocol

Refer to the speech:recognize API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl . The example uses the Google Cloud CLI to generate an access token. For instructions on installing the gcloud CLI, see the quickstart .

curl  
-s  
-H  
 "Content-Type: application/json" 
  
 \ 
  
-H  
 "Authorization: Bearer 
 $( 
gcloud  
auth  
application-default  
print-access-token ) 
 " 
  
 \ 
  
https://speech.googleapis.com/v1/speech:recognize  
 \ 
  
--data  
 '{ 
 "config": { 
 "encoding": "LINEAR16", 
 "languageCode": "en-US", 
 "enableWordTimeOffsets": false, 
 "enableAutomaticPunctuation": true, 
  "model": "phone_call", 
  "useEnhanced": true 
 }, 
 "audio": { 
 "uri": "gs://cloud-samples-tests/speech/commercial_mono.wav" 
 } 
 }' 

See the RecognitionConfig reference documentation for more information on configuring the request body.

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "Hi, I'd like to buy a Chromecast. I was wondering whether you could help me with that.",
          "confidence": 0.8930228
        }
      ],
      "resultEndTime": "5.640s"
    },
    {
      "alternatives": [
        {
          "transcript": " Certainly, which color would you like? We are blue black and red.",
          "confidence": 0.9101991
        }
      ],
      "resultEndTime": "10.220s"
    },
    {
      "alternatives": [
        {
          "transcript": " Let's go with the black one.",
          "confidence": 0.8818244
        }
      ],
      "resultEndTime": "13.870s"
    },
    {
      "alternatives": [
        {
          "transcript": " Would you like the new Chromecast Ultra model or the regular Chromecast?",
          "confidence": 0.94733626
        }
      ],
      "resultEndTime": "18.460s"
    },
    {
      "alternatives": [
        {
          "transcript": " Regular Chromecast is fine. Thank you. Okay. Sure. Would you like to ship it regular or Express?",
          "confidence": 0.9519095
        }
      ],
      "resultEndTime": "25.930s"
    },
    {
      "alternatives": [
        {
          "transcript": " Express, please.",
          "confidence": 0.9101229
        }
      ],
      "resultEndTime": "28.260s"
    },
    {
      "alternatives": [
        {
          "transcript": " Terrific. It's on the way. Thank you. Thank you very much. Bye.",
          "confidence": 0.9321616
        }
      ],
      "resultEndTime": "34.150s"
    }
 ]
}

Go

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries . For more information, see the Speech-to-Text Go API reference documentation .

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment .

  func 
  
 enhancedModel 
 ( 
 w 
  
 io 
 . 
 Writer 
 ) 
  
 error 
  
 { 
  
 ctx 
  
 := 
  
 context 
 . 
 Background 
 () 
  
 client 
 , 
  
 err 
  
 := 
  
 speech 
 . 
 NewClient 
 ( 
 ctx 
 ) 
  
 if 
  
 err 
  
 != 
  
 nil 
  
 { 
  
 return 
  
 fmt 
 . 
 Errorf 
 ( 
 "NewClient: %w" 
 , 
  
 err 
 ) 
  
 } 
  
 defer 
  
 client 
 . 
 Close 
 () 
  
 data 
 , 
  
 err 
  
 := 
  
 os 
 . 
 ReadFile 
 ( 
 "../testdata/commercial_mono.wav" 
 ) 
  
 if 
  
 err 
  
 != 
  
 nil 
  
 { 
  
 return 
  
 fmt 
 . 
 Errorf 
 ( 
 "ReadFile: %w" 
 , 
  
 err 
 ) 
  
 } 
  
 resp 
 , 
  
 err 
  
 := 
  
 client 
 . 
 Recognize 
 ( 
 ctx 
 , 
  
& speechpb 
 . 
 RecognizeRequest 
 { 
  
 Config 
 : 
  
& speechpb 
 . 
 RecognitionConfig 
 { 
  
 Encoding 
 : 
  
 speechpb 
 . 
 RecognitionConfig_LINEAR16 
 , 
  
 SampleRateHertz 
 : 
  
 8000 
 , 
  
 LanguageCode 
 : 
  
 "en-US" 
 , 
  
 UseEnhanced 
 : 
  
 true 
 , 
  
 // A model must be specified to use enhanced model. 
  
 Model 
 : 
  
 "phone_call" 
 , 
  
 }, 
  
 Audio 
 : 
  
& speechpb 
 . 
 RecognitionAudio 
 { 
  
 AudioSource 
 : 
  
& speechpb 
 . 
 RecognitionAudio_Content 
 { 
 Content 
 : 
  
 data 
 }, 
  
 }, 
  
 }) 
  
 if 
  
 err 
  
 != 
  
 nil 
  
 { 
  
 return 
  
 fmt 
 . 
 Errorf 
 ( 
 "client.Recognize: %w" 
 , 
  
 err 
 ) 
  
 } 
  
 for 
  
 i 
 , 
  
 result 
  
 := 
  
 range 
  
 resp 
 . 
 Results 
  
 { 
  
 fmt 
 . 
 Fprintf 
 ( 
 w 
 , 
  
 "%s\n" 
 , 
  
 strings 
 . 
 Repeat 
 ( 
 "-" 
 , 
  
 20 
 )) 
  
 fmt 
 . 
 Fprintf 
 ( 
 w 
 , 
  
 "Result %d\n" 
 , 
  
 i 
 + 
 1 
 ) 
  
 for 
  
 j 
 , 
  
 alternative 
  
 := 
  
 range 
  
 result 
 . 
 Alternatives 
  
 { 
  
 fmt 
 . 
 Fprintf 
 ( 
 w 
 , 
  
 "Alternative %d: %s\n" 
 , 
  
 j 
 + 
 1 
 , 
  
 alternative 
 . 
 Transcript 
 ) 
  
 } 
  
 } 
  
 return 
  
 nil 
 } 
 

Python

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries . For more information, see the Speech-to-Text Python API reference documentation .

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment .

  from 
  
 google.cloud 
  
 import 
 speech 
 def 
  
 transcribe_file_with_enhanced_model 
 ( 
 audio_file 
 : 
 str 
 ) 
 - 
> speech 
 . 
 RecognizeResponse 
 : 
  
 """Transcribe the given audio file using an enhanced model. 
 Args: 
 audio_file (str): Path to the local audio file to be transcribed. 
 Example: "resources/commercial_mono.wav" 
 Returns: 
 speech.RecognizeResponse: The response containing the transcription results. 
 """ 
 client 
 = 
 speech 
 . 
 SpeechClient 
 () 
 # audio_file = 'resources/commercial_mono.wav' 
 with 
 open 
 ( 
 audio_file 
 , 
 "rb" 
 ) 
 as 
 f 
 : 
 audio_content 
 = 
 f 
 . 
 read 
 () 
 audio 
 = 
 speech 
 . 
 RecognitionAudio 
 ( 
 content 
 = 
 audio_content 
 ) 
 config 
 = 
 speech 
 . 
 RecognitionConfig 
 ( 
 encoding 
 = 
 speech 
 . 
 RecognitionConfig 
 . 
 AudioEncoding 
 . 
 LINEAR16 
 , 
 sample_rate_hertz 
 = 
 8000 
 , 
 language_code 
 = 
 "en-US" 
 , 
 use_enhanced 
 = 
 True 
 , 
 # A model must be specified to use enhanced model. 
 model 
 = 
 "phone_call" 
 , 
 ) 
 response 
 = 
 client 
 . 
 recognize 
 ( 
 config 
 = 
 config 
 , 
 audio 
 = 
 audio 
 ) 
 for 
 i 
 , 
 result 
 in 
 enumerate 
 ( 
 response 
 . 
 results 
 ): 
 alternative 
 = 
 result 
 . 
 alternatives 
 [ 
 0 
 ] 
 print 
 ( 
 "-" 
 * 
 20 
 ) 
 print 
 ( 
 f 
 "First alternative of result 
 { 
 i 
 } 
 " 
 ) 
 print 
 ( 
 f 
 "Transcript: 
 { 
 alternative 
 . 
 transcript 
 } 
 " 
 ) 
 return 
 response 
 

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries . For more information, see the Speech-to-Text Java API reference documentation .

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment .

  /** 
 * Transcribe the given audio file using an enhanced model. 
 * 
 * @param fileName the path to an audio file. 
 */ 
 public 
  
 static 
  
 void 
  
 transcribeFileWithEnhancedModel 
 ( 
 String 
  
 fileName 
 ) 
  
 throws 
  
 Exception 
  
 { 
  
 Path 
  
 path 
  
 = 
  
 Paths 
 . 
 get 
 ( 
 fileName 
 ); 
  
 byte 
 [] 
  
 content 
  
 = 
  
 Files 
 . 
 readAllBytes 
 ( 
 path 
 ); 
  
 try 
  
 ( 
 SpeechClient 
  
 speechClient 
  
 = 
  
 SpeechClient 
 . 
 create 
 ()) 
  
 { 
  
 // Get the contents of the local audio file 
  
 RecognitionAudio 
  
 recognitionAudio 
  
 = 
  
 RecognitionAudio 
 . 
 newBuilder 
 (). 
 setContent 
 ( 
 ByteString 
 . 
 copyFrom 
 ( 
 content 
 )). 
 build 
 (); 
  
 // Configure request to enable enhanced models 
  
 RecognitionConfig 
  
 config 
  
 = 
  
 RecognitionConfig 
 . 
 newBuilder 
 () 
  
 . 
 setEncoding 
 ( 
 AudioEncoding 
 . 
 LINEAR16 
 ) 
  
 . 
 setLanguageCode 
 ( 
 "en-US" 
 ) 
  
 . 
 setSampleRateHertz 
 ( 
 8000 
 ) 
  
 . 
 setUseEnhanced 
 ( 
 true 
 ) 
  
 // A model must be specified to use enhanced model. 
  
 . 
 setModel 
 ( 
 "phone_call" 
 ) 
  
 . 
 build 
 (); 
  
 // Perform the transcription request 
  
 RecognizeResponse 
  
 recognizeResponse 
  
 = 
  
 speechClient 
 . 
 recognize 
 ( 
 config 
 , 
  
 recognitionAudio 
 ); 
  
 // Print out the results 
  
 for 
  
 ( 
 SpeechRecognitionResult 
  
 result 
  
 : 
  
 recognizeResponse 
 . 
 getResultsList 
 ()) 
  
 { 
  
 // There can be several alternative transcripts for a given chunk of speech. Just use the 
  
 // first (most likely) one here. 
  
 SpeechRecognitionAlternative 
  
 alternative 
  
 = 
  
 result 
 . 
 getAlternatives 
 ( 
 0 
 ); 
  
 System 
 . 
 out 
 . 
 format 
 ( 
 "Transcript: %s\n\n" 
 , 
  
 alternative 
 . 
 getTranscript 
 ()); 
  
 } 
  
 } 
 } 
 

Node.js

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries . For more information, see the Speech-to-Text Node.js API reference documentation .

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment .

  // Imports the Google Cloud client library for Beta API 
 /** 
 * TODO(developer): Update client library import to use new 
 * version of API when desired features become available 
 */ 
 const 
  
 speech 
  
 = 
  
 require 
 ( 
 ' @google-cloud/speech 
' 
 ). 
 v1p1beta1 
 ; 
 const 
  
 fs 
  
 = 
  
 require 
 ( 
 'fs' 
 ); 
 // Creates a client 
 const 
  
 client 
  
 = 
  
 new 
  
 speech 
 . 
  SpeechClient 
 
 (); 
 /** 
 * TODO(developer): Uncomment the following lines before running the sample. 
 */ 
 // const filename = 'Local path to audio file, e.g. /path/to/audio.raw'; 
 // const encoding = 'Encoding of the audio file, e.g. LINEAR16'; 
 // const sampleRateHertz = 16000; 
 // const languageCode = 'BCP-47 language code, e.g. en-US'; 
 const 
  
 config 
  
 = 
  
 { 
  
 encoding 
 : 
  
 encoding 
 , 
  
 languageCode 
 : 
  
 languageCode 
 , 
  
 useEnhanced 
 : 
  
 true 
 , 
  
 model 
 : 
  
 'phone_call' 
 , 
 }; 
 const 
  
 audio 
  
 = 
  
 { 
  
 content 
 : 
  
 fs 
 . 
 readFileSync 
 ( 
 filename 
 ). 
 toString 
 ( 
 'base64' 
 ), 
 }; 
 const 
  
 request 
  
 = 
  
 { 
  
 config 
 : 
  
 config 
 , 
  
 audio 
 : 
  
 audio 
 , 
 }; 
 // Detects speech in the audio file 
 const 
  
 [ 
 response 
 ] 
  
 = 
  
 await 
  
 client 
 . 
 recognize 
 ( 
 request 
 ); 
 response 
 . 
 results 
 . 
 forEach 
 ( 
 result 
  
 = 
>  
 { 
  
 const 
  
 alternative 
  
 = 
  
 result 
 . 
 alternatives 
 [ 
 0 
 ]; 
  
 console 
 . 
 log 
 ( 
 alternative 
 . 
 transcript 
 ); 
 }); 
 

Additional languages

C#: Please follow the C# setup instructions on the client libraries page and then visit the Speech-to-Text reference documentation for .NET.

PHP: Please follow the PHP setup instructions on the client libraries page and then visit the Speech-to-Text reference documentation for PHP.

Ruby: Please follow the Ruby setup instructions on the client libraries page and then visit the Speech-to-Text reference documentation for Ruby.

What's next

Review how to make synchronous transcription requests .

Create a Mobile Website
View Site in Mobile | Classic
Share by: