Identify the different speakers in the audio sample.
Code sample
Java
To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.
To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import
com.google.cloud.speech.v1. RecognitionAudio
;
import
com.google.cloud.speech.v1. RecognitionConfig
;
import
com.google.cloud.speech.v1. RecognizeResponse
;
import
com.google.cloud.speech.v1. SpeakerDiarizationConfig
;
import
com.google.cloud.speech.v1. SpeechClient
;
import
com.google.cloud.speech.v1. SpeechRecognitionAlternative
;
import
com.google.cloud.speech.v1. WordInfo
;
import
com.google.protobuf. ByteString
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
class
TranscribeDiarization
{
static
void
transcribeDiarization
()
throws
IOException
{
// TODO(developer): Replace these variables before running the sample.
String
fileName
=
"resources/commercial_mono.wav"
;
transcribeDiarization
(
fileName
);
}
// Transcribe the given audio file using speaker diarization.
static
void
transcribeDiarization
(
String
fileName
)
throws
IOException
{
Path
path
=
Paths
.
get
(
fileName
);
byte
[]
content
=
Files
.
readAllBytes
(
path
);
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try
(
SpeechClient
client
=
SpeechClient
.
create
())
{
// Get the contents of the local audio file
RecognitionAudio
recognitionAudio
=
RecognitionAudio
.
newBuilder
().
setContent
(
ByteString
.
copyFrom
(
content
)).
build
();
SpeakerDiarizationConfig
speakerDiarizationConfig
=
SpeakerDiarizationConfig
.
newBuilder
()
.
setEnableSpeakerDiarization
(
true
)
.
setMinSpeakerCount
(
2
)
.
setMaxSpeakerCount
(
2
)
.
build
();
// Configure request to enable Speaker diarization
RecognitionConfig
config
=
RecognitionConfig
.
newBuilder
()
.
setEncoding
(
RecognitionConfig
.
AudioEncoding
.
LINEAR16
)
.
setLanguageCode
(
"en-US"
)
.
setSampleRateHertz
(
8000
)
.
setDiarizationConfig
(
speakerDiarizationConfig
)
.
build
();
// Perform the transcription request
RecognizeResponse
recognizeResponse
=
client
.
recognize
(
config
,
recognitionAudio
);
// Speaker Tags are only included in the last result object, which has only one alternative.
SpeechRecognitionAlternative
alternative
=
recognizeResponse
.
getResults
(
recognizeResponse
.
getResults
Count ()
-
1
).
getAlternatives
(
0
);
// The alternative is made up of WordInfo objects that contain the speaker_tag.
WordInfo
wordInfo
=
alternative
.
getWords
(
0
);
int
currentSpeakerTag
=
wordInfo
.
getSpeakerTag
();
// For each word, get all the words associated with one speaker, once the speaker changes,
// add a new line with the new speaker and their spoken words.
StringBuilder
speakerWords
=
new
StringBuilder
(
String
.
format
(
"Speaker %d: %s"
,
wordInfo
.
getSpeakerTag
(),
wordInfo
.
getWord
()));
for
(
int
i
=
1
;
i
<
alternative
.
getWordsCount
();
i
++
)
{
wordInfo
=
alternative
.
getWords
(
i
);
if
(
currentSpeakerTag
==
wordInfo
.
getSpeakerTag
())
{
speakerWords
.
append
(
" "
);
speakerWords
.
append
(
wordInfo
.
getWord
());
}
else
{
speakerWords
.
append
(
String
.
format
(
"\nSpeaker %d: %s"
,
wordInfo
.
getSpeakerTag
(),
wordInfo
.
getWord
()));
currentSpeakerTag
=
wordInfo
.
getSpeakerTag
();
}
}
System
.
out
.
println
(
speakerWords
.
toString
());
}
}
}
Node.js
To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Node.js API reference documentation.
To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

// Set config for Diarization
const diarizationConfig = {
  enableSpeakerDiarization: true,
  maxSpeakerCount: 2,
};

const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 8000,
  languageCode: 'en-US',
  diarizationConfig: diarizationConfig,
  model: 'phone_call',
};

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

// Read the local file and send it inline as base64-encoded content.
const audio = {
  content: fs.readFileSync(fileName).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

const [response] = await client.recognize(request);

// Join the top transcript of every result into one newline-separated string.
const transcriptParts = response.results.map(r => r.alternatives[0].transcript);
const transcription = transcriptParts.join('\n');
console.log(`Transcription: ${transcription}`);

console.log('Speaker Diarization:');
const lastIndex = response.results.length - 1;
const result = response.results[lastIndex];
const wordsInfo = result.alternatives[0].words;
// Note: The transcript within each result is separate and sequential per result.
// However, the words list within an alternative includes all the words
// from all the results thus far. Thus, to get all the words with speaker
// tags, you only have to take the words list from the last result:
for (const a of wordsInfo) {
  console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`);
}
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.