/*
 * Copyright (c) 2017 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

using CommandLine;
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Grpc.Auth;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace GoogleCloudSamples
{
    /// <summary>
    /// Command-line options for verbs that accept a local audio file only.
    /// </summary>
    class Options
    {
        [Value(0, HelpText = "A path to a sound file. Encoding must be "
            + "Linear16 with a sample rate of 16000.", Required = true)]
        public string FilePath { get; set; }
    }

    /// <summary>
    /// Command-line options for verbs that accept either a local file path
    /// or a Google Cloud Storage URI.
    /// </summary>
    class StorageOptions
    {
        [Value(0, HelpText = "A path to a sound file. "
            + "Can be a local file path or a Google Cloud Storage path like "
            + "gs://my-bucket/my-object. "
            + "Encoding must be "
            + "Linear16 with a sample rate of 16000.", Required = true)]
        public string FilePath { get; set; }
    }

    [Verb("sync", HelpText = "Detects speech in an audio file.")]
    class SyncOptions : StorageOptions
    {
        [Option('w', HelpText = "Report the time offsets of individual words.")]
        public bool EnableWordTimeOffsets { get; set; }
    }

    [Verb("with-context", HelpText = "Detects speech in an audio file."
        + " Add additional context on stdin.")]
    class OptionsWithContext : StorageOptions
    {
    }

    [Verb("async", HelpText = "Creates a job to detect speech in an audio "
        + "file, and waits for the job to complete.")]
    class AsyncOptions : StorageOptions
    {
        [Option('w', HelpText = "Report the time offsets of individual words.")]
        public bool EnableWordTimeOffsets { get; set; }
    }

    [Verb("sync-creds", HelpText = "Detects speech in an audio file.")]
    class SyncOptionsWithCreds
    {
        [Value(0, HelpText = "A path to a sound file. Encoding must be "
            + "Linear16 with a sample rate of 16000.", Required = true)]
        public string FilePath { get; set; }

        [Value(1, HelpText = "Path to Google credentials json file.",
            Required = true)]
        public string CredentialsFilePath { get; set; }
    }

    [Verb("stream", HelpText = "Detects speech in an audio file by streaming "
        + "it to the Speech API.")]
    class StreamingOptions : Options
    {
    }

    [Verb("listen", HelpText = "Detects speech in a microphone input stream.")]
    class ListenOptions
    {
        [Value(0, HelpText = "Number of seconds to listen for.",
            Required = false)]
        public int Seconds { get; set; } = 3;
    }

    // NOTE: the HelpText below was originally broken across a raw newline
    // inside the string literal, which does not compile; it is now split
    // with explicit concatenation.
    [Verb("rec", HelpText = "Detects speech in an audio file. "
        + "Supports other file formats.")]
    class RecOptions : Options
    {
        [Option('b', Default = 16000, HelpText = "Sample rate in bits per second.")]
        public int BitRate { get; set; }

        [Option('e', Default = RecognitionConfig.Types.AudioEncoding.Linear16,
            HelpText = "Audio file encoding format.")]
        public RecognitionConfig.Types.AudioEncoding Encoding { get; set; }
    }

    public class Recognize
    {
        /// <summary>
        /// Performs synchronous recognition on a local file with a
        /// caller-supplied sample rate and encoding, printing each
        /// alternative transcript to stdout.
        /// </summary>
        static object Rec(string filePath, int bitRate,
            RecognitionConfig.Types.AudioEncoding encoding)
        {
            var speech = SpeechClient.Create();
            var response = speech.Recognize(new RecognitionConfig()
            {
                Encoding = encoding,
                SampleRateHertz = bitRate,
                LanguageCode = "en",
            }, RecognitionAudio.FromFile(filePath));
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }

        // [START speech_sync_recognize]
        /// <summary>
        /// Performs synchronous recognition on a local Linear16 / 16 kHz
        /// file and prints each alternative transcript.
        /// </summary>
        static object SyncRecognize(string filePath)
        {
            var speech = SpeechClient.Create();
            var response = speech.Recognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            }, RecognitionAudio.FromFile(filePath));
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }
        // [END speech_sync_recognize]

        // [START speech_sync_recognize_words]
        /// <summary>
        /// Like <see cref="SyncRecognize"/>, but also requests and prints
        /// the start/end time offset of every recognized word.
        /// </summary>
        static object SyncRecognizeWords(string filePath)
        {
            var speech = SpeechClient.Create();
            var response = speech.Recognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
                EnableWordTimeOffsets = true,
            }, RecognitionAudio.FromFile(filePath));
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine($"Transcript: {alternative.Transcript}");
                    Console.WriteLine("Word details:");
                    Console.WriteLine($" Word count:{alternative.Words.Count}");
                    foreach (var item in alternative.Words)
                    {
                        Console.WriteLine($"  {item.Word}");
                        Console.WriteLine($"    WordStartTime: {item.StartTime}");
                        Console.WriteLine($"    WordEndTime: {item.EndTime}");
                    }
                }
            }
            return 0;
        }
        // [END speech_sync_recognize_words]

        /// <summary>
        /// Reads a list of phrases from stdin. Stops at the first blank line.
        /// </summary>
        static List<string> ReadPhrases()
        {
            Console.Write("Reading phrases from stdin. Finish with blank line.\n> ");
            var phrases = new List<string>();
            string line = Console.ReadLine();
            while (!string.IsNullOrWhiteSpace(line))
            {
                phrases.Add(line.Trim());
                Console.Write("> ");
                line = Console.ReadLine();
            }
            return phrases;
        }

        /// <summary>
        /// Performs synchronous recognition with speech-context phrase hints
        /// that bias the recognizer toward the supplied vocabulary.
        /// </summary>
        static object RecognizeWithContext(string filePath, IEnumerable<string> phrases)
        {
            var speech = SpeechClient.Create();
            var config = new RecognitionConfig()
            {
                SpeechContexts = { new SpeechContext() { Phrases = { phrases } } },
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            };
            var audio = IsStorageUri(filePath)
                ? RecognitionAudio.FromStorageUri(filePath)
                : RecognitionAudio.FromFile(filePath);
            var response = speech.Recognize(config, audio);
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }

        /// <summary>
        /// Performs synchronous recognition using explicit credentials loaded
        /// from a service-account JSON file instead of application-default
        /// credentials.
        /// </summary>
        static object SyncRecognizeWithCredentials(string filePath,
            string credentialsFilePath)
        {
            GoogleCredential googleCredential;
            using (Stream m = new FileStream(credentialsFilePath, FileMode.Open))
                googleCredential = GoogleCredential.FromStream(m);
            var channel = new Grpc.Core.Channel(SpeechClient.DefaultEndpoint.Host,
                googleCredential.ToChannelCredentials());
            var speech = SpeechClient.Create(channel);
            var response = speech.Recognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            }, RecognitionAudio.FromFile(filePath));
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }

        // [START speech_sync_recognize_gcs]
        /// <summary>
        /// Performs synchronous recognition on an audio file stored in
        /// Google Cloud Storage (gs://... URI).
        /// </summary>
        static object SyncRecognizeGcs(string storageUri)
        {
            var speech = SpeechClient.Create();
            var response = speech.Recognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            }, RecognitionAudio.FromStorageUri(storageUri));
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }
        // [END speech_sync_recognize_gcs]

        // [START speech_async_recognize]
        /// <summary>
        /// Starts a long-running recognition job for a local file and blocks
        /// until the operation completes, then prints the transcripts.
        /// </summary>
        static object LongRunningRecognize(string filePath)
        {
            var speech = SpeechClient.Create();
            var longOperation = speech.LongRunningRecognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            }, RecognitionAudio.FromFile(filePath));
            longOperation = longOperation.PollUntilCompleted();
            var response = longOperation.Result;
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
            return 0;
        }
        // [END speech_async_recognize]

        // [START speech_async_recognize_gcs]
        /// <summary>
        /// Starts a long-running recognition job for a Cloud Storage object
        /// and blocks until it completes, then prints the transcripts.
        /// </summary>
        static object AsyncRecognizeGcs(string storageUri)
        {
            var speech = SpeechClient.Create();
            var longOperation = speech.LongRunningRecognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
            }, RecognitionAudio.FromStorageUri(storageUri));
            longOperation = longOperation.PollUntilCompleted();
            var response = longOperation.Result;
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine($"Transcript: {alternative.Transcript}");
                }
            }
            return 0;
        }
        // [END speech_async_recognize_gcs]

        // [START speech_async_recognize_gcs_words]
        /// <summary>
        /// Like <see cref="AsyncRecognizeGcs"/>, but also requests and prints
        /// per-word time offsets.
        /// </summary>
        static object AsyncRecognizeGcsWords(string storageUri)
        {
            var speech = SpeechClient.Create();
            var longOperation = speech.LongRunningRecognize(new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "en",
                EnableWordTimeOffsets = true,
            }, RecognitionAudio.FromStorageUri(storageUri));
            longOperation = longOperation.PollUntilCompleted();
            var response = longOperation.Result;
            foreach (var result in response.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine($"Transcript: {alternative.Transcript}");
                    Console.WriteLine("Word details:");
                    Console.WriteLine($" Word count:{alternative.Words.Count}");
                    foreach (var item in alternative.Words)
                    {
                        Console.WriteLine($"  {item.Word}");
                        Console.WriteLine($"    WordStartTime: {item.StartTime}");
                        Console.WriteLine($"    WordEndTime: {item.EndTime}");
                    }
                }
            }
            return 0;
        }
        // [END speech_async_recognize_gcs_words]

        /// <summary>
        /// Stream the content of the file to the API in 32kb chunks.
        /// </summary>
        // [START speech_streaming_recognize]
        static async Task<object> StreamingRecognizeAsync(string filePath)
        {
            var speech = SpeechClient.Create();
            var streamingCall = speech.StreamingRecognize();
            // Write the initial request with the config.
            await streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    StreamingConfig = new StreamingRecognitionConfig()
                    {
                        Config = new RecognitionConfig()
                        {
                            Encoding =
                            RecognitionConfig.Types.AudioEncoding.Linear16,
                            SampleRateHertz = 16000,
                            LanguageCode = "en",
                        },
                        InterimResults = true,
                    }
                });
            // Print responses as they arrive.
            Task printResponses = Task.Run(async () =>
            {
                while (await streamingCall.ResponseStream.MoveNext(
                    default(CancellationToken)))
                {
                    foreach (var result in streamingCall.ResponseStream
                        .Current.Results)
                    {
                        foreach (var alternative in result.Alternatives)
                        {
                            Console.WriteLine(alternative.Transcript);
                        }
                    }
                }
            });
            // Stream the file content to the API. Write 2 32kb chunks per
            // second.
            using (FileStream fileStream = new FileStream(filePath, FileMode.Open))
            {
                var buffer = new byte[32 * 1024];
                int bytesRead;
                while ((bytesRead = await fileStream.ReadAsync(
                    buffer, 0, buffer.Length)) > 0)
                {
                    await streamingCall.WriteAsync(
                        new StreamingRecognizeRequest()
                        {
                            AudioContent = Google.Protobuf.ByteString
                                .CopyFrom(buffer, 0, bytesRead),
                        });
                    // Throttle to roughly real-time playback speed.
                    await Task.Delay(500);
                }
            }
            await streamingCall.WriteCompleteAsync();
            await printResponses;
            return 0;
        }
        // [END speech_streaming_recognize]

        // [START speech_streaming_mic_recognize]
        /// <summary>
        /// Streams microphone audio to the API for the given number of
        /// seconds, printing interim and final transcripts as they arrive.
        /// Returns -1 when no capture device is available.
        /// </summary>
        static async Task<object> StreamingMicRecognizeAsync(int seconds)
        {
            if (NAudio.Wave.WaveIn.DeviceCount < 1)
            {
                Console.WriteLine("No microphone!");
                return -1;
            }
            var speech = SpeechClient.Create();
            var streamingCall = speech.StreamingRecognize();
            // Write the initial request with the config.
            await streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    StreamingConfig = new StreamingRecognitionConfig()
                    {
                        Config = new RecognitionConfig()
                        {
                            Encoding =
                            RecognitionConfig.Types.AudioEncoding.Linear16,
                            SampleRateHertz = 16000,
                            LanguageCode = "en",
                        },
                        InterimResults = true,
                    }
                });
            // Print responses as they arrive.
            Task printResponses = Task.Run(async () =>
            {
                while (await streamingCall.ResponseStream.MoveNext(
                    default(CancellationToken)))
                {
                    foreach (var result in streamingCall.ResponseStream
                        .Current.Results)
                    {
                        foreach (var alternative in result.Alternatives)
                        {
                            Console.WriteLine(alternative.Transcript);
                        }
                    }
                }
            });
            // Read from the microphone and stream to API.
            // The lock serializes writes to the gRPC request stream, which
            // only supports one pending WriteAsync at a time; writeMore lets
            // us stop writing before calling WriteCompleteAsync.
            object writeLock = new object();
            bool writeMore = true;
            var waveIn = new NAudio.Wave.WaveInEvent();
            waveIn.DeviceNumber = 0;
            waveIn.WaveFormat = new NAudio.Wave.WaveFormat(16000, 1);
            waveIn.DataAvailable +=
                (object sender, NAudio.Wave.WaveInEventArgs args) =>
                {
                    lock (writeLock)
                    {
                        if (!writeMore) return;
                        streamingCall.WriteAsync(
                            new StreamingRecognizeRequest()
                            {
                                AudioContent = Google.Protobuf.ByteString
                                    .CopyFrom(args.Buffer, 0, args.BytesRecorded)
                            }).Wait();
                    }
                };
            waveIn.StartRecording();
            Console.WriteLine("Speak now.");
            await Task.Delay(TimeSpan.FromSeconds(seconds));
            // Stop recording and shut down.
            waveIn.StopRecording();
            lock (writeLock) writeMore = false;
            await streamingCall.WriteCompleteAsync();
            await printResponses;
            return 0;
        }
        // [END speech_streaming_mic_recognize]

        /// <summary>
        /// Returns true when the path looks like a Google Cloud Storage URI
        /// (gs://...). StartsWith with OrdinalIgnoreCase replaces the original
        /// Substring(0, 4).ToLower() comparison, which threw
        /// ArgumentOutOfRangeException for paths shorter than 4 characters and
        /// was culture-sensitive.
        /// </summary>
        static bool IsStorageUri(string s) =>
            s != null && s.StartsWith("gs:/", StringComparison.OrdinalIgnoreCase);

        /// <summary>
        /// Entry point: parses the verb and dispatches to the matching sample.
        /// Blocking on .Result is acceptable here because a console Main has
        /// no synchronization context to deadlock against.
        /// </summary>
        public static int Main(string[] args)
        {
            return (int)Parser.Default.ParseArguments<
                SyncOptions, AsyncOptions, StreamingOptions,
                ListenOptions, RecOptions, SyncOptionsWithCreds,
                OptionsWithContext
                >(args).MapResult(
                (SyncOptions opts) => IsStorageUri(opts.FilePath) ?
                    SyncRecognizeGcs(opts.FilePath) :
                    opts.EnableWordTimeOffsets ?
                    SyncRecognizeWords(opts.FilePath) :
                    SyncRecognize(opts.FilePath),
                (AsyncOptions opts) => IsStorageUri(opts.FilePath) ?
                    (opts.EnableWordTimeOffsets ?
                    AsyncRecognizeGcsWords(opts.FilePath) :
                    AsyncRecognizeGcs(opts.FilePath)) :
                    LongRunningRecognize(opts.FilePath),
                (StreamingOptions opts) =>
                    StreamingRecognizeAsync(opts.FilePath).Result,
                (ListenOptions opts) =>
                    StreamingMicRecognizeAsync(opts.Seconds).Result,
                (RecOptions opts) => Rec(opts.FilePath, opts.BitRate, opts.Encoding),
                (SyncOptionsWithCreds opts) => SyncRecognizeWithCredentials(
                    opts.FilePath, opts.CredentialsFilePath),
                (OptionsWithContext opts) => RecognizeWithContext(opts.FilePath,
                    ReadPhrases()),
                errs => 1);
        }
    }
}