Files
Examples/C#/api/Recognize/Recognize.cs
2018-04-20 10:15:15 +01:00

503 lines
20 KiB
C#

/*
* Copyright (c) 2017 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
using CommandLine;
using Google.Apis.Auth.OAuth2;
using Google.Cloud.Speech.V1;
using Grpc.Auth;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
namespace GoogleCloudSamples
{
/// <summary>
/// Base option set carrying a single positional audio file path.
/// StreamingOptions and RecOptions derive from it.
/// </summary>
class Options
{
    /// <summary>First positional argument (required): the audio file path.</summary>
    [Value(0, Required = true,
        HelpText = "A path to a sound file. Encoding must be Linear16 with a sample rate of 16000.")]
    public string FilePath { get; set; }
}
/// <summary>
/// Base option set for verbs whose positional path may be either a local
/// file or a Google Cloud Storage object.
/// </summary>
class StorageOptions
{
    /// <summary>First positional argument (required): local path or gs:// URI.</summary>
    [Value(0, Required = true,
        HelpText = "A path to a sound file. Can be a local file path or a "
            + "Google Cloud Storage path like gs://my-bucket/my-object. "
            + "Encoding must be Linear16 with a sample rate of 16000.")]
    public string FilePath { get; set; }
}
/// <summary>Arguments for the "sync" verb.</summary>
[Verb("sync", HelpText = "Detects speech in an audio file.")]
class SyncOptions : StorageOptions
{
    /// <summary>Set by the -w switch to request per-word time offsets.</summary>
    [Option('w', HelpText = "Report the time offsets of individual words.")]
    public bool EnableWordTimeOffsets { get; set; }
}
/// <summary>
/// Arguments for the "with-context" verb; it adds nothing beyond the
/// inherited file path.
/// </summary>
[Verb("with-context",
    HelpText = "Detects speech in an audio file. Add additional context on stdin.")]
class OptionsWithContext : StorageOptions
{
}
/// <summary>Arguments for the "async" verb.</summary>
[Verb("async",
    HelpText = "Creates a job to detect speech in an audio file, and waits for the job to complete.")]
class AsyncOptions : StorageOptions
{
    /// <summary>Set by the -w switch to request per-word time offsets.</summary>
    [Option('w', HelpText = "Report the time offsets of individual words.")]
    public bool EnableWordTimeOffsets { get; set; }
}
/// <summary>
/// Arguments for the "sync-creds" verb: an audio file plus an explicit
/// credentials file.
/// </summary>
[Verb("sync-creds", HelpText = "Detects speech in an audio file.")]
class SyncOptionsWithCreds
{
    /// <summary>First positional argument (required): the audio file path.</summary>
    [Value(0, Required = true,
        HelpText = "A path to a sound file. Encoding must be Linear16 with a sample rate of 16000.")]
    public string FilePath { get; set; }

    /// <summary>Second positional argument (required): the credentials JSON path.</summary>
    [Value(1, Required = true, HelpText = "Path to Google credentials json file.")]
    public string CredentialsFilePath { get; set; }
}
/// <summary>
/// Arguments for the "stream" verb; it adds nothing beyond the inherited
/// file path.
/// </summary>
[Verb("stream",
    HelpText = "Detects speech in an audio file by streaming it to the Speech API.")]
class StreamingOptions : Options
{
}
/// <summary>Arguments for the "listen" verb.</summary>
[Verb("listen", HelpText = "Detects speech in a microphone input stream.")]
class ListenOptions
{
    /// <summary>
    /// Optional first positional argument: how long to record, in seconds.
    /// The property initializer supplies 3 when the property is not set.
    /// </summary>
    [Value(0, Required = false, HelpText = "Number of seconds to listen for.")]
    public int Seconds { get; set; } = 3;
}
/// <summary>
/// Arguments for the "rec" verb, which lets the caller choose the audio
/// encoding and sample rate instead of the fixed values used by other verbs.
/// </summary>
[Verb("rec", HelpText = "Detects speech in an audio file. Supports other file formats.")]
class RecOptions : Options
{
    /// <summary>
    /// Sample rate of the audio. Despite the property name, this value is
    /// assigned to RecognitionConfig.SampleRateHertz, so the unit is hertz;
    /// the help text now says so instead of "bits per second".
    /// </summary>
    [Option('b', Default = 16000, HelpText = "Sample rate in hertz (samples per second).")]
    public int BitRate { get; set; }

    /// <summary>Audio encoding declared to the API (-e); defaults to Linear16.</summary>
    [Option('e', Default = RecognitionConfig.Types.AudioEncoding.Linear16,
        HelpText = "Audio file encoding format.")]
    public RecognitionConfig.Types.AudioEncoding Encoding { get; set; }
}
public class Recognize
{
/// <summary>
/// Transcribes a local audio file using a caller-chosen encoding and sample
/// rate, writing every alternative transcript to stdout.
/// </summary>
/// <param name="filePath">Path of the local audio file to send.</param>
/// <param name="bitRate">Value assigned to SampleRateHertz in the request.</param>
/// <param name="encoding">Audio encoding declared in the request.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object Rec(string filePath, int bitRate,
    RecognitionConfig.Types.AudioEncoding encoding)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = encoding,
        SampleRateHertz = bitRate,
        LanguageCode = "en",
    };
    RecognitionAudio audio = RecognitionAudio.FromFile(filePath);
    var reply = client.Recognize(config, audio);
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine(candidate.Transcript);
        }
    }
    return 0;
}
// [START speech_sync_recognize]
/// <summary>
/// Sends a local file to the synchronous Recognize call, declared as
/// Linear16 at 16000 Hz, and prints each alternative transcript on its
/// own line.
/// </summary>
/// <param name="filePath">Path of the local audio file to send.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object SyncRecognize(string filePath)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    };
    RecognitionAudio audio = RecognitionAudio.FromFile(filePath);
    var reply = client.Recognize(config, audio);
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine(candidate.Transcript);
        }
    }
    return 0;
}
// [END speech_sync_recognize]
// [START speech_sync_recognize_words]
/// <summary>
/// Synchronously recognizes a local file with word time offsets enabled and
/// prints, for every alternative, the transcript followed by each word and
/// its start and end time.
/// </summary>
/// <param name="filePath">Path of the local audio file to send.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object SyncRecognizeWords(string filePath)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
        EnableWordTimeOffsets = true,
    };
    RecognitionAudio audio = RecognitionAudio.FromFile(filePath);
    var reply = client.Recognize(config, audio);
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine($"Transcript: {candidate.Transcript}");
            Console.WriteLine("Word details:");
            Console.WriteLine($" Word count:{candidate.Words.Count}");
            foreach (var wordInfo in candidate.Words)
            {
                Console.WriteLine($" {wordInfo.Word}");
                Console.WriteLine($" WordStartTime: {wordInfo.StartTime}");
                Console.WriteLine($" WordEndTime: {wordInfo.EndTime}");
            }
        }
    }
    return 0;
}
// [END speech_sync_recognize_words]
/// <summary>
/// Prompts on stdout and collects trimmed lines from stdin until a null,
/// empty, or whitespace-only line is read.
/// </summary>
/// <returns>The collected phrases, in the order they were entered.</returns>
static List<string> ReadPhrases()
{
    Console.Write("Reading phrases from stdin. Finish with blank line.\n> ");
    var collected = new List<string>();
    for (string input = Console.ReadLine();
        !string.IsNullOrWhiteSpace(input);
        input = Console.ReadLine())
    {
        collected.Add(input.Trim());
        // Re-prompt before the next read.
        Console.Write("> ");
    }
    return collected;
}
/// <summary>
/// Synchronously recognizes audio from a local file or a Cloud Storage URI,
/// supplying the given phrases to the API as a speech context, and prints
/// each alternative transcript.
/// </summary>
/// <param name="filePath">Local path or storage URI (as judged by IsStorageUri).</param>
/// <param name="phrases">Phrases added to the request's single SpeechContext.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object RecognizeWithContext(string filePath, IEnumerable<string> phrases)
{
    SpeechClient client = SpeechClient.Create();

    var hints = new SpeechContext();
    hints.Phrases.Add(phrases);
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    };
    config.SpeechContexts.Add(hints);

    RecognitionAudio audio;
    if (IsStorageUri(filePath))
    {
        audio = RecognitionAudio.FromStorageUri(filePath);
    }
    else
    {
        audio = RecognitionAudio.FromFile(filePath);
    }

    var reply = client.Recognize(config, audio);
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine(candidate.Transcript);
        }
    }
    return 0;
}
/// <summary>
/// Synchronously recognizes a local audio file using credentials loaded
/// from an explicit JSON file, and prints each alternative transcript.
/// </summary>
/// <param name="filePath">Path of the local audio file to send.</param>
/// <param name="credentialsFilePath">Path of the Google credentials JSON file.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object SyncRecognizeWithCredentials(string filePath, string credentialsFilePath)
{
    GoogleCredential googleCredential;
    // File.OpenRead asks for read-only access. new FileStream(path, FileMode.Open)
    // requests read/write access and therefore fails on read-only key files.
    using (Stream credentialStream = File.OpenRead(credentialsFilePath))
    {
        googleCredential = GoogleCredential.FromStream(credentialStream);
    }

    var channel = new Grpc.Core.Channel(SpeechClient.DefaultEndpoint.Host,
        googleCredential.ToChannelCredentials());
    try
    {
        var speech = SpeechClient.Create(channel);
        var response = speech.Recognize(new RecognitionConfig()
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
            SampleRateHertz = 16000,
            LanguageCode = "en",
        }, RecognitionAudio.FromFile(filePath));
        foreach (var result in response.Results)
        {
            foreach (var alternative in result.Alternatives)
            {
                Console.WriteLine(alternative.Transcript);
            }
        }
    }
    finally
    {
        // This method created the channel, so it is responsible for releasing it.
        channel.ShutdownAsync().Wait();
    }
    return 0;
}
// [START speech_sync_recognize_gcs]
/// <summary>
/// Synchronously recognizes audio referenced by a Cloud Storage URI,
/// declared as Linear16 at 16000 Hz, and prints each alternative transcript.
/// </summary>
/// <param name="storageUri">URI handed to RecognitionAudio.FromStorageUri.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object SyncRecognizeGcs(string storageUri)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    };
    RecognitionAudio audio = RecognitionAudio.FromStorageUri(storageUri);
    var reply = client.Recognize(config, audio);
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine(candidate.Transcript);
        }
    }
    return 0;
}
// [END speech_sync_recognize_gcs]
// [START speech_async_recognize]
/// <summary>
/// Starts a long-running recognition operation for a local file, polls
/// until it completes, and prints each alternative transcript.
/// </summary>
/// <param name="filePath">Path of the local audio file to send.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object LongRunningRecognize(string filePath)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    };
    RecognitionAudio audio = RecognitionAudio.FromFile(filePath);
    var pending = client.LongRunningRecognize(config, audio);
    // Blocks the calling thread until the operation reports completion.
    var finished = pending.PollUntilCompleted();
    var reply = finished.Result;
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine(candidate.Transcript);
        }
    }
    return 0;
}
// [END speech_async_recognize]
// [START speech_async_recognize_gcs]
/// <summary>
/// Starts a long-running recognition operation for audio referenced by a
/// Cloud Storage URI, polls until it completes, and prints each alternative
/// transcript prefixed with "Transcript: ".
/// </summary>
/// <param name="storageUri">URI handed to RecognitionAudio.FromStorageUri.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object AsyncRecognizeGcs(string storageUri)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    };
    RecognitionAudio audio = RecognitionAudio.FromStorageUri(storageUri);
    var pending = client.LongRunningRecognize(config, audio);
    // Blocks the calling thread until the operation reports completion.
    var finished = pending.PollUntilCompleted();
    var reply = finished.Result;
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine($"Transcript: {candidate.Transcript}");
        }
    }
    return 0;
}
// [END speech_async_recognize_gcs]
// [START speech_async_recognize_gcs_words]
/// <summary>
/// Long-running recognition of audio referenced by a Cloud Storage URI with
/// word time offsets enabled. Prints, for every alternative, the transcript
/// followed by each word and its start and end time.
/// </summary>
/// <param name="storageUri">URI handed to RecognitionAudio.FromStorageUri.</param>
/// <returns>Always 0 (boxed); Main casts it to the process exit code.</returns>
static object AsyncRecognizeGcsWords(string storageUri)
{
    SpeechClient client = SpeechClient.Create();
    var config = new RecognitionConfig
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
        EnableWordTimeOffsets = true,
    };
    RecognitionAudio audio = RecognitionAudio.FromStorageUri(storageUri);
    var pending = client.LongRunningRecognize(config, audio);
    // Blocks the calling thread until the operation reports completion.
    var finished = pending.PollUntilCompleted();
    var reply = finished.Result;
    foreach (var hit in reply.Results)
    {
        foreach (var candidate in hit.Alternatives)
        {
            Console.WriteLine($"Transcript: {candidate.Transcript}");
            Console.WriteLine("Word details:");
            Console.WriteLine($" Word count:{candidate.Words.Count}");
            foreach (var wordInfo in candidate.Words)
            {
                Console.WriteLine($" {wordInfo.Word}");
                Console.WriteLine($" WordStartTime: {wordInfo.StartTime}");
                Console.WriteLine($" WordEndTime: {wordInfo.EndTime}");
            }
        }
    }
    return 0;
}
// [END speech_async_recognize_gcs_words]
/// <summary>
/// Streams the content of a local file to the API in 32 KiB chunks and
/// prints every alternative of every response as it arrives (interim
/// results are enabled).
/// </summary>
/// <param name="filePath">Path of the local audio file to stream.</param>
/// <returns>A task whose result is always 0 (boxed).</returns>
// [START speech_streaming_recognize]
static async Task<object> StreamingRecognizeAsync(string filePath)
{
    var speech = SpeechClient.Create();
    var streamingCall = speech.StreamingRecognize();
    // Write the initial request with the config.
    await streamingCall.WriteAsync(
        new StreamingRecognizeRequest()
        {
            StreamingConfig = new StreamingRecognitionConfig()
            {
                Config = new RecognitionConfig()
                {
                    Encoding =
                        RecognitionConfig.Types.AudioEncoding.Linear16,
                    SampleRateHertz = 16000,
                    LanguageCode = "en",
                },
                InterimResults = true,
            }
        });
    // Print responses as they arrive. This runs concurrently with the
    // upload loop below and is awaited at the end of the method.
    Task printResponses = Task.Run(async () =>
    {
        while (await streamingCall.ResponseStream.MoveNext(
            default(CancellationToken)))
        {
            foreach (var result in streamingCall.ResponseStream
                .Current.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
        }
    });
    // Stream the file content to the API. Write 2 32kb chunks per
    // second.
    // File.OpenRead requests read-only access. new FileStream(path, FileMode.Open)
    // asks for read/write access and fails on read-only audio files.
    using (FileStream fileStream = File.OpenRead(filePath))
    {
        var buffer = new byte[32 * 1024];
        int bytesRead;
        while ((bytesRead = await fileStream.ReadAsync(
            buffer, 0, buffer.Length)) > 0)
        {
            await streamingCall.WriteAsync(
                new StreamingRecognizeRequest()
                {
                    // Copy only the bytes actually read; the final chunk is
                    // usually shorter than the buffer.
                    AudioContent = Google.Protobuf.ByteString
                        .CopyFrom(buffer, 0, bytesRead),
                });
            await Task.Delay(500);
        }
    }
    // Signal end of audio, then wait for the remaining responses.
    await streamingCall.WriteCompleteAsync();
    await printResponses;
    return 0;
}
// [END speech_streaming_recognize]
// [START speech_streaming_mic_recognize]
/// <summary>
/// Records from microphone device 0 for the given number of seconds,
/// streams the captured audio to the API, and prints every alternative of
/// every response as it arrives (interim results are enabled).
/// </summary>
/// <param name="seconds">How long to record before stopping.</param>
/// <returns>
/// A task whose result is -1 (boxed) when no input device is present,
/// otherwise 0.
/// </returns>
static async Task<object> StreamingMicRecognizeAsync(int seconds)
{
    if (NAudio.Wave.WaveIn.DeviceCount < 1)
    {
        Console.WriteLine("No microphone!");
        return -1;
    }
    var speech = SpeechClient.Create();
    var streamingCall = speech.StreamingRecognize();
    // Write the initial request with the config.
    await streamingCall.WriteAsync(
        new StreamingRecognizeRequest()
        {
            StreamingConfig = new StreamingRecognitionConfig()
            {
                Config = new RecognitionConfig()
                {
                    Encoding =
                        RecognitionConfig.Types.AudioEncoding.Linear16,
                    SampleRateHertz = 16000,
                    LanguageCode = "en",
                },
                InterimResults = true,
            }
        });
    // Print responses as they arrive.
    Task printResponses = Task.Run(async () =>
    {
        while (await streamingCall.ResponseStream.MoveNext(
            default(CancellationToken)))
        {
            foreach (var result in streamingCall.ResponseStream
                .Current.Results)
            {
                foreach (var alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                }
            }
        }
    });
    // Read from the microphone and stream to API.
    object writeLock = new object();
    bool writeMore = true;
    // WaveInEvent is disposable. The using block releases it once all
    // responses have been drained, i.e. well after recording has stopped.
    using (var waveIn = new NAudio.Wave.WaveInEvent())
    {
        waveIn.DeviceNumber = 0;
        // Capture format matches the 16000 Hz config sent above; 1 = mono.
        waveIn.WaveFormat = new NAudio.Wave.WaveFormat(16000, 1);
        waveIn.DataAvailable +=
            (object sender, NAudio.Wave.WaveInEventArgs args) =>
            {
                // The lock serializes writes and guarantees that no write
                // starts after writeMore has been cleared below.
                lock (writeLock)
                {
                    if (!writeMore)
                    {
                        return;
                    }
                    streamingCall.WriteAsync(
                        new StreamingRecognizeRequest()
                        {
                            AudioContent = Google.Protobuf.ByteString
                                .CopyFrom(args.Buffer, 0, args.BytesRecorded)
                        }).Wait();
                }
            };
        waveIn.StartRecording();
        Console.WriteLine("Speak now.");
        await Task.Delay(TimeSpan.FromSeconds(seconds));
        // Stop recording and shut down.
        waveIn.StopRecording();
        lock (writeLock)
        {
            writeMore = false;
        }
        await streamingCall.WriteCompleteAsync();
        await printResponses;
    }
    return 0;
}
// [END speech_streaming_mic_recognize]
/// <summary>
/// Reports whether a path looks like a Google Cloud Storage URI, i.e. it
/// starts with "gs:/" in any letter case.
/// </summary>
/// <param name="s">The path or URI to inspect; may be null or shorter than the prefix.</param>
/// <returns>True when the prefix matches; false otherwise, including for null or short input.</returns>
// StartsWith never throws on short strings (Substring(0, 4) did), and the
// ordinal comparison avoids culture-sensitive lower-casing.
static bool IsStorageUri(string s) =>
    s != null && s.StartsWith("gs:/", StringComparison.OrdinalIgnoreCase);
/// <summary>
/// Command-line entry point: parses the verb and its arguments, runs the
/// matching sample, and returns that sample's result as the exit code.
/// </summary>
/// <param name="args">Raw command-line arguments.</param>
/// <returns>The int produced by the selected sample, or 1 when parsing fails.</returns>
public static int Main(string[] args)
{
    var parsed = Parser.Default.ParseArguments<
        SyncOptions, AsyncOptions,
        StreamingOptions, ListenOptions,
        RecOptions, SyncOptionsWithCreds,
        OptionsWithContext
        >(args);

    object outcome = parsed.MapResult(
        (SyncOptions opts) =>
        {
            // A storage URI wins outright; -w is only consulted for local files.
            if (IsStorageUri(opts.FilePath))
            {
                return SyncRecognizeGcs(opts.FilePath);
            }
            if (opts.EnableWordTimeOffsets)
            {
                return SyncRecognizeWords(opts.FilePath);
            }
            return SyncRecognize(opts.FilePath);
        },
        (AsyncOptions opts) =>
        {
            // Local files always take the plain long-running path; -w is
            // only consulted for storage URIs.
            if (!IsStorageUri(opts.FilePath))
            {
                return LongRunningRecognize(opts.FilePath);
            }
            if (opts.EnableWordTimeOffsets)
            {
                return AsyncRecognizeGcsWords(opts.FilePath);
            }
            return AsyncRecognizeGcs(opts.FilePath);
        },
        // The async samples are blocked on here because Main is synchronous.
        (StreamingOptions opts) => StreamingRecognizeAsync(opts.FilePath).Result,
        (ListenOptions opts) => StreamingMicRecognizeAsync(opts.Seconds).Result,
        (RecOptions opts) => Rec(opts.FilePath, opts.BitRate, opts.Encoding),
        (SyncOptionsWithCreds opts) => SyncRecognizeWithCredentials(
            opts.FilePath, opts.CredentialsFilePath),
        (OptionsWithContext opts) => RecognizeWithContext(opts.FilePath, ReadPhrases()),
        errs => 1);

    // Every handler yields a boxed int, so the unboxing cast is safe.
    return (int)outcome;
}
}
}