2020-05-14

[Google Cloud] 文字轉語音 Cloud Text-to-Speech


Cloud Text-to-Speech
透過機器學習技術將文字轉為語音

可以先在介紹網站中試用,確認是否符合需求。


Text to Speech 收費定價

使用方式介紹

1.在 Google Cloud Platform 建立專案

註冊 Google Cloud 帳號

建立專用專案


設定專案名稱


 2.在 API 服務面板 中啟用 Cloud Text-to-Speech API
 
進入API服務面板



進入API搜尋頁面

選擇 Cloud Text-to-Speech API

啟用 API


 3.在API 面板建立 API 金鑰


建立API Key

複製 API Key


 如果有安全性需求,可以進入 API Key 管理裡面調整可存取的 API 權限。

 4.透過 URL + API Key 取得語音資料(以 HTTP POST 送出下方的 JSON 作為請求內容)

 https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=API金鑰

API 使用參考

{
  "audioConfig": {
    "audioEncoding": "LINEAR16",
    "pitch": 0,
    "speakingRate": 1
  },
  "input": {
    "text": "你好"
  },
  "voice": {
    "languageCode": "cmn-CN",
    "name": "cmn-CN-Wavenet-C"
  }
}



5.解析語音資料
上一步的請求會回傳 JSON,格式如下
{  "audioContent": "UklGRqSOAABXQVZFZm10IBAAAAABA..." }
※ 當 audioEncoding 為 LINEAR16 時,audioContent 的內容為 WAV 檔案(含檔頭)的 Base64 編碼




6. Sample Code (以 Unity C# 為例)

 /// <summary>
 /// Coroutine that posts a synthesize request to Cloud Text-to-Speech for
 /// <paramref name="word"/>, decodes the returned Base64 WAV payload into an
 /// <c>AudioClip</c>, and stores it in <paramref name="cache"/>.
 /// </summary>
 /// <param name="word">Text to synthesize; also passed on as the clip name.</param>
 /// <param name="cache">Receives the clip under <paramref name="key"/>; the entry is null when the request or decoding fails.</param>
 /// <param name="key">Cache key for the result. This is not the API key.</param>
 /// <returns>An enumerator intended to be driven as a Unity coroutine.</returns>
 public IEnumerator GetWordAudio(string word, Dictionary<string, AudioClip> cache, string key)
 {
  AudioClip myAudioClip = null;

  // NOTE(review): api_key is not declared in this method; presumably a member of the enclosing class.
  var url = "https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=" + UnityWebRequest.EscapeURL(api_key);

  var args = new Dictionary<string, object>();
  args["audioConfig"] = new Dictionary<string, object>() {
   // https://cloud.google.com/text-to-speech/docs/reference/rest/v1beta1/text/synthesize?apix_params=%7B%22resource%22%3A%7B%7D%7D#audioencoding
   {"audioEncoding", "LINEAR16" },
   {"pitch", 0.0f },
   {"speakingRate", 1.0f },
  };
  args["input"] = new Dictionary<string, object>() {
   {"text", word }
  };
  args["voice"] = new Dictionary<string, object>() {
   {"languageCode", "cmn-CN" },
   {"name", "cmn-CN-Wavenet-C" }
  };

  var postData = MiniJSON.Json.Serialize(args);

  // The using block disposes the request on every path, including when decoding throws.
  using (var www = new UnityWebRequest(url, UnityWebRequest.kHttpVerbPOST))
  {
   www.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(postData));
   www.downloadHandler = new DownloadHandlerBuffer();
   www.SetRequestHeader("Content-Type", "application/json");

   // Yielding the operation suspends the coroutine until the request has completed,
   // so no additional polling loop is needed afterwards.
   yield return www.SendWebRequest();

   if (www.isNetworkError || www.isHttpError)
   {
    // Report the transport/HTTP failure directly instead of failing later on a missing JSON key.
    Debug.LogWarning(www.error);
   }
   else if (www.downloadedBytes > 0)
   {
    try
    {
     var rawData = www.downloadHandler.text;
     var result = MiniJSON.Json.Deserialize(rawData) as Dictionary<string, object>;
     object audioField = null;
     if (result != null)
     {
      result.TryGetValue("audioContent", out audioField);
     }

     var audioContent = audioField as string;
     if (string.IsNullOrEmpty(audioContent))
     {
      Debug.LogWarning("Text-to-Speech response did not contain audioContent.");
     }
     else
     {
      byte[] decodedBytes = Convert.FromBase64String(audioContent);
      myAudioClip = CreateAudioClipFromWaveBytes(decodedBytes, word);
     }
    }
    catch (System.Exception err)
    {
     // Best effort: a malformed response yields a null cache entry rather than a broken coroutine.
     myAudioClip = null;
     Debug.LogWarning(err);
    }
   }
  }

  cache[key] = myAudioClip;
 }

 /// <summary>
 /// Builds an <c>AudioClip</c> from an in-memory WAV file that uses the canonical
 /// 44-byte header followed by 16-bit little-endian PCM samples.
 /// </summary>
 /// <param name="bytes">Complete WAV file contents, header included.</param>
 /// <param name="name">Name given to the created clip.</param>
 /// <returns>The clip populated with the decoded samples.</returns>
 /// <exception cref="ArgumentException">The buffer is null, shorter than the header, declares no channels, or holds no complete sample frame.</exception>
 /// <exception cref="NotSupportedException">The header declares a sample size other than 16 bits.</exception>
 private static AudioClip CreateAudioClipFromWaveBytes(byte[] bytes, string name)
 {
  // http://soundfile.sapp.org/doc/WaveFormat/
  // ex. total 47806 bytes
  //[00] 82 73 70 70  52 49 46 46  => RIFF
  //[04] 182 186 0 0  b6 ba 00 00  => ChunkSize:47798 = Subchunk2Size + 36 = 47762 + 36
  //[08] 87 65 86 69  57 41 56 45  => WAVE
  //[12] 102 109 116 32  66 6d 74 20  => fmt 
  //[16] 16 0 0 0   10 00 00 00  => Subchunk1Size:16 (16 for PCM)
  //[20] 1 0 1 0   01 00 01 00  => AudioFormat:1 (PCM = 1, uncompressed) , => NumChannels:1
  //[24] 192 93 0 0   c0 5d 00 00  => SampleRate:24000
  //[28] 128 187 0 0  80 bb 00 00  => ByteRate:48000
  //[32] 2 0 16 0   02 00 10 00  => BlockAlign:2 , => BitsPerSample:16 bits
  //[36] 100 97 116 97  64 61 74 61  => data
  //[40] 146 186 0 0  92 ba 00 00  => Subchunk2Size:47762 
  //[44]  0 0 0 0 => sample1 , sample2 ...

  const int HeaderSize = 44;
  const int BytesPerSample = 2;

  if (bytes == null || bytes.Length < HeaderSize)
  {
   throw new ArgumentException("WAV data is shorter than the 44-byte header.", "bytes");
  }

  int numChannels = BitConverter.ToInt16(bytes, 22);
  int sampleRate = BitConverter.ToInt32(bytes, 24);
  int bitsPerSample = BitConverter.ToInt16(bytes, 34);
  int subchunk2Size = BitConverter.ToInt32(bytes, 40);

  if (bitsPerSample != 8 * BytesPerSample)
  {
   throw new NotSupportedException("Only 16-bit PCM WAV data is supported, got " + bitsPerSample + " bits per sample.");
  }
  if (numChannels <= 0)
  {
   throw new ArgumentException("WAV header declares no audio channels.", "bytes");
  }

  // Do not trust the declared payload size beyond what was actually received;
  // a negative or oversized value falls back to "everything after the header".
  int available = bytes.Length - HeaderSize;
  int dataSize = (subchunk2Size < 0 || subchunk2Size > available) ? available : subchunk2Size;

  // A frame holds one sample per channel. The clip length is counted in frames,
  // while the sample buffer is interleaved and therefore needs frames * channels entries.
  int frameCount = dataSize / (numChannels * BytesPerSample);
  if (frameCount <= 0)
  {
   throw new ArgumentException("WAV data contains no complete sample frame.", "bytes");
  }

  int sampleCount = frameCount * numChannels;
  float[] samples = new float[sampleCount];
  for (int i = 0; i < sampleCount; i++)
  {
   int pos = HeaderSize + i * BytesPerSample;
   samples[i] = bytesToFloat(bytes[pos], bytes[pos + 1]);
  }

  var myAudioClip = AudioClip.Create(name, frameCount, numChannels, sampleRate, false);
  myAudioClip.SetData(samples, 0);

  return myAudioClip;
 }

 // Decodes one signed 16-bit little-endian PCM sample (low byte first) and
 // scales it into the range [-1, 1).
 static float bytesToFloat(byte firstByte, byte secondByte)
 {
  const float FullScale = 32768.0F;

  // Low byte occupies bits 0-7, high byte bits 8-15.
  int packed = firstByte | (secondByte << 8);

  // Narrowing to short reinterprets bit 15 as the sign bit.
  short sample = (short)packed;

  return sample / FullScale;
 }

 // Assembles a 32-bit little-endian integer from the four bytes starting at
 // offset: bytes[offset] is the least significant byte.
 static int bytesToInt(byte[] bytes, int offset = 0)
 {
  return bytes[offset]
   | (bytes[offset + 1] << 8)
   | (bytes[offset + 2] << 16)
   | (bytes[offset + 3] << 24);
 }



參考資料
Cloud Text-to-Speech
Cloud Text-to-Speech API



沒有留言:

張貼留言