2020-05-14

[Google Cloud] 文字轉語音 Cloud Text-to-Speech


Cloud Text-to-Speech
透過機器學習技術將文字轉為語音

可以先在介紹網站中試用是否符合需求。


Text to Speech 收費定價

使用方式介紹

1.在 Google Cloud Platform 建立專案

註冊 Google Cloud 帳號

建立專用專案


設定專案名稱


 2.在 API 服務面板 中啟用 Cloud Text-to-Speech API
 
進入API服務面板



進入API搜尋頁面

選擇 Cloud Text-to-Speech API

啟用 API


 3.在API 面板建立 API 金鑰


建立API Key

複製 API Key


 如果有安全性需求,可以進入 API Key 管理裡面調整可存取的 API 權限。

 4.透過Url + API Key取得語音資料

 https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=API金鑰

API 使用參考

請以 HTTP POST 呼叫上述網址,並將下列 JSON 作為請求內容 (Content-Type: application/json):

  {
    "audioConfig": {
      "audioEncoding": "LINEAR16",
      "pitch": 0,
      "speakingRate": 1
    },
    "input": {
      "text": "你好"
    },
    "voice": {
      "languageCode": "cmn-CN",
      "name": "cmn-CN-Wavenet-C"
    }
  }



5.解析語音資料
上一步的請求會回傳 JSON,格式如下
{  "audioContent": "UklGRqSOAABXQVZFZm10IBAAAAABA..." }
※ audioContent 內容為Wav檔案的Base64編碼 (當audioEncoding為LINEAR16)




6. Sample Code (以 Unity C# 為例)

  /// <summary>
  /// Coroutine that posts <paramref name="word"/> to the Cloud Text-to-Speech
  /// <c>text:synthesize</c> endpoint, decodes the returned Base64 WAV payload into an
  /// <see cref="AudioClip"/>, and stores it in <paramref name="cache"/>.
  /// </summary>
  /// <param name="word">Text to synthesize; also used as the created clip's name.</param>
  /// <param name="cache">Receives the clip under <paramref name="key"/>; the entry is set to
  /// <c>null</c> when the request or decoding fails.</param>
  /// <param name="key">Cache key under which the result is stored.</param>
  /// <returns>An enumerator to be driven by <c>StartCoroutine</c>.</returns>
  public IEnumerator GetWordAudio(string word, Dictionary<string, AudioClip> cache, string key)
  {
      AudioClip myAudioClip = null;

      // NOTE(review): api_key is not declared in this snippet - presumably a field of the
      // enclosing class holding the Google Cloud API key. It is distinct from the `key`
      // parameter, which is only the cache key.
      var url = "https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=" + UnityWebRequest.EscapeURL(api_key);

      var args = new Dictionary<string, object>();
      args["audioConfig"] = new Dictionary<string, object>() {
          // https://cloud.google.com/text-to-speech/docs/reference/rest/v1beta1/text/synthesize?apix_params=%7B%22resource%22%3A%7B%7D%7D#audioencoding
          {"audioEncoding", "LINEAR16" },
          {"pitch", 0.0f },
          {"speakingRate", 1.0f },
      };
      args["input"] = new Dictionary<string, object>() {
          {"text", word }
      };
      args["voice"] = new Dictionary<string, object>() {
          {"languageCode", "cmn-CN" },
          {"name", "cmn-CN-Wavenet-C" }
      };
      var postData = MiniJSON.Json.Serialize(args);
      byte[] bodyRaw = Encoding.UTF8.GetBytes(postData);

      // The using statement guarantees the request (and its handlers) are released on every
      // path, including when decoding below throws.
      using (var www = new UnityWebRequest(url, UnityWebRequest.kHttpVerbPOST))
      {
          www.uploadHandler = new UploadHandlerRaw(bodyRaw);
          www.downloadHandler = new DownloadHandlerBuffer();
          www.SetRequestHeader("Content-Type", "application/json");

          // Yielding the operation resumes the coroutine only once the request has finished,
          // so no additional polling loop is needed afterwards.
          yield return www.SendWebRequest();

          if (www.isNetworkError || www.isHttpError)
          {
              // Report the transport/HTTP failure directly instead of trying to parse an
              // error body as a synthesis result.
              Debug.LogWarning(www.error);
          }
          else if (www.downloadedBytes > 0)
          {
              try
              {
                  var result = MiniJSON.Json.Deserialize(www.downloadHandler.text) as Dictionary<string, object>;
                  object audioContent;
                  if (result != null && result.TryGetValue("audioContent", out audioContent) && audioContent is string)
                  {
                      // audioContent is the Base64-encoded WAV file (for LINEAR16 encoding).
                      byte[] decodedBytes = Convert.FromBase64String((string)audioContent);
                      myAudioClip = CreateAudioClipFromWaveBytes(decodedBytes, word);
                  }
                  else
                  {
                      Debug.LogWarning("Text-to-Speech response did not contain audioContent.");
                  }
              }
              catch (System.Exception err)
              {
                  // Malformed Base64 or WAV data: cache null rather than abort the coroutine.
                  myAudioClip = null;
                  Debug.LogWarning(err);
              }
          }
      }

      cache[key] = myAudioClip;
  }
  /// <summary>
  /// Builds an <see cref="AudioClip"/> from an in-memory 16-bit PCM WAV file that uses the
  /// canonical 44-byte RIFF header.
  /// </summary>
  /// <param name="bytes">Complete WAV file contents (header followed by sample data).</param>
  /// <param name="name">Name given to the created clip.</param>
  /// <returns>The clip filled with the decoded samples.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
  /// <exception cref="ArgumentException">The buffer is shorter than the 44-byte header.</exception>
  /// <exception cref="NotSupportedException">The header does not describe 16-bit audio with at
  /// least one channel.</exception>
  private static AudioClip CreateAudioClipFromWaveBytes(byte[] bytes, string name)
  {
      // http://soundfile.sapp.org/doc/WaveFormat/
      // ex. total 47806 bytes
      //[00] 82 73 70 70 52 49 46 46 => RIFF
      //[04] 182 186 0 0 b6 ba 00 00 => ChunkSize:47798 = Subchunk2Size + 36 = 47762 + 36
      //[08] 87 65 86 69 57 41 56 45 => WAVE
      //[12] 102 109 116 32 66 6d 74 20 => fmt
      //[16] 16 0 0 0 10 00 00 00 => Subchunk1Size:16 (16 for PCM)
      //[20] 1 0 1 0 01 00 01 00 => AudioFormat:1 (PCM = 1, uncompressed) , => NumChannels:1
      //[24] 192 93 0 0 c0 5d 00 00 => SampleRate:24000
      //[28] 128 187 0 0 80 bb 00 00 => ByteRate:48000
      //[32] 2 0 16 0 02 00 10 00 => BlockAlign:2 , => BitsPerSample:16 bits
      //[36] 100 97 116 97 64 61 74 61 => data
      //[40] 146 186 0 0 92 ba 00 00 => Subchunk2Size:47762
      //[44] 0 0 0 0 => sample1 , sample2 ...
      const int headerSize = 44;
      const int bytesPerSample = 2;

      if (bytes == null)
      {
          throw new ArgumentNullException("bytes");
      }
      if (bytes.Length < headerSize)
      {
          throw new ArgumentException("WAV data is shorter than the 44-byte header.", "bytes");
      }

      // NOTE(review): BitConverter reads in platform byte order; WAV fields are little-endian,
      // so this assumes a little-endian platform - confirm for the target devices.
      var numChannels = BitConverter.ToInt16(bytes, 22);
      var sampleRate = BitConverter.ToInt32(bytes, 24);
      var bitsPerSample = BitConverter.ToInt16(bytes, 34);
      var subchunk2Size = BitConverter.ToInt32(bytes, 40);

      if (numChannels < 1 || bitsPerSample != 8 * bytesPerSample)
      {
          throw new NotSupportedException("Only 16-bit PCM WAV data with at least one channel is supported.");
      }

      // Never trust the declared data size beyond what was actually received; a negative
      // value (size field with the top bit set) is treated as "use everything available".
      int availableBytes = bytes.Length - headerSize;
      int dataBytes = (subchunk2Size < 0 || subchunk2Size > availableBytes) ? availableBytes : subchunk2Size;

      // A frame holds one sample per channel. AudioClip.Create takes the length in frames,
      // while SetData takes the interleaved samples of all channels.
      int frameCount = dataBytes / (bytesPerSample * numChannels);
      int sampleCount = frameCount * numChannels;

      float[] samples = new float[sampleCount];
      for (int i = 0; i < sampleCount; i++)
      {
          int pos = headerSize + i * bytesPerSample;
          samples[i] = bytesToFloat(bytes[pos], bytes[pos + 1]);
      }

      var myAudioClip = AudioClip.Create(name, frameCount, numChannels, sampleRate, false);
      myAudioClip.SetData(samples, 0);
      return myAudioClip;
  }
  // Maps one little-endian signed 16-bit PCM sample (low byte first) to a float in [-1, 1).
  static float bytesToFloat(byte firstByte, byte secondByte)
  {
      // Assemble the unsigned 16-bit pattern; secondByte is the high-order byte.
      int raw = firstByte + secondByte * 256;
      // Reinterpret the pattern as a two's-complement signed 16-bit value.
      int sample = raw >= 0x8000 ? raw - 0x10000 : raw;
      // Dividing by 2^15 maps -32768 to -1 and 32767 to just below 1.
      return sample / 32768.0F;
  }
  // Reads four bytes starting at `offset` as a little-endian 32-bit signed integer.
  static int bytesToInt(byte[] bytes, int offset = 0)
  {
      // Fetch the bytes in ascending order, lowest-order byte first.
      int b0 = bytes[offset];
      int b1 = bytes[offset + 1];
      int b2 = bytes[offset + 2];
      int b3 = bytes[offset + 3];
      // Each successive byte contributes the next-higher 8 bits of the result.
      return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
  }



參考資料
Cloud Text-to-Speech
Cloud Text-to-Speech API



沒有留言:

張貼留言