Cloud Text-to-Speech
透過機器學習技術將文字轉為語音
可以先在介紹網站中試用是否符合需求。
Text to Speech 收費定價 |
使用方式介紹
1.在 Google Cloud Platform 建立專案
註冊 Google Cloud 帳號 |
建立專用專案 |
設定專案名稱 |
2.在 API 服務面板 中啟用 Cloud Text-to-Speech API
進入API服務面板 |
進入API搜尋頁面 |
選擇 Cloud Text-to-Speech API |
啟用 API |
3.在API 面板建立 API 金鑰
建立API Key |
複製 API Key |
如果有安全性需求,可以進入 API Key 的管理頁面,限制此金鑰可存取的 API。
4.透過 URL + API Key,以 HTTP POST 送出 JSON 請求取得語音資料
https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=API金鑰
API 使用參考 |
{ "audioConfig": { "audioEncoding": "LINEAR16", "pitch": 0, "speakingRate": 1 }, "input": { "text": "你好" }, "voice": { "languageCode": "cmn-CN", "name": "cmn-CN-Wavenet-C" } }
5.解析語音資料
上一步的請求會回傳 JSON,格式如下
{ "audioContent": "UklGRqSOAABXQVZFZm10IBAAAAABA..." }
※ 當 audioEncoding 為 LINEAR16 時,audioContent 的內容為 WAV 檔案的 Base64 編碼。
6. Sample Code (以 Unity C# 為例)
/// <summary>
/// Unity coroutine that asks the Cloud Text-to-Speech REST endpoint to
/// synthesize <paramref name="word"/> and stores the resulting clip in
/// <paramref name="cache"/> under <paramref name="key"/>.
/// </summary>
/// <param name="word">Text to synthesize.</param>
/// <param name="cache">Dictionary that receives the result. On any failure
/// (transport error, HTTP error, empty or malformed response) <c>null</c> is
/// stored so the caller can tell the request has finished.</param>
/// <param name="key">Key under which the clip is stored in <paramref name="cache"/>.</param>
/// <returns>An enumerator to be driven by <c>StartCoroutine</c>.</returns>
public IEnumerator GetWordAudio(string word, Dictionary<string, AudioClip> cache, string key)
{
    AudioClip myAudioClip = null;

    // NOTE(review): `api_key` is not declared in this snippet; presumably it is
    // a field of the enclosing class holding the Google API key - confirm.
    var url = "https://texttospeech.googleapis.com/v1beta1/text:synthesize?key=" + UnityWebRequest.EscapeURL(api_key);

    var args = new Dictionary<string, object>();
    args["audioConfig"] = new Dictionary<string, object>()
    {
        // https://cloud.google.com/text-to-speech/docs/reference/rest/v1beta1/text/synthesize?apix_params=%7B%22resource%22%3A%7B%7D%7D#audioencoding
        { "audioEncoding", "LINEAR16" },
        { "pitch", 0.0f },
        { "speakingRate", 1.0f },
    };
    args["input"] = new Dictionary<string, object>()
    {
        { "text", word }
    };
    args["voice"] = new Dictionary<string, object>()
    {
        { "languageCode", "cmn-CN" },
        { "name", "cmn-CN-Wavenet-C" }
    };

    var postData = MiniJSON.Json.Serialize(args);
    byte[] bodyRaw = Encoding.UTF8.GetBytes(postData);

    // `using` guarantees the request is disposed on every path, including
    // when response parsing throws.
    using (var www = new UnityWebRequest(url, UnityWebRequest.kHttpVerbPOST))
    {
        www.uploadHandler = new UploadHandlerRaw(bodyRaw);
        www.downloadHandler = new DownloadHandlerBuffer();
        www.SetRequestHeader("Content-Type", "application/json");

        // Yielding the async operation resumes the coroutine once the request
        // has completed, so no extra polling loop is needed.
        yield return www.SendWebRequest();

        if (www.isNetworkError || www.isHttpError)
        {
            // Report the transport/HTTP failure directly instead of letting the
            // JSON parsing below fail on an error payload.
            Debug.LogWarning(www.error + ": " + www.downloadHandler.text);
        }
        else if (www.downloadedBytes > 0)
        {
            try
            {
                var rawData = www.downloadHandler.text;
                var result = MiniJSON.Json.Deserialize(rawData) as Dictionary<string, object>;

                object audioContent;
                if (result != null
                    && result.TryGetValue("audioContent", out audioContent)
                    && audioContent is string)
                {
                    byte[] decodedBytes = Convert.FromBase64String((string)audioContent);
                    myAudioClip = CreateAudioClipFromWaveBytes(decodedBytes, word);
                }
                else
                {
                    Debug.LogWarning("Text-to-Speech response has no audioContent: " + rawData);
                }
            }
            catch (System.Exception err)
            {
                myAudioClip = null;
                Debug.LogWarning(err);
            }
        }
    }

    cache[key] = myAudioClip;
}

/// <summary>
/// Builds an <c>AudioClip</c> from an in-memory 16-bit PCM WAV file that uses
/// the canonical 44-byte header (RIFF / fmt / data, no extra chunks).
/// </summary>
/// <param name="bytes">Complete WAV file contents.</param>
/// <param name="name">Name given to the created clip.</param>
/// <returns>The clip filled with the decoded samples.</returns>
/// <exception cref="ArgumentException">The buffer is null, shorter than the
/// header, or contains no complete sample frame.</exception>
/// <exception cref="NotSupportedException">The data is not 16-bit PCM with a
/// block alignment of <c>2 * channels</c>.</exception>
private static AudioClip CreateAudioClipFromWaveBytes(byte[] bytes, string name)
{
    // http://soundfile.sapp.org/doc/WaveFormat/
    // ex. total 47806 bytes
    // [00]  82  73  70  70   52 49 46 46 => RIFF
    // [04] 182 186   0   0   b6 ba 00 00 => ChunkSize:47798 = Subchunk2Size + 36 = 47762 + 36
    // [08]  87  65  86  69   57 41 56 45 => WAVE
    // [12] 102 109 116  32   66 6d 74 20 => fmt
    // [16]  16   0   0   0   10 00 00 00 => Subchunk1Size:16 (16 for PCM)
    // [20]   1   0   1   0   01 00 01 00 => AudioFormat:1 (PCM = 1, uncompressed), NumChannels:1
    // [24] 192  93   0   0   c0 5d 00 00 => SampleRate:24000
    // [28] 128 187   0   0   80 bb 00 00 => ByteRate:48000
    // [32]   2   0  16   0   02 00 10 00 => BlockAlign:2, BitsPerSample:16 bits
    // [36] 100  97 116  97   64 61 74 61 => data
    // [40] 146 186   0   0   92 ba 00 00 => Subchunk2Size:47762
    // [44]   0   0   0   0               => sample1, sample2 ...
    const int HeaderSize = 44;
    const int BytesPerSample = 2;

    if (bytes == null || bytes.Length < HeaderSize)
    {
        throw new ArgumentException("Buffer is too short to contain a WAV header.", "bytes");
    }

    int numChannels = BitConverter.ToInt16(bytes, 22);
    int sampleRate = BitConverter.ToInt32(bytes, 24);
    int blockAlign = BitConverter.ToInt16(bytes, 32);
    int bitsPerSample = BitConverter.ToInt16(bytes, 34);
    int subchunk2Size = BitConverter.ToInt32(bytes, 40);

    // The decoding loop below reads exactly two bytes per sample, so reject
    // anything that is not interleaved 16-bit PCM.
    if (bitsPerSample != 16 || numChannels < 1 || blockAlign != numChannels * BytesPerSample)
    {
        throw new NotSupportedException("Only 16-bit PCM WAV data is supported.");
    }

    // Never trust the declared size beyond what the buffer actually holds.
    int availableBytes = bytes.Length - HeaderSize;
    int dataSize = (subchunk2Size < 0 || subchunk2Size > availableBytes) ? availableBytes : subchunk2Size;

    // One frame holds one sample per channel. The clip length is expressed in
    // frames, while the float buffer must hold frames * channels values.
    int frameCount = dataSize / blockAlign;
    if (frameCount == 0)
    {
        throw new ArgumentException("WAV data contains no audio samples.", "bytes");
    }

    int sampleCount = frameCount * numChannels;
    float[] samples = new float[sampleCount];
    for (int i = 0; i < sampleCount; i++)
    {
        int pos = HeaderSize + i * BytesPerSample;
        samples[i] = bytesToFloat(bytes[pos], bytes[pos + 1]);
    }

    var myAudioClip = AudioClip.Create(name, frameCount, numChannels, sampleRate, false);
    myAudioClip.SetData(samples, 0);
    return myAudioClip;
}

/// <summary>
/// Converts one little-endian signed 16-bit sample into a float in the
/// range -1 (inclusive) to 1 (exclusive).
/// </summary>
/// <param name="firstByte">Low-order byte of the sample.</param>
/// <param name="secondByte">High-order byte of the sample.</param>
static float bytesToFloat(byte firstByte, byte secondByte)
{
    // Combine the two bytes into one short (little endian).
    short s = (short)((secondByte << 8) | firstByte);
    // Scale to the range from -1 to (just below) 1.
    return s / 32768.0F;
}

/// <summary>
/// Reads four bytes starting at <paramref name="offset"/> as a little-endian
/// 32-bit integer.
/// </summary>
/// <param name="bytes">Source buffer.</param>
/// <param name="offset">Index of the least significant byte.</param>
static int bytesToInt(byte[] bytes, int offset = 0)
{
    int value = 0;
    for (int i = 0; i < 4; i++)
    {
        value |= ((int)bytes[offset + i]) << (i * 8);
    }
    return value;
}
參考資料
Cloud Text-to-Speech
Cloud Text-to-Speech API
沒有留言:
張貼留言