折腾:
已经完成了把语音合成集成到Flask后端中。
接着去考虑在前端页面中支持传递调用参数给后端,再传递给微软Azure的TTS语音合成接口。
方便前端测试不同人声的语音合成的效果。
去前端页面中加配置
之前用的bootstrap,找找里面的列表选择控件:
【已解决】Bootstrap中实现列表选择默认值和获取当前选中的值
再去加上其他的配置,比如🔊音量大小和语音速度
<code>
<!-- Voice speed input row -->
<div class="row">
  <div class="col-lg-4 col-md-4 col-sm-6 col-xs-12 ">
    <div class="input-group">
      <span class="input-group-addon">Voice Speed</span>
      <input type="text" class="form-control" placeholder="eg: -40.00%, +20.00%" id="voiceRate" value="-30.00%">
    </div>
  </div>
</div>
<!-- Voice volume input row -->
<div class="row">
  <div class="col-lg-4 col-md-4 col-sm-6 col-xs-12 ">
    <div class="input-group">
      <span class="input-group-addon">Voice Volume</span>
      <input type="text" class="form-control" placeholder="eg: +25.00%, -30.00%" id="voiceVolume" value="+40.00%">
    </div>
  </div>
</div>
</code>
界面效果:

然后再去写更新后台代码,加上ms的azure的tts的参数设置
<code># def doAudioSynthesis(unicodeText):
def doAudioSynthesis(unicodeText,
                     voiceName=MS_TTS_VOICE_NAME,
                     voiceRate=MS_TTS_VOICE_RATE,
                     voiceVolume=MS_TTS_VOICE_VOLUME):
    """
        do audio synthesis from unicode text
        if failed for token invalid/expired, will refresh token to do one more retry
    """
    # global app, log, gCurBaiduRespDict
    global app, log
    isOk = False
    audioBinData = None
    errMsg = ""
    # # for debug
    # gCurBaiduRespDict["access_token"] = "99.569b3b5b470938a522ce60d2e2ea2506.2592000.1528015602.282335-11192483"
    log.info("doAudioSynthesis: unicodeText=%s", unicodeText)
    # isOk, audioBinData, errNo, errMsg = baiduText2Audio(unicodeText)
    isOk, audioBinData, errNo, errMsg = msTTS(unicodeText, voiceName, voiceRate, voiceVolume)
    log.info("isOk=%s, errNo=%d, errMsg=%s", isOk, errNo, errMsg)
  
def msTTS(unicodeText,
          voiceName=MS_TTS_VOICE_NAME,
          voiceRate=MS_TTS_VOICE_RATE,
          voiceVolume=MS_TTS_VOICE_VOLUME):
    """Call MS Azure TTS REST API to generate audio (mp3/wav/...) from text.

    Builds an SSML request body and POSTs it to the configured Azure TTS
    endpoint; on HTTP 200 the response body is the audio binary data.

    Args:
        unicodeText (str): text to synthesize
        voiceName (str): Azure TTS voice/speaker name
        voiceRate (str): prosody rate, e.g. "-30.00%"
        voiceVolume (str): prosody volume, e.g. "+40.00%"

    Returns:
        tuple: (isOk, audioBinData, errNo, errMsg)
            isOk (bool): True on HTTP 200
            audioBinData (bytes or None): audio bytes on success
            errNo (int): HTTP status code on failure, 0 otherwise
            errMsg (str): HTTP reason on failure, "" on success
    """
    # stdlib helper to XML-escape user text before embedding it in SSML
    from xml.sax.saxutils import escape

    global app, log, gMsToken
    log.info("msTTS: unicodeText=%s", unicodeText)
    isOk = False
    audioBinData = None
    errNo = 0
    errMsg = "Unknown error"
    msTtsUrl = app.config["MS_TTS_URL"]
    log.info("msTtsUrl=%s", msTtsUrl)
    reqHeaders = {
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": MS_TTS_OUTPUT_FORMAT,
        "Ocp-Apim-Subscription-Key": app.config["MS_TTS_SECRET_KEY"],
        # BUGFIX: auth scheme must be "Bearer" (was "Bear"), per Azure
        # Speech Service REST API requirements
        "Authorization": "Bearer " + gMsToken
    }
    log.info("reqHeaders=%s", reqHeaders)
    # BUGFIX: escape the text so characters like '&' and '<' do not
    # produce invalid SSML/XML and fail the request
    ssmlDataStr = """
        <speak version='1.0' xmlns="http://www.w3.org/2001/10/synthesis" xml:lang='en-US'>
            <voice name='%s'>
                <prosody rate='%s' volume='%s'>
                    %s
                </prosody>
            </voice>
        </speak>
    """ % (voiceName, voiceRate, voiceVolume, escape(unicodeText))
    log.info("ssmlDataStr=%s", ssmlDataStr)
    resp = requests.post(msTtsUrl, headers=reqHeaders, data=ssmlDataStr)
    log.info("resp=%s", resp)
    statusCode = resp.status_code
    log.info("statusCode=%s", statusCode)
    if statusCode == 200:
        # response body is the raw audio binary (format per
        # X-Microsoft-OutputFormat header)
        audioBinData = resp.content
        log.info("resp content is audio binary data, length=%d", len(audioBinData))
        isOk = True
        errMsg = ""
    else:
        isOk = False
        errNo = resp.status_code
        errMsg = resp.reason
        log.error("resp errNo=%d, errMsg=%s", errNo, errMsg)
        # Observed failures:
        #   errNo=400, errMsg=Voice zhang san not supported
        #   errNo=401, errMsg=Unauthorized
        #   errNo=413, errMsg=Content length exceeded the allowed limit of 1024 characters.
    return isOk, audioBinData, errNo, errMsg
class RobotQaAPI(Resource):
    """Flask-RESTful resource: answer a QA query and attach synthesized audio."""

    def processResponse(self,
                        respDict,
                        voiceName=MS_TTS_VOICE_NAME,
                        voiceRate=MS_TTS_VOICE_RATE,
                        voiceVolume=MS_TTS_VOICE_VOLUME):
        """Process the response dict before returning it to the client.

        Synthesizes audio for the response text, saves it to a temp file
        (scheduled for delayed deletion via celery), and adds its URL to
        respDict["data"]["response"]["audioUrl"].

        Args:
            respDict (dict): response payload; reads data.response.text
            voiceName (str): Azure TTS voice/speaker name
            voiceRate (str): speech rate, e.g. "-30.00%"
            voiceVolume (str): speech volume, e.g. "+40.00%"

        Returns:
            flask.Response: jsonify-ed respDict
        """
        global log, gTempAudioFolder
        unicodeText = respDict["data"]["response"]["text"]
        # BUGFIX: the format argument was missing, so the literal
        # "unicodeText=%s" was logged instead of the text
        log.info("unicodeText=%s", unicodeText)
        if not unicodeText:
            log.info("No response text to do audio synthesis")
            return jsonify(respDict)
        isOk, audioBinData, errMsg = doAudioSynthesis(unicodeText, voiceName, voiceRate, voiceVolume)
        if isOk:
            # 1. save audio binary data into tmp file
            tempFilename = saveAudioDataToTmpFile(audioBinData)
            # 2. use celery to delay delete tmp file
            delayTimeToDelete = app.config["CELERY_DELETE_TMP_AUDIO_FILE_DELAY"]
            deleteTmpAudioFile.apply_async([tempFilename], countdown=delayTimeToDelete)
            log.info("Delay %s seconds to delete %s", delayTimeToDelete, tempFilename)
            # 3. generate temp audio file url
            tmpAudioUrl = "http://%s:%d/tmp/audio/%s" % (
                app.config["FILE_URL_HOST"],
                app.config["FLASK_PORT"],
                tempFilename)
            log.info("tmpAudioUrl=%s", tmpAudioUrl)
            respDict["data"]["response"]["audioUrl"] = tmpAudioUrl
        else:
            log.warning("Fail to get synthesis audio for errMsg=%s", errMsg)
        log.info("respDict=%s", respDict)
        return jsonify(respDict)

    def get(self):
        """Handle GET: parse input + TTS voice params, run QA, attach audio.

        Query params: input (question text), voiceName, voiceRate,
        voiceVolume (forwarded to Azure TTS via processResponse).
        """
        respDict = {
            "code": 200,
            "message": "generate response ok",
            "data": {
                "input": "",
                "response": {
                    "text": "",
                    "audioUrl": ""
                },
                "control": "",
                "audio": {}
            }
        }
        parser = reqparse.RequestParser()
        # example input: i want to hear the story of Baby Sister Says No
        parser.add_argument('input', type=str, help="input words")
        parser.add_argument('voiceName', type=str, default=MS_TTS_VOICE_NAME, help="voice name/speaker")
        parser.add_argument('voiceRate', type=str, default=MS_TTS_VOICE_RATE, help="voice rate/speed")
        parser.add_argument('voiceVolume', type=str, default=MS_TTS_VOICE_VOLUME, help="voice volume")
        log.info("parser=%s", parser)
        parsedArgs = parser.parse_args()
        log.info("parsedArgs=%s", parsedArgs)
        if not parsedArgs:
            respDict["data"]["response"]["text"] = "Can not recognize input"
            return self.processResponse(respDict)
        inputStr = parsedArgs["input"]
        voiceName = parsedArgs["voiceName"]
        voiceRate = parsedArgs["voiceRate"]
        voiceVolume = parsedArgs["voiceVolume"]
        log.info("inputStr=%s, voiceName=%s, voiceRate=%s, voiceVolume=%s",
                 inputStr, voiceName, voiceRate, voiceVolume)
        if not inputStr:
            respDict["data"]["response"]["text"] = "Can not recognize parameter input"
            return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
        respDict["data"]["input"] = inputStr
        aiResult = QueryAnalyse(inputStr, aiContext)
        log.info("aiResult=%s", aiResult)
        if aiResult["response"]:
            respDict["data"]["response"]["text"] = aiResult["response"]
        if aiResult["control"]:
            respDict["data"]["control"] = aiResult["control"]
        log.info('respDict["data"]=%s', respDict["data"])
        audioFileIdStr = aiResult["mediaId"]
        log.info("audioFileIdStr=%s", audioFileIdStr)
        if audioFileIdStr:
            audioFileObjectId = ObjectId(audioFileIdStr)
            log.info("audioFileObjectId=%s", audioFileObjectId)
            if fsCollection.exists(audioFileObjectId):
                # found the referenced media file in GridFS -> expose its
                # metadata and a download URL
                audioFileObj = fsCollection.get(audioFileObjectId)
                log.info("audioFileObj=%s", audioFileObj)
                encodedFilename = quote(audioFileObj.filename)
                log.info("encodedFilename=%s", encodedFilename)
                respDict["data"]["audio"] = {
                    "contentType": audioFileObj.contentType,
                    "name": audioFileObj.filename,
                    "size": audioFileObj.length,
                    "url": "http://%s:%d/files/%s/%s" %
                           (app.config["FILE_URL_HOST"],
                            app.config["FLASK_PORT"],
                            audioFileObj._id,
                            encodedFilename)
                }
                log.info("respDict=%s", respDict)
                return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
            else:
                log.info("Can not find file from id %s", audioFileIdStr)
                respDict["data"]["audio"] = {}
                return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
        else:
            log.info("Not response file id")
            respDict["data"]["audio"] = {}
            return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
</code>效果:
是可以实现,设置不同参数,输出对应合成的语音的:

然后再去部署代码到服务器上即可。