通过Python SDK调用通义千问实时语音合成-大模型服务平台百炼-阿里云

本文介绍 DashScope Python SDK 调用实时语音合成-通义千问时的关键接口与请求参数。

用户指南：关于模型介绍和选型建议请参见实时语音合成-通义千问。

在线体验：暂不支持。

前期准备

DashScope Python SDK 版本需要不低于1.24.10。

快速开始

server commit模式

import os
import base64
import threading
import time
import dashscope
from dashscope.audio.qwen_tts_realtime import *

# Module-level handle to the realtime TTS client. It is assigned in the
# __main__ block and read by MyCallback.on_event to look up the response id.
qwen_tts_realtime: QwenTtsRealtime = None
# Text fragments that are sent to the service one by one with append_text().
text_to_synthesize = [
    '对吧~我就特别喜欢这种超市，',
    '尤其是过年的时候',
    '去逛超市',
    '就会觉得',
    '超级超级开心！',
    '想买好多好多的东西呢！'
]

# NOTE(review): not referenced anywhere in the visible sample code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """
        Configure the API key used by the DashScope SDK.

        The key is taken from the DASHSCOPE_API_KEY environment variable. When
        that variable is not set, a placeholder string is assigned instead and
        has to be replaced by hand. More information:
        https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # How to get an API key: https://help.aliyun.com/zh/model-studio/get-api-key
    placeholder_key = 'your-dashscope-api-key'  # edit this to set the API key manually
    dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', placeholder_key)



class MyCallback(QwenTtsRealtimeCallback):
    """Write the audio streamed back for one session to ``result_24k.pcm``.

    ``complete_event`` is set when the ``session.finished`` event arrives and
    also when the connection is closed, so ``wait_for_finished()`` does not
    hang if the connection drops before the session finishes.
    """

    def __init__(self):
        super().__init__()
        self.complete_event = threading.Event()
        self.file = open('result_24k.pcm', 'wb')

    def on_open(self) -> None:
        """Report that the connection to the server has been opened."""
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        """Close the output file, report the close reason and release waiters."""
        self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))
        # Without this, a connection that closes before 'session.finished'
        # would leave wait_for_finished() blocked forever.
        self.complete_event.set()

    def on_event(self, response: dict) -> None:
        """Handle one server event.

        Args:
            response: The server event; it is indexed by string keys below
                ('type', 'session', 'delta'), i.e. it is used as a mapping.
        """
        try:
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode it before writing.
                recv_audio_b64 = response['delta']
                self.file.write(base64.b64decode(recv_audio_b64))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            # Best effort: report the problem and keep the callback alive.
            print('[Error] {}'.format(e))
            return

    def wait_for_finished(self):
        """Block until the session has finished or the connection has closed."""
        self.complete_event.wait()


if __name__ == '__main__':
    # server_commit mode: text is appended fragment by fragment and the
    # session is finished once everything has been sent.
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    qwen_tts_realtime = QwenTtsRealtime(
        model='qwen-tts-realtime',
        callback=callback,
        # Beijing-region URL. For models in the Singapore region use:
        # wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime',
    )

    qwen_tts_realtime.connect()
    qwen_tts_realtime.update_session(
        voice='Cherry',
        language_type='Chinese',
        response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
        mode='server_commit',
    )
    for text_chunk in text_to_synthesize:
        print(f'send text: {text_chunk}')
        qwen_tts_realtime.append_text(text_chunk)
        # NOTE(review): presumably simulates text arriving incrementally.
        time.sleep(0.1)
    qwen_tts_realtime.finish()
    # Wait for the callback to signal completion before reading the metrics.
    callback.wait_for_finished()
    print(
        f'[Metric] session: {qwen_tts_realtime.get_session_id()}, '
        f'first audio delay: {qwen_tts_realtime.get_first_audio_delay()}'
    )

commit模式

import base64
import os
import threading
import dashscope
from dashscope.audio.qwen_tts_realtime import *

# Module-level handle to the realtime TTS client. It is assigned in the
# __main__ block and read by MyCallback.on_event to look up the response id.
qwen_tts_realtime: QwenTtsRealtime = None
# Sentences that are appended and committed one at a time in the __main__ block.
text_to_synthesize = [
    '这是第一句话。',
    '这是第二句话。',
    '这是第三句话。',
]

# NOTE(review): not referenced anywhere in the visible sample code.
DO_VIDEO_TEST = False

def init_dashscope_api_key():
    """
        Configure the API key used by the DashScope SDK.

        The key is taken from the DASHSCOPE_API_KEY environment variable. When
        that variable is not set, a placeholder string is assigned instead and
        has to be replaced by hand. More information:
        https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # API keys differ between the Singapore and Beijing regions.
    # How to get an API key: https://help.aliyun.com/zh/model-studio/get-api-key
    placeholder_key = 'your-dashscope-api-key'  # edit this to set the API key manually
    dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', placeholder_key)



class MyCallback(QwenTtsRealtimeCallback):
    """Write the audio of each response to its own ``result_<n>_24k.pcm`` file.

    ``complete_event`` is set when a response is done, when the session has
    finished, and when the connection is closed. Call ``reset_event()``
    between responses to start a new output file and a fresh event.
    """

    def __init__(self):
        super().__init__()
        self.response_counter = 0
        self.complete_event = threading.Event()
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')

    def reset_event(self):
        """Prepare for the next response: new output file and a fresh event."""
        # Normally the previous file was already closed on 'response.done';
        # close it here as well so it is not leaked if the wait ended for
        # another reason (e.g. the connection closed).
        if not self.file.closed:
            self.file.close()
        self.response_counter += 1
        self.file = open(f'result_{self.response_counter}_24k.pcm', 'wb')
        self.complete_event = threading.Event()

    def on_open(self) -> None:
        """Report that the connection to the server has been opened."""
        print('connection opened, init player')

    def on_close(self, close_status_code, close_msg) -> None:
        """Close the output file, report the close reason and release waiters."""
        self.file.close()
        print('connection closed with code: {}, msg: {}, destroy player'.format(close_status_code, close_msg))
        # Without this, a connection that closes before 'response.done'
        # would leave wait_for_response_done() blocked forever.
        self.complete_event.set()

    def on_event(self, response: dict) -> None:
        """Handle one server event.

        Args:
            response: The server event; it is indexed by string keys below
                ('type', 'session', 'delta'), i.e. it is used as a mapping.
        """
        try:
            event_type = response['type']
            if event_type == 'session.created':
                print('start session: {}'.format(response['session']['id']))
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded; decode it before writing.
                recv_audio_b64 = response['delta']
                self.file.write(base64.b64decode(recv_audio_b64))
            elif event_type == 'response.done':
                print(f'response {qwen_tts_realtime.get_last_response_id()} done')
                # Close the file BEFORE signalling. Once the event is set the
                # waiting thread may call reset_event() and replace self.file;
                # closing afterwards could close the new file instead.
                self.file.close()
                self.complete_event.set()
            elif event_type == 'session.finished':
                print('session finished')
                self.complete_event.set()
        except Exception as e:
            # Best effort: report the problem and keep the callback alive.
            print('[Error] {}'.format(e))
            return

    def wait_for_response_done(self):
        """Block until the current response is done or the connection has closed."""
        self.complete_event.wait()


if __name__ == '__main__':
    # commit mode: every sentence is appended, committed explicitly, and its
    # response is awaited before the next sentence is sent.
    init_dashscope_api_key()

    print('Initializing ...')

    callback = MyCallback()

    qwen_tts_realtime = QwenTtsRealtime(
        model='qwen-tts-realtime',
        callback=callback,
        # Beijing-region URL. For models in the Singapore region use:
        # wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime
        url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime',
    )

    qwen_tts_realtime.connect()
    qwen_tts_realtime.update_session(
        voice='Cherry',
        language_type='Chinese',
        response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
        mode='commit',
    )

    last_index = len(text_to_synthesize) - 1
    for index, text in enumerate(text_to_synthesize):
        print(f'send text: {text}')
        qwen_tts_realtime.append_text(text)
        qwen_tts_realtime.commit()
        callback.wait_for_response_done()
        # Reset only between sentences: reset_event() opens the output file
        # for the next response, which is not wanted after the last one.
        if index < last_index:
            callback.reset_event()

    qwen_tts_realtime.finish()
    print(
        f'[Metric] session: {qwen_tts_realtime.get_session_id()}, '
        f'first audio delay: {qwen_tts_realtime.get_first_audio_delay()}'
    )

访问github下载更多示例代码。

请求参数

下述请求参数可以通过QwenTtsRealtime的构造方法（`__init__`）进行设置。

参数

类型

说明

model

str

qwen-tts-realtime系列模型名称。参见模型列表。

url

str

中国大陆（北京）：wss://dashscope.aliyuncs.com/api-ws/v1/realtime

国际（新加坡）：wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime

下述请求参数可以通过update_session接口配置。

参数	类型	说明
voice	str	语音合成所使用的音色。参见支持的音色。
language_type	str	指定合成音频的语种，默认为 `Auto`。 `Auto`：适用无法确定文本的语种或文本包含多种语言的场景，模型会自动为文本中的不同语言片段匹配各自的发音，但无法保证发音完全精准。指定语种：适用于文本为单一语种的场景，此时指定为具体语种，能显著提升合成质量，效果通常优于 `Auto`。可选值包括： `Chinese` `English` `German` `Italian` `Portuguese` `Spanish` `Japanese` `Korean` `French` `Russian`
mode	str	使用的交互模式，可选"server_commit"和"commit"，默认 server_commit 模式。 "server_commit"模式：文本缓冲区不会缓存数据，直接发送给服务器决定断句与音频合成。 "commit"模式：客户端主动触发语音合成。文本缓冲区最多放置xxx token。
format	str	模型输出音频的格式，支持的格式： "pcm"（默认） "wav" "mp3" "opus" Qwen-TTS Realtime（参见模型）仅支持pcm。
sample_rate	int	模型输出音频的采样率（Hz），支持的采样率： 16000 24000（默认） 48000 Qwen-TTS Realtime（参见模型）仅支持24000。
speech_rate	float	音频的语速。1.0为正常语速，小于1.0为慢速，大于1.0为快速。默认值：1.0。 Qwen-TTS Realtime（参见模型）不支持该参数。
volume	int	音频的音量。默认值：50。取值范围：[0, 100]。 Qwen-TTS Realtime（参见模型）不支持该参数。
pitch_rate	float	合成音频的语调。默认值：1.0。取值范围：[0.5, 2.0]。 Qwen-TTS Realtime（参见模型）不支持该参数。
bit_rate	int	指定音频的码率（kbps）。码率越大，音质越好，音频文件体积越大。仅在音频格式（`response_format`）为`opus`时可用。默认值：128。取值范围：[6, 510]。 Qwen-TTS Realtime（参见模型）不支持该参数。

关键接口

QwenTtsRealtime类

QwenTtsRealtime通过from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime方法引入。

方法签名	服务端响应事件（通过回调下发）	说明
`def connect(self) -> None`	session.created 会话已创建 session.updated 会话配置已更新	和服务端创建连接。
`def update_session(self, voice: str, response_format: AudioFormat = AudioFormat.PCM_24000HZ_MONO_16BIT, mode: str = 'server_commit', language_type: str = "Chinese", **kwargs) -> None`	session.updated 会话配置已更新	更新本次会话交互的默认配置。参数配置请参考《请求参数》章节。在您建立连接后，服务端会及时返回用于此会话的默认输出输入配置。如果您需要更新默认会话配置，我们也推荐您总是在建立连接后即刻调用此接口。服务端在收到session.update事件后，会进行参数校验，如果参数不合法则返回错误，否则更新服务端侧的会话配置。
`def append_text(self, text: str) -> None`	无	将文本片段追加到云端输入文本缓冲区。缓冲区是你可以写入并稍后提交的临时存储。 "server_commit"模式下，服务器决定何时提交并合成文本缓冲区中的文本。 "commit"模式下，客户端需要主动通过commit触发语音合成。
`def clear_appended_text(self, ) -> None`	input_text_buffer.cleared 清空服务端收到的文本	删除当前云端缓冲区的文本。
`def commit(self, ) -> None`	input_text_buffer.committed 提交文本并触发语音合成 response.output_item.added 响应时有新的输出内容 response.content_part.added 新的输出内容添加到assistant message 项 response.audio.delta 模型增量生成的音频 response.audio.done 完成音频生成 response.content_part.done Assistant message 的音频内容流式输出完成 response.output_item.done Assistant message 的整个输出项流式传输完成 response.done 响应完成	提交之前通过append添加到云端缓冲区的文本，并立刻合成所有文本。如果输入的文本缓冲区为空将产生错误。 "server_commit"模式下，客户端不需要发送此事件，服务器会自动提交文本缓冲区。 "commit"模式下，客户端必须通过commit触发语音合成。
`def finish(self, ) -> None`	session.finished 响应完成	终止任务。
`def close(self, ) -> None`	无	关闭连接。
`def get_session_id(self) -> str`	无	获取当前任务的session_id。
`def get_last_response_id(self) -> str`	无	获取最近一次response的response_id。
`def get_first_audio_delay(self)`	无	获取首包音频延迟。

回调接口（QwenTtsRealtimeCallback）

服务端会通过回调的方式，将服务端响应事件和数据返回给客户端。您需要实现回调方法，处理服务端返回的信息或者数据。

通过from dashscope.audio.qwen_tts_realtime import QwenTtsRealtimeCallback引入。

方法	参数	返回值	描述
`def on_open(self) -> None`	无	无	当和服务端建立连接完成后，该方法立刻被回调。
`def on_event(self, message: str) -> None`	message：服务端响应事件。	无	包括对接口调用的回复响应和模型生成的文本和音频。具体可以参考：服务端事件
`def on_close(self, close_status_code, close_msg) -> None`	close_status_code：关闭websocket的状态码。 close_msg：关闭websocket的关闭信息。	无	当服务已经关闭连接后进行回调。