说明
支持的领域 / 任务:audio(音频) / ttsv2(语音合成)。
CosyVoice是通义实验室依托大规模预训练语言模型,深度融合文本理解和语音生成的新一代生成式语音合成大模型,支持文本至语音的实时流式合成。可以应用于:
智能设备/机器人播报的语音内容,如智能客服机器人、智能音箱、数字人、语音助手等。
音视频创作中需要将文字转为语音播报的场景,如小说阅读、新闻播报、影视解说、剧本配音等。
基本概念
输入和输出
语音合成服务中,输入的是待合成文本,输出的是合成后的音频。
流式和非流式
在一次请求中只能调用一次、并一次性返回全部结果的方式称为非流式调用;在一次请求中可以多次调用、或者可以多次返回结果的方式称为流式调用。
其中,根据输入和输出的不同,调用方式也可以做组合,比如:
非流式合成:一次输入合成文本,一次返回所有对应文本的合成音频。
流式输出合成:一次输入合成文本,多次顺序返回合成音频。
全双工流式合成:多次输入合成文本,多次返回合成音频。
CosyVoice语音合成API支持以上所有模式的合成形态,方便用户在不同场景灵活定制产品形态。
前提条件
示例代码
更多常用场景的代码示例,请参见GitHub。
将合成音频保存为文件(非流式合成)
以下代码展示了如何将返回的二进制音频保存为本地文件。
如您想使用Python和Java以外的编程语言,可以使用WebSocket协议进行调用。
# coding=utf-8
import dashscope
from dashscope.audio.tts_v2 import *
# Replace the placeholder with your own DashScope API key before running.
dashscope.api_key = "your-dashscope-api-key"
model = "cosyvoice-v1"
voice = "longxiaochun"

# No callback is passed, so call() hands back the synthesized audio directly.
synthesizer = SpeechSynthesizer(model=model, voice=voice)
audio = synthesizer.call("今天天气怎么样?")
print('requestId: ', synthesizer.get_last_request_id())

# Store the returned binary audio in a local file.
with open('output.mp3', 'wb') as f:
    f.write(audio)
package SpeechSynthesisDemo;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
/**
 * Synthesizes one fixed sentence with a single blocking call and stores the returned audio
 * in {@code output.mp3} in the working directory.
 */
public class Tts2File {
  // Needs to be replaced with your actual API key.
  private static String apikey = "your-dashscope-api-key";
  private static String model = "cosyvoice-v1";
  private static String voice = "longxiaochun";

  /**
   * Sends the text to the synthesizer, prints the request id and writes the audio to disk.
   *
   * @throws RuntimeException wrapping the {@link IOException} raised while writing the file
   */
  public static void process() {
    SpeechSynthesisParam request =
        SpeechSynthesisParam.builder().apiKey(apikey).model(model).voice(voice).build();

    // No callback is supplied; call() returns the audio as a ByteBuffer.
    SpeechSynthesizer tts = new SpeechSynthesizer(request, null);
    ByteBuffer synthesized = tts.call("今天天气怎么样?");
    System.out.print("requestId: " + tts.getLastRequestId());

    try (FileOutputStream out = new FileOutputStream("output.mp3")) {
      // NOTE(review): array() exposes the whole backing array and ignores position/limit;
      // presumably the SDK returns a heap buffer that is exactly the audio - confirm.
      byte[] payload = synthesized.array();
      out.write(payload);
    } catch (IOException ioFailure) {
      throw new RuntimeException(ioFailure);
    }
  }

  /** Runs the demo, then terminates the JVM explicitly with exit status 0. */
  public static void main(String[] args) {
    process();
    System.exit(0);
  }
}
将LLM生成的文本通过扬声器实时播放(全双工流式合成)
以下代码展示调用成功后,通过本地设备播放通义千问大语言模型(qwen-turbo)实时返回的文本内容。
说明
需要使用您的API-KEY替换示例中的your-dashscope-api-key,代码才能正常运行。
运行Python示例前,需要通过pip安装第三方音频播放套件。
# coding=utf-8
# Installation instructions for pyaudio:
# APPLE Mac OS X
# brew install portaudio
# pip install pyaudio
# Debian/Ubuntu
# sudo apt-get install python-pyaudio python3-pyaudio
# or
# pip install pyaudio
# CentOS
# sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
# python -m pip install pyaudio
import pyaudio
import dashscope
from dashscope.audio.tts_v2 import *
from http import HTTPStatus
from dashscope import Generation
# Replace the placeholder with your own DashScope API key; the sample does not run without it.
dashscope.api_key = "your-dashscope-api-key"
# Model name and voice handed to SpeechSynthesizer in synthesizer_with_llm().
model = "cosyvoice-v1"
voice = "longxiaochun"
class Callback(ResultCallback):
    """Plays synthesized audio through PyAudio as the chunks arrive.

    The player and its output stream are created in on_open() and released in
    on_close(); both attributes stay None until the websocket has opened.
    """

    _player = None
    _stream = None

    def on_open(self):
        print("websocket is open.")
        self._player = pyaudio.PyAudio()
        # 16-bit mono at 22050 Hz, the same layout as the
        # AudioFormat.PCM_22050HZ_MONO_16BIT requested from the synthesizer.
        self._stream = self._player.open(
            format=pyaudio.paInt16, channels=1, rate=22050, output=True
        )

    def on_complete(self):
        print("speech synthesis task complete successfully.")

    def on_error(self, message: str):
        print(f"speech synthesis task failed, {message}")

    def on_close(self):
        print("websocket is closed.")
        # stop player; skip whatever on_open() never created so that a close
        # without a preceding open does not raise AttributeError on None.
        if self._stream is not None:
            self._stream.stop_stream()
            self._stream.close()
            self._stream = None
        if self._player is not None:
            self._player.terminate()
            self._player = None

    def on_event(self, message):
        print(f"recv speech synthsis message {message}")

    def on_data(self, data: bytes) -> None:
        print("audio result length:", len(data))
        # Nothing to play into if the output stream is not (or no longer) open.
        if self._stream is not None:
            self._stream.write(data)
def synthesizer_with_llm():
    """Feed the streamed qwen-turbo reply into the speech synthesizer chunk by chunk.

    Each successful incremental response is echoed to stdout and passed to
    streaming_call(); failed responses are reported and skipped. Once the
    response stream is exhausted, streaming_complete() is called and the
    request id of the synthesis task is printed.
    """
    callback = Callback()
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=voice,
        format=AudioFormat.PCM_22050HZ_MONO_16BIT,
        callback=callback,
    )

    messages = [{"role": "user", "content": "请介绍一下你自己"}]
    responses = Generation.call(
        model="qwen-turbo",
        messages=messages,
        result_format="message",  # set result format as 'message'
        stream=True,  # enable stream output
        incremental_output=True,  # enable incremental output
    )
    for response in responses:
        if response.status_code != HTTPStatus.OK:
            print(
                "Request id: %s, Status code: %s, error code: %s, error message: %s"
                % (
                    response.request_id,
                    response.status_code,
                    response.code,
                    response.message,
                )
            )
            continue
        # Extract the chunk once and reuse it for both the echo and the synthesizer.
        text = response.output.choices[0]["message"]["content"]
        print(text, end="")
        synthesizer.streaming_call(text)
    synthesizer.streaming_complete()
    print('requestId: ', synthesizer.get_last_request_id())
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    synthesizer_with_llm()
package SpeechSynthesisDemo;
import com.alibaba.dashscope.aigc.generation.Generation;
import com.alibaba.dashscope.aigc.generation.GenerationParam;
import com.alibaba.dashscope.aigc.generation.GenerationResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.Message;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.utils.Constants;
import io.reactivex.Flowable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.sound.sampled.*;
/**
 * Streams the text produced by the qwen-turbo model into the speech synthesizer and plays the
 * returned PCM audio on a local audio output line while it is still being generated.
 */
public class LLM2Player {
  // Needs to be replaced with your actual apikey
  private static String apikey = "your-dashscope-api-key";
  private static String model = "cosyvoice-v1";
  private static String voice = "longxiaochun";

  /**
   * Starts a playback thread, forwards every incremental LLM chunk to the synthesizer and
   * waits until the playback thread has finished.
   *
   * <p>If the LLM call or the synthesis setup fails, the playback thread is told to stop
   * before the exception leaves this method, so it does not keep polling forever.
   *
   * @throws NoApiKeyException declared for the DashScope calls made here
   * @throws InputRequiredException declared for the DashScope calls made here
   * @throws InterruptedException declared for backward compatibility; an interruption while
   *     waiting for playback is rethrown as a {@link RuntimeException}
   */
  public static void process()
      throws InterruptedException, NoApiKeyException, InputRequiredException {
    // Playback thread
    class PlaybackRunnable implements Runnable {
      // Audio format of the output line: 22050 Hz, 16-bit, mono, signed, little-endian.
      // It has to agree with the format requested from the synthesizer; choose another
      // sample rate or format if your model or device requires it.
      private final AudioFormat af = new AudioFormat(22050, 16, 1, true, false);
      private final DataLine.Info info = new DataLine.Info(SourceDataLine.class, af);
      private SourceDataLine targetSource = null;
      private final AtomicBoolean runFlag = new AtomicBoolean(true);
      private final ConcurrentLinkedQueue<ByteBuffer> queue = new ConcurrentLinkedQueue<>();

      // Prepare the player
      public void prepare() throws LineUnavailableException {
        targetSource = (SourceDataLine) AudioSystem.getLine(info);
        targetSource.open(af, 4096);
        targetSource.start();
      }

      // Queue one audio chunk for playback
      public void put(ByteBuffer buffer) {
        queue.add(buffer);
      }

      // Stop playback; chunks that are already queued are still played
      public void stop() {
        runFlag.set(false);
      }

      // Write one queued chunk to the audio line.
      // NOTE(review): array() ignores the buffer's position/limit; presumably each frame
      // is a heap buffer holding exactly one chunk - confirm against the SDK.
      private void play(ByteBuffer buffer) {
        byte[] data = buffer.array();
        targetSource.write(data, 0, data.length);
      }

      @Override
      public void run() {
        if (targetSource == null) {
          return;
        }
        while (runFlag.get()) {
          ByteBuffer buffer = queue.poll();
          if (buffer == null) {
            try {
              Thread.sleep(100);
            } catch (InterruptedException e) {
              // Keep the interrupt visible to the thread's owner and stop polling
              // instead of silently swallowing the request.
              Thread.currentThread().interrupt();
              break;
            }
            continue;
          }
          play(buffer);
        }
        // Play all remaining cache
        ByteBuffer remaining;
        while ((remaining = queue.poll()) != null) {
          play(remaining);
        }
        // Release the player
        targetSource.drain();
        targetSource.stop();
        targetSource.close();
      }
    }

    // Create a subclass inheriting from ResultCallback<SpeechSynthesisResult>
    // to implement the callback interface
    class ReactCallback extends ResultCallback<SpeechSynthesisResult> {
      private final PlaybackRunnable playbackRunnable;

      public ReactCallback(PlaybackRunnable playbackRunnable) {
        this.playbackRunnable = playbackRunnable;
      }

      // Callback when the service side returns the streaming synthesis result
      @Override
      public void onEvent(SpeechSynthesisResult result) {
        // Get the binary data of the streaming result via getAudioFrame
        ByteBuffer frame = result.getAudioFrame();
        if (frame != null) {
          // Stream the data to the player
          playbackRunnable.put(frame);
        }
      }

      // Callback when the service side completes the synthesis
      @Override
      public void onComplete() {
        // Notify the playback thread to end
        playbackRunnable.stop();
      }

      // Callback when an error occurs
      @Override
      public void onError(Exception e) {
        // Tell the playback thread to end
        System.out.println(e);
        playbackRunnable.stop();
      }
    }

    PlaybackRunnable playbackRunnable = new PlaybackRunnable();
    try {
      playbackRunnable.prepare();
    } catch (LineUnavailableException e) {
      throw new RuntimeException(e);
    }
    Thread playbackThread = new Thread(playbackRunnable);
    // Start the playback thread
    playbackThread.start();

    boolean submitted = false;
    try {
      /******* Call the Generative AI Model to get streaming text *******/
      // Prepare for the LLM call
      Generation gen = new Generation();
      Message userMsg =
          Message.builder().role(Role.USER.getValue()).content("请介绍一下你自己").build();
      GenerationParam genParam =
          GenerationParam.builder()
              .apiKey(apikey)
              .model("qwen-turbo")
              .messages(Arrays.asList(userMsg))
              .resultFormat(GenerationParam.ResultFormat.MESSAGE)
              .topP(0.8)
              .incrementalOutput(true)
              .build();

      // Prepare the speech synthesis task
      SpeechSynthesisParam param =
          SpeechSynthesisParam.builder()
              .apiKey(apikey)
              .model(model)
              .voice(voice)
              .format(SpeechSynthesisAudioFormat.PCM_22050HZ_MONO_16BIT)
              .build();
      SpeechSynthesizer synthesizer =
          new SpeechSynthesizer(param, new ReactCallback(playbackRunnable));

      Flowable<GenerationResult> result = gen.streamCall(genParam);
      result.blockingForEach(
          message -> {
            String text = message.getOutput().getChoices().get(0).getMessage().getContent();
            System.out.println("LLM output:" + text);
            synthesizer.streamingCall(text);
          });
      synthesizer.streamingComplete();
      System.out.print("requestId: " + synthesizer.getLastRequestId());
      submitted = true;
    } finally {
      if (!submitted) {
        // The callbacks that normally end playback may never fire after a failure;
        // without this the non-daemon playback thread would loop forever.
        playbackRunnable.stop();
      }
    }

    try {
      // Wait for the playback thread to finish playing all
      playbackThread.join();
    } catch (InterruptedException e) {
      // Restore the flag so callers further up can still observe the interruption.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  /** Runs the demo, then terminates the JVM explicitly with exit status 0. */
  public static void main(String[] args)
      throws InterruptedException, NoApiKeyException, InputRequiredException {
    process();
    System.exit(0);
  }
}
了解更多
文档内容是否对您有帮助?