快速开始

说明

支持的领域 / 任务:audio(音频) / ttsv2(语音合成)。

CosyVoice是通义实验室依托大规模预训练语言模型,深度融合文本理解和语音生成的新一代生成式语音合成大模型,支持文本至语音的实时流式合成。可以应用于:

  • 智能设备/机器人播报的语音内容,如智能客服机器人、智能音箱、数字人、语音助手等。

  • 音视频创作中需要将文字转为语音播报的场景,如小说阅读、新闻播报、影视解说、剧本配音等。

基本概念

  • 输入和输出

    语音合成服务中,输入的是待合成文本,输出的是合成后的音频。

  • 流式和非流式

    一次请求中只能发送一次输入、并且一次性返回全部结果的调用方式称为非流式调用;一次请求中可以多次发送输入,或者可以多次返回结果的调用方式称为流式调用。

    其中,根据输入和输出的不同,调用方式也可以做组合,比如:

    • 非流式合成:一次输入合成文本,一次返回所有对应文本的合成音频。

    • 流式输出合成:一次输入合成文本,多次顺序返回合成音频。

    • 全双工流式合成:多次输入合成文本,多次返回合成音频。

    CosyVoice语音合成API支持以上所有模式的合成形态,方便用户在不同场景灵活定制产品形态。

前提条件

示例代码

更多常用场景的代码示例,请参见GitHub。

将合成音频保存为文件(非流式合成)

以下代码展示了如何将返回的二进制音频保存为本地文件。

# coding=utf-8

import dashscope
from dashscope.audio.tts_v2 import *

# Replace the placeholder with your own DashScope API key before running.
dashscope.api_key = "your-dashscope-api-key"
# Model and voice used for this synthesis request.
model, voice = "cosyvoice-v1", "longxiaochun"

synthesizer = SpeechSynthesizer(model=model, voice=voice)

# Send the whole text in one call; the returned value is the binary audio
# that is written to disk below.
audio = synthesizer.call("今天天气怎么样?")
print('requestId: ', synthesizer.get_last_request_id())

# Save the audio bytes to a local file.
with open('output.mp3', 'wb') as output_file:
    output_file.write(audio)
package SpeechSynthesisDemo;

import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

/**
 * Non-streaming synthesis demo: sends one sentence to the synthesizer in a single call and
 * saves the returned audio to {@code output.mp3}.
 */
public class Tts2File {
  // Replace with your actual DashScope API key.
  private static final String API_KEY = "your-dashscope-api-key";
  private static final String MODEL = "cosyvoice-v1";
  private static final String VOICE = "longxiaochun";
  private static final String OUTPUT_PATH = "output.mp3";

  /**
   * Synthesizes a fixed sentence, prints the request id and writes the audio to
   * {@code output.mp3}.
   *
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be written
   */
  public static void process() {
    SpeechSynthesisParam param =
        SpeechSynthesisParam.builder()
            .apiKey(API_KEY)
            .model(MODEL)
            .voice(VOICE)
            .build();
    // No callback is registered; the audio is taken from the return value of call().
    SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
    ByteBuffer audio = synthesizer.call("今天天气怎么样?");
    // println so the request id is terminated by a newline, like the Python sample's print().
    System.out.println("requestId: " + synthesizer.getLastRequestId());
    try (FileOutputStream fos = new FileOutputStream(new File(OUTPUT_PATH))) {
      fos.write(toByteArray(audio));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the bytes held by {@code buffer}.
   *
   * <p>An array-backed buffer yields its backing array. A direct or read-only buffer, for which
   * {@link ByteBuffer#array()} would throw, is copied from its remaining bytes instead.
   */
  private static byte[] toByteArray(ByteBuffer buffer) {
    if (buffer.hasArray()) {
      // NOTE(review): assumes the backing array contains exactly the audio data
      // (no offset, no spare capacity) — confirm against the SDK.
      return buffer.array();
    }
    // duplicate() leaves the position of the caller's buffer untouched.
    ByteBuffer view = buffer.duplicate();
    byte[] bytes = new byte[view.remaining()];
    view.get(bytes);
    return bytes;
  }

  public static void main(String[] args) {
    process();
    // Terminate the JVM explicitly once the demo has finished.
    System.exit(0);
  }
}

将LLM生成的文本通过扬声器实时播放(全双工流式合成)

以下代码展示如何将通义千问大语言模型(qwen-turbo)实时返回的文本内容合成为语音,并通过本地设备实时播放。

说明

需要使用您的API-KEY替换示例中的your-dashscope-api-key,代码才能正常运行。

运行Python示例前,需要通过pip安装第三方音频播放套件pyaudio。

# coding=utf-8
# Installation instructions for pyaudio:
# APPLE Mac OS X
#   brew install portaudio
#   pip install pyaudio
# Debian/Ubuntu
#   sudo apt-get install python-pyaudio python3-pyaudio
#   or
#   pip install pyaudio
# CentOS
#   sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
#   python -m pip install pyaudio

import pyaudio
import dashscope
from dashscope.audio.tts_v2 import *


from http import HTTPStatus
from dashscope import Generation

# Replace the placeholder with your own DashScope API key before running.
dashscope.api_key = "your-dashscope-api-key"
# Speech synthesis model and voice passed to SpeechSynthesizer.
model = "cosyvoice-v1"
voice = "longxiaochun"


class Callback(ResultCallback):
    """Plays synthesized audio chunks on the local output device through PyAudio.

    The output stream is created in on_open() and released in on_close().
    """

    # PyAudio instance and its output stream; both stay None until on_open() runs.
    _player = None
    _stream = None

    def on_open(self):
        """Open the audio output stream."""
        print("websocket is open.")
        self._player = pyaudio.PyAudio()
        # 16-bit mono PCM at 22050 Hz.
        self._stream = self._player.open(
            format=pyaudio.paInt16, channels=1, rate=22050, output=True
        )

    def on_complete(self):
        """Report that the synthesis task finished."""
        print("speech synthesis task complete successfully.")

    def on_error(self, message: str):
        """Report a failed synthesis task together with the received message."""
        print(f"speech synthesis task failed, {message}")

    def on_close(self):
        """Stop playback and release the audio device."""
        print("websocket is closed.")
        # Both attributes are None unless on_open() ran. Check before
        # dereferencing so cleanup cannot raise AttributeError, and reset them
        # so a second on_close() is a no-op.
        if self._stream is not None:
            self._stream.stop_stream()
            self._stream.close()
            self._stream = None
        if self._player is not None:
            self._player.terminate()
            self._player = None

    def on_event(self, message):
        """Log a non-audio message received for the task."""
        print(f"recv speech synthsis message {message}")

    def on_data(self, data: bytes) -> None:
        """Write one chunk of audio bytes to the output stream."""
        print("audio result length:", len(data))
        self._stream.write(data)


def synthesizer_with_llm():
    """Feed the streamed qwen-turbo reply into the speech synthesizer.

    Every incremental text chunk returned by the LLM is printed and forwarded
    with streaming_call(); streaming_complete() is called after the last chunk.
    Audio is delivered to the Callback instance registered on the synthesizer.
    """
    callback = Callback()
    # Do not pass `url` here: no `url` variable is defined in this script, so
    # `url=url` would raise NameError before any request is made.
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=voice,
        format=AudioFormat.PCM_22050HZ_MONO_16BIT,
        callback=callback,
    )

    messages = [{"role": "user", "content": "请介绍一下你自己"}]
    responses = Generation.call(
        model="qwen-turbo",
        messages=messages,
        result_format="message",  # set result format as 'message'
        stream=True,  # enable stream output
        incremental_output=True,  # enable incremental output
    )
    for response in responses:
        if response.status_code == HTTPStatus.OK:
            # Read the chunk once, then echo it and send it for synthesis.
            text = response.output.choices[0]["message"]["content"]
            print(text, end="")
            synthesizer.streaming_call(text)
        else:
            print(
                "Request id: %s, Status code: %s, error code: %s, error message: %s"
                % (
                    response.request_id,
                    response.status_code,
                    response.code,
                    response.message,
                )
            )
    # Signal that no more text will be sent.
    synthesizer.streaming_complete()
    print('requestId: ', synthesizer.get_last_request_id())


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    synthesizer_with_llm()
package SpeechSynthesisDemo;

import com.alibaba.dashscope.aigc.generation.Generation;
import com.alibaba.dashscope.aigc.generation.GenerationParam;
import com.alibaba.dashscope.aigc.generation.GenerationResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.Message;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.utils.Constants;
import io.reactivex.Flowable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.sound.sampled.*;

/**
 * Full-duplex streaming demo: forwards the text streamed by the qwen-turbo LLM to the speech
 * synthesizer and plays the returned PCM audio on a local {@link SourceDataLine}.
 */
public class LLM2Player {
    // Needs to be replaced with your actual apikey
    private static final String API_KEY = "your-dashscope-api-key";
    private static final String MODEL = "cosyvoice-v1";
    private static final String VOICE = "longxiaochun";

    /**
     * Runs the demo: starts a playback thread, streams the LLM reply into the synthesizer and
     * waits until the playback thread has finished.
     *
     * @throws RuntimeException if the audio line is unavailable or the wait for the playback
     *     thread is interrupted
     */
    public static void process()
            throws InterruptedException, NoApiKeyException, InputRequiredException {
        // Playback thread
        class PlaybackRunnable implements Runnable {
            // 22050 Hz, 16-bit, mono, signed, little-endian PCM. Adjust the format to the
            // sample rate requested from the synthesizer and to what the device supports.
            private final AudioFormat af = new AudioFormat(22050, 16, 1, true, false);
            private final DataLine.Info info = new DataLine.Info(SourceDataLine.class, af);
            private SourceDataLine targetSource = null;
            private final AtomicBoolean runFlag = new AtomicBoolean(true);
            // Audio frames handed over by the synthesis callback, consumed by run().
            private final ConcurrentLinkedQueue<ByteBuffer> queue =
                    new ConcurrentLinkedQueue<>();

            // Prepare the player
            public void prepare() throws LineUnavailableException {
                targetSource = (SourceDataLine) AudioSystem.getLine(info);
                targetSource.open(af, 4096);
                targetSource.start();
            }

            // Queue one audio frame for playback
            public void put(ByteBuffer buffer) {
                queue.add(buffer);
            }

            // Stop playback; frames that are already queued are still played by run()
            public void stop() {
                runFlag.set(false);
            }

            // Write one frame to the audio line
            private void play(ByteBuffer buffer) {
                byte[] data;
                if (buffer.hasArray()) {
                    // NOTE(review): assumes the backing array holds exactly the frame data
                    // (no offset, no spare capacity) — confirm against the SDK.
                    data = buffer.array();
                } else {
                    // array() would throw for direct or read-only buffers; copy instead.
                    ByteBuffer view = buffer.duplicate();
                    data = new byte[view.remaining()];
                    view.get(data);
                }
                targetSource.write(data, 0, data.length);
            }

            @Override
            public void run() {
                if (targetSource == null) {
                    return;
                }
                while (runFlag.get()) {
                    ByteBuffer buffer = queue.poll();
                    if (buffer == null) {
                        // Nothing queued yet: wait briefly before polling again.
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // Keep the interrupt visible and stop polling instead of
                            // silently ignoring it.
                            Thread.currentThread().interrupt();
                            break;
                        }
                        continue;
                    }
                    play(buffer);
                }
                // Play all remaining cache
                ByteBuffer remaining;
                while ((remaining = queue.poll()) != null) {
                    play(remaining);
                }
                // Release the player
                targetSource.drain();
                targetSource.stop();
                targetSource.close();
            }
        }
        // Create a subclass inheriting from ResultCallback<SpeechSynthesisResult>
        // to implement the callback interface
        class ReactCallback extends ResultCallback<SpeechSynthesisResult> {
            private final PlaybackRunnable playbackRunnable;

            public ReactCallback(PlaybackRunnable playbackRunnable) {
                this.playbackRunnable = playbackRunnable;
            }

            // Callback when the service side returns the streaming synthesis result
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                // Get the binary data of the streaming result via getAudioFrame
                ByteBuffer frame = result.getAudioFrame();
                if (frame != null) {
                    // Stream the data to the player
                    playbackRunnable.put(frame);
                }
            }

            // Callback when the service side completes the synthesis
            @Override
            public void onComplete() {
                // Notify the playback thread to end
                playbackRunnable.stop();
            }

            // Callback when an error occurs
            @Override
            public void onError(Exception e) {
                // Tell the playback thread to end
                System.out.println(e);
                playbackRunnable.stop();
            }
        }
        PlaybackRunnable playbackRunnable = new PlaybackRunnable();
        try {
            playbackRunnable.prepare();
        } catch (LineUnavailableException e) {
            throw new RuntimeException(e);
        }
        Thread playbackThread = new Thread(playbackRunnable);
        // Start the playback thread
        playbackThread.start();

        boolean submitted = false;
        try {
            /*******  Call the Generative AI Model to get streaming text *******/
            // Prepare for the LLM call
            Generation gen = new Generation();
            Message userMsg = Message.builder()
                    .role(Role.USER.getValue())
                    .content("请介绍一下你自己")
                    .build();
            GenerationParam genParam =
                    GenerationParam.builder()
                            .apiKey(API_KEY)
                            .model("qwen-turbo")
                            .messages(Arrays.asList(userMsg))
                            .resultFormat(GenerationParam.ResultFormat.MESSAGE)
                            .topP(0.8)
                            .incrementalOutput(true)
                            .build();
            // Prepare the speech synthesis task
            SpeechSynthesisParam param =
                    SpeechSynthesisParam.builder()
                            .apiKey(API_KEY)
                            .model(MODEL)
                            .voice(VOICE)
                            .format(SpeechSynthesisAudioFormat
                                    .PCM_22050HZ_MONO_16BIT)
                            .build();
            SpeechSynthesizer synthesizer =
                    new SpeechSynthesizer(param, new ReactCallback(playbackRunnable));
            Flowable<GenerationResult> result = gen.streamCall(genParam);
            // Forward every incremental LLM chunk to the synthesizer as it arrives.
            result.blockingForEach(message -> {
                String text =
                        message.getOutput().getChoices().get(0).getMessage().getContent();
                System.out.println("LLM output:" + text);
                synthesizer.streamingCall(text);
            });
            synthesizer.streamingComplete();
            System.out.println("requestId: " + synthesizer.getLastRequestId());
            submitted = true;
        } finally {
            if (!submitted) {
                // The LLM or synthesis call failed: stop the non-daemon playback thread so
                // it does not keep the JVM alive after the exception propagates.
                playbackRunnable.stop();
            }
        }
        try {
            // Wait for the playback thread to finish playing all
            playbackThread.join();
        } catch (InterruptedException e) {
            // Restore the interrupt flag before wrapping the exception.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args)
            throws InterruptedException, NoApiKeyException, InputRequiredException {
        process();
        // Terminate the JVM explicitly once the demo has finished.
        System.exit(0);
    }
}

了解更多