# coding=utf-8
# Installation instructions for pyaudio:
# APPLE Mac OS X
# brew install portaudio
# pip install pyaudio
# Debian/Ubuntu
# sudo apt-get install python-pyaudio python3-pyaudio
# or
# pip install pyaudio
# CentOS
# sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
# python -m pip install pyaudio
import pyaudio
import dashscope
from dashscope.audio.tts_v2 import *
from http import HTTPStatus
from dashscope import Generation
# If the API Key is not configured as an environment variable, uncomment the following line and replace apiKey with your own API Key
# dashscope.api_key = "apiKey"
model = "cosyvoice-v1"
voice = "longxiaochun"
class Callback(ResultCallback):
    _player = None
    _stream = None

    def on_open(self):
        print("websocket is open.")
        self._player = pyaudio.PyAudio()
        self._stream = self._player.open(
            format=pyaudio.paInt16, channels=1, rate=22050, output=True
        )

    def on_complete(self):
        print("speech synthesis task completed successfully.")

    def on_error(self, message: str):
        print(f"speech synthesis task failed, {message}")

    def on_close(self):
        print("websocket is closed.")
        # stop the player
        self._stream.stop_stream()
        self._stream.close()
        self._player.terminate()

    def on_event(self, message):
        print(f"recv speech synthesis message {message}")

    def on_data(self, data: bytes) -> None:
        print("audio result length:", len(data))
        self._stream.write(data)
def synthesizer_with_llm():
    callback = Callback()
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=voice,
        format=AudioFormat.PCM_22050HZ_MONO_16BIT,
        callback=callback,
    )

    messages = [{"role": "user", "content": "请介绍一下你自己"}]  # "Please introduce yourself"
    responses = Generation.call(
        model="qwen-turbo",
        messages=messages,
        result_format="message",  # set result format as 'message'
        stream=True,  # enable stream output
        incremental_output=True,  # enable incremental output
    )
    for response in responses:
        if response.status_code == HTTPStatus.OK:
            print(response.output.choices[0]["message"]["content"], end="")
            # stream each incremental piece of LLM output into the synthesizer
            synthesizer.streaming_call(response.output.choices[0]["message"]["content"])
        else:
            print(
                "Request id: %s, Status code: %s, error code: %s, error message: %s"
                % (
                    response.request_id,
                    response.status_code,
                    response.code,
                    response.message,
                )
            )
    synthesizer.streaming_complete()
    print('requestId: ', synthesizer.get_last_request_id())


if __name__ == "__main__":
    synthesizer_with_llm()
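If you also want to keep the synthesized audio, the same callback interface can write the raw PCM bytes to a file instead of (or in addition to) playing them. The following is a minimal, illustrative sketch, not part of the official sample: the FileCallback class name and output file name are hypothetical, and it assumes the same imports and synthesizer setup as the example above.

# Illustrative only: a file-writing variant of the Callback class above.
# Assumes the same imports (dashscope.audio.tts_v2) as the example above.
class FileCallback(ResultCallback):
    _file = None

    def on_open(self):
        # The output path is arbitrary; the data is raw 22050 Hz, 16-bit, mono PCM.
        self._file = open("output_22050hz_mono_16bit.pcm", "wb")

    def on_data(self, data: bytes) -> None:
        # Append each audio chunk delivered by the service to the file.
        self._file.write(data)

    def on_complete(self):
        print("speech synthesis task completed successfully.")

    def on_error(self, message: str):
        print(f"speech synthesis task failed, {message}")

    def on_close(self):
        if self._file is not None:
            self._file.close()

Passing callback=FileCallback() to SpeechSynthesizer would then write the streamed audio to disk rather than to the speaker.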
Java
import com.alibaba.dashscope.aigc.generation.Generation;
import com.alibaba.dashscope.aigc.generation.GenerationParam;
import com.alibaba.dashscope.aigc.generation.GenerationResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.Message;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import io.reactivex.Flowable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.sound.sampled.*;
public class Main {
    private static String model = "cosyvoice-v1";
    private static String voice = "longxiaochun";

    public static void process() throws NoApiKeyException, InputRequiredException {
        // Playback thread
        class PlaybackRunnable implements Runnable {
            // Set the audio format according to your device, the synthesized audio
            // parameters, and your platform. Here it is 22050 Hz, 16-bit, mono; choose
            // another sample rate and format based on the model's sample rate and your
            // device's compatibility if needed.
            private AudioFormat af = new AudioFormat(22050, 16, 1, true, false);
            private DataLine.Info info = new DataLine.Info(SourceDataLine.class, af);
            private SourceDataLine targetSource = null;
            private AtomicBoolean runFlag = new AtomicBoolean(true);
            private ConcurrentLinkedQueue<ByteBuffer> queue = new ConcurrentLinkedQueue<>();

            // Prepare the player
            public void prepare() throws LineUnavailableException {
                targetSource = (SourceDataLine) AudioSystem.getLine(info);
                targetSource.open(af, 4096);
                targetSource.start();
            }
            public void put(ByteBuffer buffer) {
                queue.add(buffer);
            }

            // Stop playback
            public void stop() {
                runFlag.set(false);
            }

            @Override
            public void run() {
                if (targetSource == null) {
                    return;
                }
                while (runFlag.get()) {
                    if (queue.isEmpty()) {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // ignore and keep polling
                        }
                        continue;
                    }
                    ByteBuffer buffer = queue.poll();
                    if (buffer == null) {
                        continue;
                    }
                    byte[] data = buffer.array();
                    targetSource.write(data, 0, data.length);
                }
                // Play whatever is still buffered
                if (!queue.isEmpty()) {
                    ByteBuffer buffer = null;
                    while ((buffer = queue.poll()) != null) {
                        byte[] data = buffer.array();
                        targetSource.write(data, 0, data.length);
                    }
                }
                // Release the player
                targetSource.drain();
                targetSource.stop();
                targetSource.close();
            }
        }
        // Create a subclass of ResultCallback<SpeechSynthesisResult>
        // to implement the callback interface
        class ReactCallback extends ResultCallback<SpeechSynthesisResult> {
            private PlaybackRunnable playbackRunnable = null;

            public ReactCallback(PlaybackRunnable playbackRunnable) {
                this.playbackRunnable = playbackRunnable;
            }

            // Called when the service returns a streaming synthesis result
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                // Get the binary data of the streaming result via getAudioFrame
                if (result.getAudioFrame() != null) {
                    // Stream the data to the player
                    playbackRunnable.put(result.getAudioFrame());
                }
            }

            // Called when the service finishes synthesis
            @Override
            public void onComplete() {
                // Notify the playback thread to stop
                playbackRunnable.stop();
            }

            // Called when an error occurs
            @Override
            public void onError(Exception e) {
                // Notify the playback thread to stop
                System.out.println(e);
                playbackRunnable.stop();
            }
        }
        PlaybackRunnable playbackRunnable = new PlaybackRunnable();
        try {
            playbackRunnable.prepare();
        } catch (LineUnavailableException e) {
            throw new RuntimeException(e);
        }
        Thread playbackThread = new Thread(playbackRunnable);
        // Start the playback thread
        playbackThread.start();

        /******* Call the generative AI model to get streaming text *******/
        // Prepare the LLM call
        Generation gen = new Generation();
        Message userMsg = Message.builder()
                .role(Role.USER.getValue())
                .content("请介绍一下你自己")
                .build();
        GenerationParam genParam = GenerationParam.builder()
                // If the API Key is not configured as an environment variable,
                // uncomment the following line and replace apikey with your own API Key
                // .apiKey("apikey")
                .model("qwen-turbo")
                .messages(Arrays.asList(userMsg))
                .resultFormat(GenerationParam.ResultFormat.MESSAGE)
                .topP(0.8)
                .incrementalOutput(true)
                .build();

        // Prepare the speech synthesis task
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // If the API Key is not configured as an environment variable,
                // uncomment the following line and replace apikey with your own API Key
                // .apiKey("apikey")
                .model(model)
                .voice(voice)
                .format(SpeechSynthesisAudioFormat.PCM_22050HZ_MONO_16BIT)
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, new ReactCallback(playbackRunnable));
        Flowable<GenerationResult> result = gen.streamCall(genParam);
        result.blockingForEach(message -> {
            String text = message.getOutput().getChoices().get(0).getMessage().getContent();
            System.out.println("LLM output: " + text);
            synthesizer.streamingCall(text);
        });
        synthesizer.streamingComplete();
        System.out.print("requestId: " + synthesizer.getLastRequestId());
        try {
            // Wait for the playback thread to finish playing all remaining audio
            playbackThread.join();
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws NoApiKeyException, InputRequiredException {
        process();
        System.exit(0);
    }
}
import dashscope
from dashscope.audio.tts import SpeechSynthesizer

# If the API Key is not configured as an environment variable, uncomment the following line and replace apiKey with your own API Key
# dashscope.api_key = "apiKey"

result = SpeechSynthesizer.call(model='sambert-zhichu-v1',
                                # When the language of the text changes, make sure the model matches. Different models support different languages; see the "Language" column in the Sambert voice list.
                                text='今天天气怎么样',
                                sample_rate=48000,
                                format='wav')
if result.get_audio_data() is not None:
    with open('output.wav', 'wb') as f:
        f.write(result.get_audio_data())
print(' get response: %s' % (result.get_response()))
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.common.ResultCallback;
import com.alibaba.dashscope.common.Status;
import java.io.*;
import java.nio.ByteBuffer;

public class Main {
    public static void SyncAudioDataToFile() {
        SpeechSynthesizer synthesizer = new SpeechSynthesizer();
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // If the API Key is not configured as an environment variable,
                // uncomment the following line and replace apikey with your own API Key
                // .apiKey(apikey)
                .model("sambert-zhichu-v1")
                // When the language of the text changes, make sure the model matches. Different models
                // support different languages; see the "Language" column in the Sambert voice list.
                .text("今天天气怎么样")
                .sampleRate(48000)
                .format(SpeechSynthesisAudioFormat.WAV)
                .build();
        File file = new File("output.wav");
        // Call the call method with the param to get the synthesized audio
        ByteBuffer audio = synthesizer.call(param);
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(audio.array());
            System.out.println("synthesis done!");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) {
        SyncAudioDataToFile();
        System.exit(0);
    }
}
Play the synthesized audio through the speaker
After speech is synthesized, play the audio returned in real time on the local device.
Before running the Python example, install the third-party audio playback library with pip.
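Optionally, before running the example, you can confirm that pyaudio is installed and that the default output device accepts the 48 kHz, 16-bit, mono format used below. This small check script is illustrative, not part of the official sample, and assumes the default output device is available.

# coding=utf-8
# Optional pre-flight check (illustrative, not part of the official sample):
# verify that pyaudio can open an output stream in the format used by the example below.
import pyaudio

player = pyaudio.PyAudio()
stream = player.open(format=pyaudio.paInt16, channels=1, rate=48000, output=True)
print("pyaudio output stream opened at 48 kHz, 16-bit, mono.")
stream.close()
player.terminate()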
# coding=utf-8
#
# Installation instructions for pyaudio:
# APPLE Mac OS X
# brew install portaudio
# pip install pyaudio
# Debian/Ubuntu
# sudo apt-get install python-pyaudio python3-pyaudio
# or
# pip install pyaudio
# CentOS
# sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
# python -m pip install pyaudio
import dashscope
import sys
import pyaudio
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
# If the API Key is not configured as an environment variable, uncomment the following line and replace apiKey with your own API Key
# dashscope.api_key = "apiKey"
class Callback(ResultCallback):
    _player = None
    _stream = None

    def on_open(self):
        print('Speech synthesizer is opened.')
        self._player = pyaudio.PyAudio()
        self._stream = self._player.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=48000,
            output=True)

    def on_complete(self):
        print('Speech synthesizer is completed.')

    def on_error(self, response: SpeechSynthesisResponse):
        print('Speech synthesizer failed, response is %s' % (str(response)))

    def on_close(self):
        print('Speech synthesizer is closed.')
        self._stream.stop_stream()
        self._stream.close()
        self._player.terminate()

    def on_event(self, result: SpeechSynthesisResult):
        if result.get_audio_frame() is not None:
            print('audio result length:', sys.getsizeof(result.get_audio_frame()))
            self._stream.write(result.get_audio_frame())
        if result.get_timestamp() is not None:
            print('timestamp result:', str(result.get_timestamp()))


callback = Callback()
SpeechSynthesizer.call(model='sambert-zhichu-v1',
                       text='今天天气怎么样',
                       sample_rate=48000,
                       format='pcm',
                       callback=callback)
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.common.ResultCallback;
import java.nio.ByteBuffer;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.sound.sampled.*;
public class Main {
    public static void StreamAudioDataToSpeaker() {
        CountDownLatch latch = new CountDownLatch(1);
        SpeechSynthesizer synthesizer = new SpeechSynthesizer();
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // If the API Key is not configured as an environment variable,
                // uncomment the following line and replace apikey with your own API Key
                // .apiKey("apikey")
                .text("今天天气怎么样")
                .model("sambert-zhichu-v1")
                .sampleRate(48000)
                .format(SpeechSynthesisAudioFormat.PCM) // use PCM or MP3 for streaming synthesis
                .build();
        // Playback thread
        class PlaybackRunnable implements Runnable {
            // Set the audio format according to your device, the synthesized audio
            // parameters, and your platform. Here it is 48 kHz, 16-bit, mono; choose
            // another sample rate and format based on the model's sample rate and your
            // device's compatibility if needed.
            private AudioFormat af = new AudioFormat(48000, 16, 1, true, false);
            private DataLine.Info info = new DataLine.Info(SourceDataLine.class, af);
            private SourceDataLine targetSource = null;
            private AtomicBoolean runFlag = new AtomicBoolean(true);
            private ConcurrentLinkedQueue<ByteBuffer> queue = new ConcurrentLinkedQueue<>();

            // Prepare the player
            public void prepare() throws LineUnavailableException {
                targetSource = (SourceDataLine) AudioSystem.getLine(info);
                targetSource.open(af, 4096);
                targetSource.start();
            }

            public void put(ByteBuffer buffer) {
                queue.add(buffer);
            }

            // Stop playback
            public void stop() {
                runFlag.set(false);
            }

            @Override
            public void run() {
                if (targetSource == null) {
                    return;
                }
                while (runFlag.get()) {
                    if (queue.isEmpty()) {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                            // ignore and keep polling
                        }
                        continue;
                    }
                    ByteBuffer buffer = queue.poll();
                    if (buffer == null) {
                        continue;
                    }
                    byte[] data = buffer.array();
                    targetSource.write(data, 0, data.length);
                }
                // Play whatever is still buffered
                if (!queue.isEmpty()) {
                    ByteBuffer buffer = null;
                    while ((buffer = queue.poll()) != null) {
                        byte[] data = buffer.array();
                        targetSource.write(data, 0, data.length);
                    }
                }
                // Release the player
                targetSource.drain();
                targetSource.stop();
                targetSource.close();
            }
        }
        // Create a subclass of ResultCallback<SpeechSynthesisResult> to implement the callback interface
        class ReactCallback extends ResultCallback<SpeechSynthesisResult> {
            private PlaybackRunnable playbackRunnable = null;

            public ReactCallback(PlaybackRunnable playbackRunnable) {
                this.playbackRunnable = playbackRunnable;
            }

            // Called when the service returns a streaming synthesis result
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                // Get the binary data of the streaming result via getAudioFrame
                if (result.getAudioFrame() != null) {
                    // Stream the data to the player
                    playbackRunnable.put(result.getAudioFrame());
                }
            }

            // Called when the service finishes synthesis
            @Override
            public void onComplete() {
                // Notify the playback thread to stop
                playbackRunnable.stop();
                latch.countDown();
            }

            // Called when an error occurs
            @Override
            public void onError(Exception e) {
                // Notify the playback thread to stop
                System.out.println(e);
                playbackRunnable.stop();
                latch.countDown();
            }
        }
        PlaybackRunnable playbackRunnable = new PlaybackRunnable();
        try {
            playbackRunnable.prepare();
        } catch (LineUnavailableException e) {
            throw new RuntimeException(e);
        }
        Thread playbackThread = new Thread(playbackRunnable);
        // Start the playback thread
        playbackThread.start();
        // The call method with a callback does not block the current thread
        synthesizer.call(param, new ReactCallback(playbackRunnable));
        // Wait for synthesis to complete
        try {
            latch.await();
            // Wait for the playback thread to finish playing all remaining audio
            playbackThread.join();
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) {
        StreamAudioDataToSpeaker();
        System.exit(0);
    }
}