快速开始

Paraformer语音识别

说明

支持的领域 / 任务:audio(音频) / asr(语音识别)

Paraformer语音识别API基于通义实验室新一代非自回归端到端模型,提供基于实时音频流的语音识别以及对输入的各类音视频文件进行语音识别的能力。可被应用于:

  • 对语音识别结果返回的即时性有严格要求的实时场景,如实时会议记录、实时直播字幕、电话客服等。

  • 对音视频文件中语音内容的识别,从而进行内容理解分析、字幕生成等。

  • 对电话客服呼叫中心录音进行识别,从而进行客服质检等。

快速开始

前提条件

实时语音识别示例代码

实时语音识别是对不限时长的音频流做实时识别,达到“边说边出文字”的效果,内置智能断句,可提供每句话开始结束时间。可用于视频实时直播字幕、实时会议记录、实时法庭庭审记录、智能语音助手等场景。

更多常用场景的代码示例,请参见GitHub

实时语音识别支持通过同步调用的方式识别本地文件。

使用麦克风进行流式语音文字上屏

以下示例展示使用实时语音识别API,使用麦克风进行流式语音识别并进行文字上屏,达到“边说边出文字”的效果。

说明
  • 需要使用您的API-KEY替换示例中的 your-dashscope-api-key ,代码才能正常运行。

  • 运行Python示例前,需要通过pip install pyaudio命令安装第三方音频播放与采集套件。

# For prerequisites running the following sample, visit https://help.aliyun.com/zh/model-studio/getting-started/first-api-call-to-qwen
import os
import signal  # for keyboard events handling (press "Ctrl+C" to terminate recording and translation)
import sys

import dashscope
import pyaudio
from dashscope.audio.asr import *

# Shared microphone state: both names are assigned in Callback.on_open and
# reset to None in Callback.on_close; the main loop polls `stream`.
mic = None
stream = None

# Set recording parameters
sample_rate = 16000  # sampling rate (Hz), passed to Recognition as sample_rate
channels = 1  # mono channel
dtype = 'int16'  # data type
format_pcm = 'pcm'  # the format of the audio data, passed to Recognition as format
block_size = 3200  # number of frames per buffer


def init_dashscope_api_key():
    """Configure the DashScope SDK with an API key.

    The value of the DASHSCOPE_API_KEY environment variable is used when the
    variable is present; otherwise the placeholder below is assigned and has
    to be replaced by hand. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    """
    # Fallback used only when DASHSCOPE_API_KEY is not set: set API-key manually
    manual_key = '<your-dashscope-api-key>'
    dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', manual_key)


# Real-time speech recognition callback
class Callback(RecognitionCallback):
    """Recognition callback that owns the microphone capture stream.

    The microphone is opened in on_open and released in on_close/on_error
    through the module-level ``mic`` and ``stream`` globals, which the main
    loop polls to decide whether to keep sending audio.
    """

    def on_open(self) -> None:
        """Open the default input device with the module's recording parameters."""
        global mic
        global stream
        print('RecognitionCallback open.')
        mic = pyaudio.PyAudio()
        stream = mic.open(format=pyaudio.paInt16,
                          channels=channels,
                          rate=sample_rate,
                          input=True)

    def on_close(self) -> None:
        """Release the microphone when the recognition session closes."""
        print('RecognitionCallback close.')
        self._release_audio()

    def on_complete(self) -> None:
        """Report that the recognition task finished."""
        print('RecognitionCallback completed.')  # translation completed

    def on_error(self, message) -> None:
        """Print the failing request id and error text, release audio, then exit.

        Args:
            message: error result; its ``request_id`` and ``message``
                attributes are printed.
        """
        print('RecognitionCallback task_id: ', message.request_id)
        print('RecognitionCallback error: ', message.message)
        # Stop and close the audio stream if it is still open
        self._release_audio()
        # Forcefully exit the program
        # NOTE(review): sys.exit raises SystemExit in the calling thread; if the
        # SDK invokes this callback off the main thread only that thread ends —
        # confirm against the SDK.
        sys.exit(1)

    def on_event(self, result: RecognitionResult) -> None:
        """Print the recognized text and, at sentence end, request id and usage."""
        sentence = result.get_sentence()
        if 'text' in sentence:
            print('RecognitionCallback text: ', sentence['text'])
            if RecognitionResult.is_sentence_end(sentence):
                print(
                    'RecognitionCallback sentence end, request_id:%s, usage:%s'
                    % (result.get_request_id(), result.get_usage(sentence)))

    def _release_audio(self) -> None:
        """Close the capture stream and PyAudio instance if they exist.

        Safe to call more than once: the globals are cleared before the
        underlying objects are closed, so a second call is a no-op.
        """
        global mic
        global stream
        # Clear the globals first so the main loop stops reading as early as
        # possible and a repeated call finds nothing left to release.
        open_stream, stream = stream, None
        open_mic, mic = mic, None
        if open_stream is not None:
            open_stream.stop_stream()
            open_stream.close()
        if open_mic is not None:
            open_mic.terminate()


def signal_handler(sig, frame):
    """SIGINT handler: stop recognition, print latency metrics and exit.

    Args:
        sig: signal number supplied by the signal module (unused).
        frame: interrupted stack frame supplied by the signal module (unused).
    """
    print('Ctrl+C pressed, stop translation ...')
    # Stop translation
    recognition.stop()
    print('Translation stopped.')

    # Gather the metrics first, then render them with the shared template.
    metric_template = '[Metric] requestId: {}, first package delay ms: {}, last package delay ms: {}'
    last_request_id = recognition.get_last_request_id()
    first_delay_ms = recognition.get_first_package_delay()
    last_delay_ms = recognition.get_last_package_delay()
    print(metric_template.format(last_request_id, first_delay_ms, last_delay_ms))

    # Forcefully exit the program
    sys.exit(0)


# main function
if __name__ == '__main__':
    init_dashscope_api_key()
    print('Initializing ...')

    # The callback opens the microphone in on_open and releases it in on_close
    callback = Callback()

    # Call recognition service by async mode. Recognition parameters such as
    # model, format and sample_rate can be customized; for more information see
    # https://help.aliyun.com/document_detail/2712536.html
    recognition = Recognition(
        # other models: 'paraformer-realtime-v1', 'paraformer-realtime-8k-v1'
        model='paraformer-realtime-v2',
        # 'pcm', 'wav', 'opus', 'speex', 'aac', 'amr'; check the supported formats in the document
        format=format_pcm,
        # support 8000, 16000
        sample_rate=sample_rate,
        semantic_punctuation_enabled=False,
        callback=callback)

    # Start translation
    recognition.start()

    # Ctrl+C is routed to signal_handler, which stops recognition and exits
    signal.signal(signal.SIGINT, signal_handler)
    print("Press 'Ctrl+C' to stop recording and translation...")

    # Forward microphone audio for as long as the callback keeps a stream open
    while stream:
        data = stream.read(block_size, exception_on_overflow=False)
        recognition.send_audio_frame(data)

    recognition.stop()
package com.alibaba.dashscope.sample.recognition.quickstart;

import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.utils.ApiKey;
import io.reactivex.BackpressureStrategy;
import io.reactivex.Flowable;
import java.awt.event.KeyAdapter;
import java.awt.event.KeyEvent;
import java.nio.ByteBuffer;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.TargetDataLine;
import javax.swing.*;

/**
 * This demo showcases how to use Alibaba Cloud's DashScope model for recognition from microphone
 * audio input to text. Press 'Ctrl+C' (captured by a key listener registered on a 1x1 undecorated
 * JFrame) to finish recording and recognition.
 */
public class RecognizeSpeechFromMicrophoneUsingFlowable {

  public static void main(String[] args) {
    // set exit flags: shouldExit[0] is the flag, exitFlag is the lock guarding it
    boolean[] shouldExit = {false};
    Object exitFlag = new Object();

    // Start a new thread to record and recognize
    new Thread(
            () -> {
              try {
                startRecordingAndRecognition(exitFlag, shouldExit);
              } catch (Exception e) {
                e.printStackTrace();
                // Non-zero status: the recognition thread failed
                System.exit(1);
              }
            })
        .start();
    // Install the key listener that raises the exit flag
    waitForExitSignal(exitFlag, shouldExit);
  }

  /**
   * Shows a tiny undecorated frame whose key listener sets the exit flag when Ctrl+C is pressed.
   *
   * @param exitFlag lock object guarding {@code shouldExit}
   * @param shouldExit single-element flag array; element 0 is set to true on Ctrl+C
   */
  private static void waitForExitSignal(Object exitFlag, boolean[] shouldExit) {

    // Create a hidden JFrame to capture key events
    JFrame frame = new JFrame();
    frame.setUndecorated(true);
    frame.setSize(1, 1);
    frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

    System.out.println("Press 'Ctrl+C' to stop recording and recognition...");
    frame.addKeyListener(
        new KeyAdapter() {
          @Override
          public void keyPressed(KeyEvent e) {
            if (e.isControlDown() && e.getKeyCode() == KeyEvent.VK_C) {
              synchronized (exitFlag) {
                shouldExit[0] = true;
                exitFlag.notifyAll(); // notify the recording thread to exit
              }
              System.out.println("Exit signal received. Exiting...");
            }
          }
        });
    frame.setVisible(true);
  }

  /**
   * Streams microphone audio to the recognizer, prints every result, prints the latency metrics
   * once the stream completes, and then exits the JVM.
   *
   * @param exitFlag lock object guarding {@code shouldExit}
   * @param shouldExit single-element flag array that ends the audio stream when set
   * @throws NoApiKeyException declared by the API-key lookup
   */
  private static void startRecordingAndRecognition(Object exitFlag, boolean[] shouldExit)
      throws NoApiKeyException {
    // Create a Flowable<ByteBuffer> for streaming audio data
    Flowable<ByteBuffer> audioSource = createAudioSourceWithControl(exitFlag, shouldExit);
    // Create speech Recognizer
    Recognition recognizer = new Recognition();
    // Create RecognitionParam; the audio Flowable is passed to streamCall below
    RecognitionParam param =
        RecognitionParam.builder()
            .model("paraformer-realtime-v2")
            .format("pcm")
            .sampleRate(16000)
            .apiKey(getDashScopeApiKey()) // set your apikey in config.Environments.yourApikey
            .parameter("semantic_punctuation_enabled", false)
            .build();

    // Stream call interface for streaming audio to recognizer
    recognizer
        .streamCall(param, audioSource)
        .blockingForEach(
            result -> {
              // Subscribe to the output result
              if (result.isSentenceEnd()) {
                System.out.println("Final Result: " + result.getSentence().getText());
              } else {
                System.out.println("Intermediate Result: " + result.getSentence().getText());
              }
            });
    System.out.println("Recognition onComplete! , exit program...");

    System.out.println(
        "[Metric] requestId: "
            + recognizer.getLastRequestId()
            + ", first package delay ms: "
            + recognizer.getFirstPackageDelay()
            + ", last package delay ms: "
            + recognizer.getLastPackageDelay());

    System.exit(0);
  }

  /**
   * Builds a Flowable that captures 16 kHz, 16-bit, mono, signed little-endian PCM from the
   * microphone and emits it in buffers of up to 1024 bytes until the exit flag is set.
   *
   * @param exitFlag lock object guarding {@code shouldExit}
   * @param shouldExit single-element flag array; capture stops once element 0 is true
   * @return the audio source; it completes after the exit flag has been observed
   */
  private static Flowable<ByteBuffer> createAudioSourceWithControl(
      Object exitFlag, boolean[] shouldExit) {
    // Create a Flowable<ByteBuffer> for streaming audio data
    return Flowable.create(
        emitter -> {
          TargetDataLine targetDataLine = null;
          try {
            AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false);
            targetDataLine = AudioSystem.getTargetDataLine(audioFormat);
            targetDataLine.open(audioFormat);
            targetDataLine.start();
            ByteBuffer buffer = ByteBuffer.allocate(1024);

            while (!isExitRequested(exitFlag, shouldExit)) {
              int read = targetDataLine.read(buffer.array(), 0, buffer.capacity());
              if (read > 0) {
                buffer.limit(read);
                emitter.onNext(buffer);
                // A fresh buffer per emission: the emitted one may still be in use downstream
                buffer = ByteBuffer.allocate(1024);
                Thread.sleep(20); // Small delay to control CPU usage
              }
            }
            // Always complete the stream once the loop ends, whichever check saw the flag
            emitter.onComplete();
          } catch (Exception e) {
            emitter.onError(e);
            e.printStackTrace();
            // Non-zero status: audio capture failed
            System.exit(1);
          } finally {
            // Release the capture device on the normal completion path
            if (targetDataLine != null) {
              targetDataLine.stop();
              targetDataLine.close();
            }
          }
        },
        BackpressureStrategy.BUFFER);
  }

  /** Reads the exit flag while holding its lock so an update from the key listener is seen. */
  private static boolean isExitRequested(Object exitFlag, boolean[] shouldExit) {
    synchronized (exitFlag) {
      return shouldExit[0];
    }
  }

  /**
   * Set your DashScope API key. More information: <a
   * href="https://help.aliyun.com/document_detail/2712195.html">...</a> In fact, if you have set
   * DASHSCOPE_API_KEY in your environment variable, you can ignore this, and the SDK will
   * automatically get the api_key from the environment variable
   *
   * @return the key resolved by {@code ApiKey.getApiKey(null)}, or the placeholder string when
   *     none is found
   */
  private static String getDashScopeApiKey() throws NoApiKeyException {
    String dashScopeApiKey = null;
    try {
      ApiKey apiKey = new ApiKey();
      dashScopeApiKey = apiKey.getApiKey(null); // Retrieve from environment variable.
    } catch (NoApiKeyException e) {
      System.out.println("No API key found in environment.");
    }
    if (dashScopeApiKey == null) {
      // If you cannot set api_key in your environment variable,
      // you can set it here by code
      dashScopeApiKey = "your-dashscope-api-key";
    }
    return dashScopeApiKey;
  }
}
重要

在这个示例中,通过semantic_punctuation_enabled参数关闭了语义断句,使用VAD断句从而获得更快的响应速度。关于参数的介绍,请参考实时语音识别API详情。

更多详细案例可参考语音识别实现音视频文件转写及实时文字上屏功能

使用同步接口进行文件转写

以下示例展示使用语音识别同步API接口进行文件转写,对于对话聊天、控制口令、语音输入法、语音搜索等较短的准实时语音识别场景可考虑采用该接口进行语音识别。

# For prerequisites running the following sample, visit https://help.aliyun.com/document_detail/611472.html

import requests
from http import HTTPStatus

import dashscope
from dashscope.audio.asr import Recognition

dashscope.api_key = '<your-dashscope-api-key>'

# Download the sample audio to a local file; this step can be skipped when a
# local file is already available.
r = requests.get(
    'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav'
)
# Fail here on an HTTP error instead of saving an error page as audio
r.raise_for_status()
with open('asr_example.wav', 'wb') as f:
    f.write(r.content)

# callback=None: the file is recognized through the synchronous call() below
recognition = Recognition(model='paraformer-realtime-v2',
                          format='wav',
                          sample_rate=16000,
                          callback=None)
result = recognition.call('asr_example.wav')


print(
    '[Metric] requestId: {}, first package delay ms: {}, last package delay ms: {}'
    .format(
        recognition.get_last_request_id(),
        recognition.get_first_package_delay(),
        recognition.get_last_package_delay(),
    ))

if result.status_code == HTTPStatus.OK:
    # Explicit UTF-8: the recognized text contains Chinese characters, which
    # the platform default encoding may be unable to write.
    with open('asr_result.txt', 'w+', encoding='utf-8') as f:
        for sentence in result.get_sentence():
            print(sentence['text'])
            f.write(str(sentence) + '\n')
    print('Recognition done!')
else:
    print('Error: ', result.message)
package com.alibaba.dashscope.sample.recognition.quickstart;

import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

/** Downloads a sample WAV file, recognizes it with the synchronous API and saves the result. */
public class Main {

  public static void main(String[] args) {
    // The download step can be skipped: a local file can be passed to the API directly
    String exampleWavUrl =
        "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav";
    // try-with-resources closes the download stream even when the copy fails
    try (InputStream in = new URL(exampleWavUrl).openStream()) {
      Files.copy(in, Paths.get("asr_example.wav"), StandardCopyOption.REPLACE_EXISTING);
    } catch (IOException e) {
      System.out.println("error: " + e);
      System.exit(1);
    }

    // Create the Recognition instance
    Recognition recognizer = new Recognition();
    // Create the RecognitionParam; replace the placeholder with a real apiKey
    RecognitionParam param =
        RecognitionParam.builder()
            .model("paraformer-realtime-v2")
            .format("wav")
            .sampleRate(16000)
            .apiKey("your-dashscope-api-key")
            .build();
    int exitCode = 0;
    // Save the result directly to asr_result.txt
    try (FileOutputStream fos = new FileOutputStream("asr_result.txt")) {
      String result = recognizer.call(param, new File("asr_example.wav"));
      System.out.println(result);
      System.out.println(
          "[Metric] requestId: "
              + recognizer.getLastRequestId()
              + ", first package delay ms: "
              + recognizer.getFirstPackageDelay()
              + ", last package delay ms: "
              + recognizer.getLastPackageDelay());
      // Explicit UTF-8 so the Chinese text does not depend on the platform default charset
      fos.write(result.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (Exception e) {
      e.printStackTrace();
      // Report the failure through the exit status
      exitCode = 1;
    }
    System.exit(exitCode);
  }
}

调用成功后,实时识别的返回结果示例如下

{
	"begin_time": 280,
	"end_time": 4000,
	"text": "hello word, 这里是阿里巴巴语音实验室。",
	"words": [{
		"begin_time": 280,
		"end_time": 776,
		"text": "hello ",
		"punctuation": ""
	}, {
		"begin_time": 776,
		"end_time": 1024,
		"text": "word",
		"punctuation": ", "
	}, {
		"begin_time": 1024,
		"end_time": 1520,
		"text": "这里",
		"punctuation": ""
	}, {
		"begin_time": 1520,
		"end_time": 1768,
		"text": "是",
		"punctuation": ""
	}, {
		"begin_time": 1768,
		"end_time": 2760,
		"text": "阿里巴巴",
		"punctuation": ""
	}, {
		"begin_time": 2760,
		"end_time": 3256,
		"text": "语音",
		"punctuation": ""
	}, {
		"begin_time": 3256,
		"end_time": 4000,
		"text": "实验室",
		"punctuation": "。"
	}]
}

使用同步接口进行多语种文件转写

以下示例展示使用语音识别同步API接口进行日语文件转写,对于对话聊天、控制口令、语音输入法、语音搜索等较短的准实时语音识别场景可考虑采用该接口进行语音识别。

# For prerequisites running the following sample, visit https://help.aliyun.com/document_detail/611472.html

import requests
from http import HTTPStatus

import dashscope
from dashscope.audio.asr import Recognition

dashscope.api_key = '<your-dashscope-api-key>'

# Download the Japanese sample audio to a local file; this step can be skipped
# when a local file is already available.
r = requests.get(
    'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/welcome_female_16k_mono_japanese.wav'
)
# Fail here on an HTTP error instead of saving an error page as audio
r.raise_for_status()
with open('asr_japanese_example.wav', 'wb') as f:
    f.write(r.content)

# callback=None: the file is recognized through the synchronous call() below
recognition = Recognition(model='paraformer-realtime-v2',
                          format='wav',
                          sample_rate=16000,
                          language_hints=['ja'],  # "language_hints" is only supported by the paraformer-v2 and paraformer-realtime-v2 models
                          callback=None)
result = recognition.call('asr_japanese_example.wav')

print(
    '[Metric] requestId: {}, first package delay ms: {}, last package delay ms: {}'
    .format(
        recognition.get_last_request_id(),
        recognition.get_first_package_delay(),
        recognition.get_last_package_delay(),
    ))

if result.status_code == HTTPStatus.OK:
    # Explicit UTF-8: the recognized text is Japanese, which the platform
    # default encoding may be unable to write.
    with open('asr_japanese_result.txt', 'w+', encoding='utf-8') as f:
        for sentence in result.get_sentence():
            f.write(str(sentence) + '\n')
    print('Recognition done!')
else:
    print('Error: ', result.message)
package com.alibaba.dashscope.sample.recognition.quickstart;

import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

/** Downloads a Japanese sample WAV file, recognizes it synchronously and saves the result. */
public class Main {

  public static void main(String[] args) {
    // The download step can be skipped: a local file can be passed to the API directly
    String exampleWavUrl =
        "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/welcome_female_16k_mono_japanese.wav";
    // try-with-resources closes the download stream even when the copy fails
    try (InputStream in = new URL(exampleWavUrl).openStream()) {
      Files.copy(in, Paths.get("asr_japanese_example.wav"), StandardCopyOption.REPLACE_EXISTING);
    } catch (IOException e) {
      System.out.println("error: " + e);
      System.exit(1);
    }

    // Create the Recognition instance
    Recognition recognizer = new Recognition();
    // Create the RecognitionParam; replace the placeholder with a real apiKey
    RecognitionParam param =
        RecognitionParam.builder()
            .model("paraformer-realtime-v2")
            .format("wav")
            .sampleRate(16000)
            .apiKey("your-dashscope-api-key")
            // "language_hints" is only supported by paraformer-v2 and paraformer-realtime-v2
            .parameter("language_hints", new String[]{"ja"})
            .build();
    int exitCode = 0;
    // Save the result directly to asr_japanese_result.txt
    try (FileOutputStream fos = new FileOutputStream("asr_japanese_result.txt")) {
      String result = recognizer.call(param, new File("asr_japanese_example.wav"));
      System.out.println(
          "[Metric] requestId: "
              + recognizer.getLastRequestId()
              + ", first package delay ms: "
              + recognizer.getFirstPackageDelay()
              + ", last package delay ms: "
              + recognizer.getLastPackageDelay());
      System.out.println(result);
      // Explicit UTF-8 so the Japanese text does not depend on the platform default charset
      fos.write(result.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (Exception e) {
      e.printStackTrace();
      // Report the failure through the exit status
      exitCode = 1;
    }
    System.exit(exitCode);
  }
}

调用成功后,实时识别的返回结果示例如下

{
    "begin_time": 220,
    "end_time": 4280,
    "text": "アリババクラウドボイスサービスへようこそ。",
    "words": [
        {
            "begin_time": 220,
            "end_time": 626,
            "text": "アリ",
            "punctuation": ""
        },
        {
            "begin_time": 626,
            "end_time": 1032,
            "text": "ババ",
            "punctuation": ""
        },
        {
            "begin_time": 1032,
            "end_time": 1438,
            "text": "クラ",
            "punctuation": ""
        },
        {
            "begin_time": 1438,
            "end_time": 1844,
            "text": "ウド",
            "punctuation": ""
        },
        {
            "begin_time": 1844,
            "end_time": 2250,
            "text": "ボイ",
            "punctuation": ""
        },
        {
            "begin_time": 2250,
            "end_time": 2656,
            "text": "スサ",
            "punctuation": ""
        },
        {
            "begin_time": 2656,
            "end_time": 3062,
            "text": "ービ",
            "punctuation": ""
        },
        {
            "begin_time": 3062,
            "end_time": 3468,
            "text": "スへ",
            "punctuation": ""
        },
        {
            "begin_time": 3468,
            "end_time": 3874,
            "text": "よう",
            "punctuation": ""
        },
        {
            "begin_time": 3874,
            "end_time": 4280,
            "text": "こそ",
            "punctuation": "。"
        }
    ]
}

录音文件识别示例代码

以下示例展示了调用Paraformer语音识别文件转写异步API,对多个通过URL给出的音频文件进行语音识别批处理的代码。

录音文件识别目前不支持识别本地文件。

说明

需要使用您的API-KEY替换示例中的 your-dashscope-api-key ,代码才能正常运行。

# For prerequisites running the following sample, visit https://help.aliyun.com/document_detail/611472.html

import json
from urllib import request
from http import HTTPStatus

import dashscope

dashscope.api_key = 'your-dashscope-api-key'

# Submit one asynchronous transcription task covering both audio URLs
task_response = dashscope.audio.asr.Transcription.async_call(
    model='paraformer-v2',
    file_urls=[
        'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav',
        'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav'
    ],
    language_hints=['zh', 'en'])  # "language_hints" is only supported by the paraformer-v2 and paraformer-realtime-v2 models

# Stop early when the submission itself failed, before reading its output
if task_response.status_code != HTTPStatus.OK:
    raise SystemExit('Error: {}'.format(task_response.message))

print('task_id: ', task_response.output.task_id)

# Block until the task has finished
transcription_response = dashscope.audio.asr.Transcription.wait(
    task=task_response.output.task_id)

if transcription_response.status_code == HTTPStatus.OK:
    for transcription in transcription_response.output['results']:
        if transcription['subtask_status'] == 'SUCCEEDED':
            url = transcription['transcription_url']
            # The context manager closes the HTTP response once the body is read
            with request.urlopen(url) as response:
                result = json.loads(response.read().decode('utf8'))
            print(json.dumps(result, indent=4, ensure_ascii=False))
        else:
            print('transcription failed!')
            print(transcription)
    print('transcription done!')
else:
    # Read the message from the response itself, as the other samples do
    # NOTE(review): assumes output is unset on a failed request — confirm
    print('Error: ', transcription_response.message)
package com.alibaba.dashscope.sample.transcription;

import com.alibaba.dashscope.audio.asr.transcription.*;
import com.google.gson.*;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.*;
import java.net.HttpURLConnection;
import java.util.Arrays;
import java.util.List;

/** Submits an asynchronous transcription task for two audio URLs and prints each result. */
public class Main {
    public static void main(String[] args) {
        // Build the transcription request; replace your-dashscope-api-key with a real API key
        TranscriptionParam param =
                TranscriptionParam.builder()
                        .apiKey("your-dashscope-api-key")
                        .model("paraformer-v2")
                        // "language_hints" is only supported by paraformer-v2 and paraformer-realtime-v2
                        .parameter("language_hints", new String[]{"zh", "en"})
                        .fileUrls(
                            Arrays.asList(
                                "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav",
                                "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav"))
                        .build();
        try {
            Transcription transcription = new Transcription();
            // Submit the transcription request
            TranscriptionResult result = transcription.asyncCall(param);
            System.out.println("RequestId: " + result.getRequestId());
            // Wait for the transcription to finish
            result = transcription.wait(
                            TranscriptionQueryParam.FromTranscriptionParam(param, result.getTaskId()));
            // Fetch the transcription results
            List<TranscriptionTaskResult> taskResultList = result.getResults();
            if (taskResultList != null && !taskResultList.isEmpty()) {
                // One pretty-printing Gson instance serves every result
                Gson gson = new GsonBuilder().setPrettyPrinting().create();
                for (TranscriptionTaskResult taskResult : taskResultList) {
                    String transcriptionUrl = taskResult.getTranscriptionUrl();
                    if (transcriptionUrl == null) {
                        // NOTE(review): presumably a subtask that did not succeed; skip it so
                        // the remaining results are still printed — confirm against the SDK
                        System.out.println("transcription failed!");
                        continue;
                    }
                    HttpURLConnection connection =
                            (HttpURLConnection) new URL(transcriptionUrl).openConnection();
                    connection.setRequestMethod("GET");
                    connection.connect();
                    // Decode as UTF-8 rather than the platform default charset, and close the
                    // reader and the connection once the document is printed
                    try (BufferedReader reader =
                            new BufferedReader(
                                    new InputStreamReader(
                                            connection.getInputStream(),
                                            java.nio.charset.StandardCharsets.UTF_8))) {
                        System.out.println(gson.toJson(gson.fromJson(reader, JsonObject.class)));
                    } finally {
                        connection.disconnect();
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("error: " + e);
        }
        System.exit(0);
    }
}
说明
  • 通过URL指定进行语音转写的文件,其大小不超过2GB。

  • file_urls 参数支持传入多个文件URL,示例中展示了对多个文件URL进行转写的功能。

调用成功后,将会返回例如以下示例的文件转写结果。

{
    "file_url": "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_male2.wav",
    "properties": {
        "audio_format": "pcm_s16le",
        "channels": [
            0
        ],
        "original_sampling_rate": 16000,
        "original_duration_in_milliseconds": 4726
    },
    "transcripts": [
        {
            "channel_id": 0,
            "content_duration_in_milliseconds": 4570,
            "text": "Hello world, 这里是阿里巴巴语音实验室。",
            "sentences": [
                {
                    "begin_time": 140,
                    "end_time": 4710,
                    "text": "Hello world, 这里是阿里巴巴语音实验室。",
                    "words": [
                        {
                            "begin_time": 140,
                            "end_time": 597,
                            "text": "Hello ",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 597,
                            "end_time": 1054,
                            "text": "world",
                            "punctuation": ", "
                        },
                        {
                            "begin_time": 1054,
                            "end_time": 1663,
                            "text": "这里",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 1663,
                            "end_time": 2272,
                            "text": "是阿",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 2272,
                            "end_time": 2881,
                            "text": "里巴",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 2881,
                            "end_time": 3490,
                            "text": "巴语",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 3490,
                            "end_time": 4099,
                            "text": "音实",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 4099,
                            "end_time": 4710,
                            "text": "验室",
                            "punctuation": "。"
                        }
                    ]
                }
            ]
        }
    ]
}
{
    "file_url": "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav",
    "properties": {
        "audio_format": "pcm_s16le",
        "channels": [
            0
        ],
        "original_sampling_rate": 16000,
        "original_duration_in_milliseconds": 3834
    },
    "transcripts": [
        {
            "channel_id": 0,
            "content_duration_in_milliseconds": 3530,
            "text": "Hello world, 这里是阿里巴巴语音实验室。",
            "sentences": [
                {
                    "begin_time": 280,
                    "end_time": 3810,
                    "text": "Hello world, 这里是阿里巴巴语音实验室。",
                    "words": [
                        {
                            "begin_time": 280,
                            "end_time": 633,
                            "text": "Hello ",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 633,
                            "end_time": 986,
                            "text": "world",
                            "punctuation": ", "
                        },
                        {
                            "begin_time": 986,
                            "end_time": 1456,
                            "text": "这里",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 1456,
                            "end_time": 1926,
                            "text": "是阿",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 1926,
                            "end_time": 2396,
                            "text": "里巴",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 2396,
                            "end_time": 2866,
                            "text": "巴语",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 2866,
                            "end_time": 3336,
                            "text": "音实",
                            "punctuation": ""
                        },
                        {
                            "begin_time": 3336,
                            "end_time": 3810,
                            "text": "验室",
                            "punctuation": "。"
                        }
                    ]
                }
            ]
        }
    ]
}

了解更多

有关Paraformer语音识别模型服务的实时语音识别API以及录音文件转写的详细调用方法,可前往“实时语音识别API详情”和“录音文件识别API详情”页面进行了解。