Qwen-Omni

更新时间:
复制 MD 格式

The Qwen-Omni model accepts multimodal input and generates text or speech responses. It produces human-like voices and supports speech output in multiple languages and dialects. Use cases include content moderation, text creation, visual recognition, and audio-video interaction.

Supported regions: Beijing, Singapore. Use the API key for your region.

Getting started

Prerequisites

This example sends a text prompt to the Qwen-Omni API and returns a streaming response with both text and audio.

# Before you run this code:
# Install dependencies using these commands:
# pip install numpy soundfile openai

import os
import base64
import soundfile as sf
import numpy as np
from openai import OpenAI

# 1. Initialize the client
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),  # Confirm that the environment variable is set
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

try:
    # 2. Send the request
    completion = client.chat.completions.create(
        model="qwen3.5-omni-plus",
        messages=[{"role": "user", "content": "Who are you?"}],
        modalities=["text", "audio"],  # Specify text and audio output
        audio={"voice": "Tina", "format": "wav"},
        stream=True,  # Must be set to True
        stream_options={"include_usage": True},
    )

    # 3. Process the streaming response and collect the audio
    print("Model response:")
    audio_parts = []  # Base64 fragments, joined once after the stream ends
    for chunk in completion:
        # The usage-only chunk has an empty choices list; nothing to print or collect.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta

        # Text part: flush so each fragment is shown as soon as it arrives
        if delta.content:
            print(delta.content, end="", flush=True)

        # Audio part: the attribute may be absent on text-only chunks
        audio = getattr(delta, "audio", None)
        if audio:
            # `or ""` also covers a "data" key that is present but None
            audio_parts.append(audio.get("data") or "")

    # 4. Save the audio file
    audio_base64_string = "".join(audio_parts)
    if audio_base64_string:
        wav_bytes = base64.b64decode(audio_base64_string)
        # Interpret the decoded bytes as 16-bit samples and write them at 24 kHz
        audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
        sf.write("audio_assistant.wav", audio_np, samplerate=24000)
        print("\nAudio file saved to: audio_assistant.wav")

except Exception as e:
    print(f"Request failed: {e}")
// Before you run this code:
// For Windows/Mac/Linux:
// 1. Ensure Node.js version >= 14 is installed.
// 2. Run the following command to install necessary dependencies:
//    npm install openai wav

import OpenAI from "openai";
import { createWriteStream } from 'node:fs';
import { Writer } from 'wav';

/**
 * Decodes a Base64 string and saves the resulting bytes as a WAV audio file.
 *
 * The decoded bytes are passed through a `wav` Writer configured for 24 kHz,
 * mono, 16-bit audio and piped into a file stream. Failures are logged to the
 * console and not rethrown.
 *
 * @param {string} audioString - Base64-encoded audio data collected from the stream.
 * @param {string} audioPath - Path of the WAV file to create.
 * @returns {Promise<void>} Settles after the file stream has finished (or the error was logged).
 */
async function convertAudio(audioString, audioPath) {
    // No audio was received: skip instead of creating a WAV file with no audio in it.
    if (!audioString) {
        return;
    }

    try {
        // Decode the Base64 string into a Buffer
        const pcmBuffer = Buffer.from(audioString, 'base64');

        const writer = new Writer({
            sampleRate: 24000,  // Sample rate
            channels: 1,        // Mono
            bitDepth: 16        // 16-bit depth
        });
        const outputStream = createWriteStream(audioPath);

        // Register the listeners before any data flows so that no event can be missed.
        // pipe() does not forward errors, so the Writer is observed separately.
        const fileWritten = new Promise((resolve, reject) => {
            outputStream.on('finish', resolve);
            outputStream.on('error', reject);
            writer.on('error', reject);
        });

        // Write the decoded data through the WAV writer into the file, then close it
        writer.pipe(outputStream);
        writer.end(pcmBuffer);

        // 'finish' fires once all data has been flushed, so no extra delay is needed
        await fileWritten;

        console.log(`\nAudio file saved to: ${audioPath}`);
    } catch (error) {
        console.error('Error during processing:', error);
    }
}

// 1. Initialize the client
const openai = new OpenAI(
    {
        // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx",
        apiKey: process.env.DASHSCOPE_API_KEY,
        baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
    }
);

// 2. Send the request
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",
    messages: [
        {
            "role": "user",
            "content": "Who are you?"
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

let audioString = "";
console.log("Model response:");

// 3. Process the streaming response and collect the Base64 audio fragments
for await (const chunk of completion) {
    // The usage-only chunk carries an empty `choices` array; skip it.
    if (!Array.isArray(chunk.choices) || chunk.choices.length === 0) {
        continue;
    }
    const { delta } = chunk.choices[0];

    // Text content is printed as it arrives
    if (delta.content) {
        process.stdout.write(delta.content);
    }
    // Audio content is accumulated and decoded after the stream ends
    if (delta.audio?.data) {
        audioString += delta.audio.data;
    }
}

// 4. Save the audio file. Awaited so the promise is not left floating.
await convertAudio(audioString, "audio_assistant.wav");
curl --request POST 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions' \
  --header "Authorization: Bearer $DASHSCOPE_API_KEY" \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user", 
            "content": "Who are you?"
        }
    ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Response

After you run the Python or Node.js code, the text response appears in the console and an audio file named audio_assistant.wav is saved in the current working directory (the directory you run the code from).

Model response:
I am a large language model developed by Alibaba Cloud. My name is Qwen. How can I help you?

Running the curl command returns the streamed chunks directly: text arrives in the content field of each delta, and Base64-encoded audio data arrives in the audio field.

data: {"choices":[{"delta":{"content":"I"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
data: {"choices":[{"delta":{"content":"am"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
......
data: {"choices":[{"delta":{"audio":{"data":"/v8AAAAAAAAAAAAAAA...","expires_at":1757647879,"id":"audio_a68eca3b-c67e-4666-a72f-73c0b4919860"}},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757647879,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-a68eca3b-c67e-4666-a72f-73c0b4919860"}
data: {"choices":[{"finish_reason":"stop","delta":{"content":""},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1764763585,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-e8c82e9e-073e-4289-a786-a20eb444ac9c"}
data: {"choices":[],"object":"chat.completion.chunk","usage":{"prompt_tokens":207,"completion_tokens":103,"total_tokens":310,"completion_tokens_details":{"audio_tokens":83,"text_tokens":20},"prompt_tokens_details":{"text_tokens":207}},"created":1757940330,"system_fingerprint":null,"model":"qwen3.5-omni-plus","id":"chatcmpl-9cdd5a26-f9e9-4eff-9dcc-93a878165afc"}

Model selection

  • Qwen3.5-Omni series: Best for long video analysis, meeting summaries, caption generation, content moderation, and audio-video interaction.

    • Input limits: Up to 3 hours of audio or 1 hour of video

    • Audio control: Supports adjusting volume, speaking rate, and emotion through instructions

    • Visual capability: Matches the level of Qwen3.5. Understands images, speech, sound effects, and other multimodal input

    • Combined multimodal input: Supports any combination of text with images, audio, and video in a single request

    • Voice cloning: Supports custom voices (only qwen3.5-omni-plus and qwen3.5-omni-flash; snapshot versions are not supported). For details, see Voice cloning

  • Qwen3-Omni-Flash series: Best for short video analysis and cost-sensitive scenarios.

    • Input limits: Audio and video input up to 150 seconds

    • Thinking mode: The only Qwen-Omni series model that supports thinking mode

    • Input modality: Supports only a combination of text with a single other modality (image, audio, or video).

  • Qwen-Omni-Turbo series

    This series is no longer updated and has limited features. We recommend migrating to the Qwen3.5-Omni or Qwen3-Omni-Flash series.

Series

Audio-video description

Deep thinking

Input audio languages

Output audio languages

Supported voices

Qwen3.5-Omni

Latest-generation omni-modal model

Strong

Not supported

113

74 languages and 39 dialects

Languages: Chinese, English, German, French, Italian, Czech, Indonesian, Thai, Korean, Polish, Japanese, Vietnamese, Finnish, Portuguese, Spanish, Dutch, Russian, Malay, Catalan, Swedish, Turkish, Ukrainian, Romanian, Slovak, Danish, Icelandic, Norwegian (Bokmål), Macedonian, Greek, Hungarian, Galician, Filipino, Croatian, Bosnian, Slovenian, Bulgarian, Kazakh, Belarusian, Latvian, Estonian, Azerbaijani, Uyghur, Swahili, Hindi, Esperanto, Kyrgyz, Tajik, Cebuano, Afrikaans, Arabic, Lithuanian, Javanese, Bengali, Persian, Hebrew, Punjabi, Gujarati, Mongolian, Asturian, Kannada, Marathi, Interlingua, Malayalam, Maltese, Norwegian Nynorsk, Telugu, Urdu, Georgian, Basque, Tamil, Odia, Serbian, Maori

Dialects:
Northeastern Mandarin, Guizhou dialect, Cantonese, Henan dialect, Hong Kong Cantonese, Shanghainese, Shaanxi dialect, Tianjin dialect, Taiwanese Hokkien, Yunnan dialect, Anhui dialect, Fujian dialect, Gansu dialect, Guangdong dialect, Hubei dialect, Hunan dialect, Jiangxi dialect, Shandong dialect, Shanxi dialect, Sichuan dialect, Guangxi dialect, Hainan dialect, Chongqing dialect, Changsha dialect, Hangzhou dialect, Hefei dialect, Yinchuan dialect, Zhengzhou dialect, Shenyang dialect, Wenzhou dialect, Wuhan dialect, Kunming dialect, Taiyuan dialect, Nanchang dialect, Jinan dialect, Lanzhou dialect, Nanjing dialect, Hakka, Southern Min































































































































































































































































































































































































































































































































36

29 languages and 7 dialects

Languages: Chinese, English, German, Italian, Portuguese, Spanish, Japanese, Korean, French, Russian, Thai, Indonesian, Arabic, Vietnamese, Turkish, Finnish, Polish, Hindi, Dutch, Czech, Urdu, Tagalog, Swedish, Danish, Hebrew, Icelandic, Malay, Norwegian, Persian

Dialects: Sichuan dialect, Beijing dialect, Tianjin dialect, Nanjing dialect, Shaanxi dialect, Cantonese, Southern Min

55

Qwen3-Omni-Flash

Hybrid thinking model

Weaker

Supported

19

11 languages and 8 dialects

Languages:

Chinese, English, German, French, Italian, Thai, Korean, Japanese, Russian, Spanish, Portuguese

Dialects:

Sichuan dialect, Shanghainese, Cantonese, Southern Min, Shaanxi dialect, Nanjing dialect, Tianjin dialect, Beijing dialect

19

11 languages and 8 dialects

Languages:

Chinese, English, German, French, Italian, Thai, Korean, Japanese, Russian, Spanish, Portuguese

Dialects:

Sichuan dialect, Shanghainese, Cantonese, Southern Min, Shaanxi dialect, Nanjing dialect, Tianjin dialect, Beijing dialect

17 to 49

Varies by version

Qwen-Omni-Turbo

No longer updated

None

Not supported

Chinese, English

Chinese, English

4

For model names, context windows, pricing, and snapshot versions, see the Model Studio console. For rate limits, see Rate limiting.

Model performance

Audio and video content analysis

Generate a comprehensive timestamped description of this video.

00:00.000 – 00:02.500

A rain-soaked city street fills the widescreen frame. Long-exposure photography creates streaks of red and blue car lights on wet pavement. A solitary man in a dark knee-length trench coat and light shirt with a tie walks toward the camera along the right sidewalk. Raindrops cling to his shoulders and hair. Each step makes a muffled plop on damp concrete. A low, ominous electronic hum underscores ambient sounds, while steady rain crackles all around. Colorful graffiti covers the brick wall beside him, and neon signs glow in the distance—one clearly displays orange-red cursive script reading "Nice," while another vertical sign shows the letters "AT" at its bottom.

......

00:12.300 – 00:14.533

The camera tilts down. Inside a flashlight beam, a yellowed, aged single sheet lies in a shallow puddle, its edges charred and corners curled. Inked paragraphs cover most of its surface, but moisture and scorch marks make the text unreadable. The man kneels, extending gloved fingers toward the document. Ripples spread outward as his hand nears. Sirens peak then begin to fade slightly, while the ambient hum remains steady. The scene freezes just before the trembling paper is touched, ending the clip.

Only partial timestamps are shown.

Multidialect understanding

Describe the vocal characteristics, emotional state, and full lyrics for singers in different Chinese dialects.

This audio features rap performances by singers from various regions of China, showcasing rich dialect diversity. The following are detailed descriptions of each singer's vocal traits, emotional state, and corresponding lyrics:

Hangzhou dialect (Wu)

Vocal traits:

The male singer's voice is soft and gentle, with the distinctive smoothness and warmth of the Wu dialect. His intonation rises and falls gently, and his rhythm is light and quick, creating a relaxed storytelling effect.

Emotional state:

Relaxed, comfortable, and full of everyday life. He describes Hangzhou's slow-paced lifestyle and urban changes, expressing affection for his hometown and reflections on modern development. Overall, he feels cozy and familiar.

Lyrics:

> Hangzhou dialect, listen up! Our culture includes local erhua pronunciation.

> Like flowers blooming by West Lake, no cultural survey needed—we understand it best.

> None of you understand. 'San bu da men', 'ge'r', 'lao'r', 'fen'r', 'ya'r'... how is an outsider supposed to tell any of these apart?

> Let's go! Transport is fast—subways everywhere. We'll check it out.

> Welcoming Southeast Asia and Thailand, watching "Thirteen Ta" shows. Proud of our city, direct in personality.

> So you only came for the "bridge-crossing" experience, and because the flavor is different, you're not returning?

Only partial results are shown.

Lyric caption generation

Transcribe the song lyrics and provide timestamps for each line in this format:[00:00:15,020 --> 00:00:28,085] : When you walk through a storm, hold your head up high.[00:00:28,085 --> 00:00:40,200] And don't be afraid of the dark. ......

[00:00:12,680 --> 00:00:16,960] Cat thread sways past moonlight on trees.

[00:00:18,400 --> 00:00:22,800] Radiators hum 1998 chart hits.

[00:00:24,160 --> 00:00:28,080] Time parts the mist-like heat waves.

[00:00:28,920 --> 00:00:33,000] Neon from the screen shines on my nose bridge.

......

[00:03:16,720 --> 00:03:21,680] We nestle in the softest ring of the tree trunk.

[00:03:22,400 --> 00:03:27,000] Breathing turns residual warmth into honey-sugar.

[00:03:28,160 --> 00:03:33,200] The sofa sinks into cloud-fluff shape.

[00:03:34,000 --> 00:03:38,800] Every pore soaks in sunshine.

[00:04:09,000 --> 00:04:10,020] (End)

Only partial results are shown.

Audio-video programming

Usage

Streaming output

All requests to Qwen-Omni must set stream=True.

Model configuration

Configure parameters, prompts, and media lengths to balance cost, speed, and quality.

Audio-video understanding

Use case

Recommended video length

Recommended prompt

Recommended max_pixels

Fast review, low cost

≤60 minutes

Simple prompt within 50 words

230,400

Content extraction (long video segmentation)

≤60 minutes

921,600~2,073,600

Standard analysis (short video tagging)

≤4 minutes

Use the structured prompt below

Recommended prompt

Provide a detailed description of the video.
It should explicitly include three sections: 
1. A structured chronological storyline of **every noticeable audio and visual detail**
2. A structured list of all visible text. For each text element, include start timestamp, end timestamp, the exact text content, and the appearance characteristics. If no text appears, explicitly state so.
3. A structured speech-to-text transcription, include speaker (corresponding to the character or voice‑over in Section 1, including their accent and tone), exact spoken content, start timestamp, end timestamp, and speaking state (prosody, emotion, and style). If no speech appears, explicitly state so.
Aside from these three required sections, you are free to organize any additional content in any way you find helpful. This additional content can include global information about the entire video or localized information about specific moments. You may choose the topic of this extra content freely.
Output Format:
```
## Storyline
<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.>
<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.>
<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio and video details.>
...
## Visible Text
<xx:xx.xxx> - <xx:xx.xxx>
“<element>”: <appearance>
“<element>”: <appearance>
<xx:xx.xxx> - <xx:xx.xxx>
“<element>”: <appearance>
“<element>”: <appearance>
“<element>”: <appearance>
<xx:xx.xxx> - <xx:xx.xxx>
“<element>”: <appearance>
...
## Speakers and Transcript
Speaker profiles:
<speaker> - <profile>
<speaker> - <profile>
<speaker> - <profile>
...
<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”
<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”
<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”
...
## <another section>
<paragraphs>
## <another section>
<paragraphs>
...
```

921,600~2,073,600

Fine-grained analysis (multiple speakers/complex scenes)

≤2 minutes

2,073,600

Note

You can segment long videos first to obtain fine-grained descriptions.

Audio understanding

You can balance cost and quality by controlling the audio length and prompt complexity.

Use case

Recommended audio length

Recommended prompt

Fast review, low cost

≤60 minutes

Simple prompt within 50 words

Content extraction (segment long audio)

≤60 minutes

Standard analysis (audio tagging)

≤2 minutes

Use a structured prompt

Structured prompt

Provide a detailed description of the audio.

It should explicitly include two sections: 

1. A structured chronological storyline of **every noticeable audio detail**
2. A structured speech-to-text transcription, include speaker (Corresponding to the character or voice‑over in Section 1, including their accent and tone), exact spoken content, start timestamp, end timestamp, and speaking state (prosody, emotion, and style). If no speech appears, explicitly state so.

Aside from these two required components, you are free to organize any additional content in any way you find helpful. This additional content can include global information about the entire audio or localized information about specific moments. You may choose the topic of this extra content freely.

Output Format:

```
## Storyline

<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.>

<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.>

<xx:xx.xxx> - <xx:xx.xxx>
<an unstructured long paragraph in natural language describing what happened during this period, blending both audio details.>

...

...

## Speakers and Transcript

Speaker profiles:
<speaker> - <profile>
<speaker> - <profile>
<speaker> - <profile>
...

<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”

<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”

<xx:xx.xxx> - <xx:xx.xxx>
Speaker: <speaker>
State: <description>
Content: “<content>”

...

## <another section>

<paragraphs>

## <another section>

<paragraphs>

...
```

Fine-grained analysis (multiple speakers/complex scenes)

≤1 minute

Note

You can segment long audio first to obtain fine-grained descriptions.

Combined multimodal input

Note

Combined multimodal input is supported only by the Qwen3.5-Omni series. A single request can combine text with several other modalities, for example image + audio + text, or video + image + text.

The following example shows how to provide an image and audio in a single request for multimodal analysis.

OpenAI compatible

import os
from openai import OpenAI

# Client for the OpenAI-compatible endpoint; the API key is read from the environment.
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# The three parts of the user turn: an image, an audio clip, and the instruction.
image_part = {
    "type": "image_url",
    "image_url": {
        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
    },
}
audio_part = {
    "type": "input_audio",
    "input_audio": {
        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
        "format": "wav",
    },
}
text_part = {
    "type": "text",
    "text": "Describe the image content and tell me what the audio is about.",
}

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus",
    messages=[{"role": "user", "content": [image_part, audio_part, text_part]}],
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    stream=True,
    stream_options={"include_usage": True},
)

# Print every delta; a chunk without choices is printed as its usage statistics.
for chunk in completion:
    print(chunk.choices[0].delta if chunk.choices else chunk.usage)
import OpenAI from "openai";

// Client for the OpenAI-compatible endpoint; the API key is read from the environment.
const openai = new OpenAI({
    apiKey: process.env.DASHSCOPE_API_KEY,
    baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
});

// The three parts of the user turn: an image, an audio clip, and the instruction.
const imagePart = {
    type: "image_url",
    image_url: { url: "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg" }
};
const audioPart = {
    type: "input_audio",
    input_audio: {
        data: "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
        format: "wav"
    }
};
const textPart = {
    type: "text",
    text: "Describe the image content and tell me what the audio is about."
};

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",
    messages: [{ role: "user", content: [imagePart, audioPart, textPart] }],
    stream: true,
    stream_options: { include_usage: true },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

// Print every delta; a chunk without choices is printed as its usage statistics.
for await (const chunk of completion) {
    const { choices, usage } = chunk;
    const hasChoices = Array.isArray(choices) && choices.length > 0;
    console.log(hasChoices ? choices[0].delta : usage);
}
curl --request POST 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions' \
  --header "Authorization: Bearer $DASHSCOPE_API_KEY" \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
                    }
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
                        "format": "wav"
                    }
                },
                {
                    "type": "text",
                    "text": "Describe the image content and tell me what the audio is about."
                }
            ]
        }
    ],
    "stream": true,
    "stream_options": {
        "include_usage": true
    },
    "modalities": ["text", "audio"],
    "audio": {"voice": "Tina", "format": "wav"}
}'

Single modality input

Each request contains text and one other modality (video, audio, or image). All Qwen-Omni models support this.

Video and text input

Provide the video either as a list of images or as a video file. With a video file, the audio in the video is also supported.

Video file (supports audio in the video)

  • Number of files:

    • Qwen3.5-Omni series: Up to 512 files using public URLs and up to 250 files using Base64 encoding.

    • Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Only one file is allowed.

  • File size:

    • Using public URLs:

      • Qwen3.5-Omni series: Up to 2 GB

      • Qwen3-Omni-Flash: Up to 256 MB

      • Qwen-Omni-Turbo: Up to 150 MB

    • Using Base64 encoding: The encoded Base64 string must be smaller than 10 MB

  • Duration limits:

    • Qwen3.5-Omni series: 1 hour

    • Qwen3-Omni-Flash: 150 seconds

    • Qwen-Omni-Turbo: 40 seconds

  • File formats: MP4, AVI, MKV, MOV, FLV, and WMV.

  • Visual and audio information in the video file are billed separately.

OpenAI compatible

import os
from openai import OpenAI

# Create the client. If DASHSCOPE_API_KEY is not set in the environment, pass your
# Model Studio API key directly instead: api_key="sk-xxx"
# API keys for the Singapore and Beijing regions are different.
# Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # Base URL of the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# The user turn: a video file referenced by URL, followed by the question.
video_part = {
    "type": "video_url",
    "video_url": {
        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4"
    },
}
question_part = {"type": "text", "text": "What is the video about?"}

completion = client.chat.completions.create(
    # For Qwen3-Omni-Flash, run in non-thinking mode.
    model="qwen3.5-omni-plus",
    messages=[{"role": "user", "content": [video_part, question_part]}],
    # Output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # Streaming is mandatory; a request without stream=True fails.
    stream=True,
    stream_options={"include_usage": True},
)

# Print every delta; a chunk without choices is printed as its usage statistics.
for chunk in completion:
    print(chunk.choices[0].delta if chunk.choices else chunk.usage)
import OpenAI from "openai";

// Create the client. If no environment variable is set, pass your Model Studio
// API key directly instead: apiKey: "sk-xxx"
// API keys for the Singapore and Beijing regions are different.
// Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
const openai = new OpenAI({
    apiKey: process.env.DASHSCOPE_API_KEY,
    // Base URL of the Beijing region. URLs vary by region.
    baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
});

// The user turn: a video file referenced by URL, followed by the question.
const videoPart = {
    type: "video_url",
    video_url: { url: "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4" }
};
const questionPart = { type: "text", text: "What is the video about?" };

const completion = await openai.chat.completions.create({
    // For Qwen3-Omni-Flash, run in non-thinking mode.
    model: "qwen3.5-omni-plus",
    messages: [{ role: "user", content: [videoPart, questionPart] }],
    stream: true,
    stream_options: { include_usage: true },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

// Print every delta; a chunk without choices is printed as its usage statistics.
for await (const chunk of completion) {
    const { choices, usage } = chunk;
    const hasChoices = Array.isArray(choices) && choices.length > 0;
    console.log(hasChoices ? choices[0].delta : usage);
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The URL below targets the China (Beijing) region. For other regions, use that region's base URL followed by /chat/completions.
# === Delete this comment before execution ===

curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "video_url",
          "video_url": {
            "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241115/cqqkru/1.mp4"
          }
        },
        {
          "type": "text",
          "text": "What is the video about"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options": {
        "include_usage": true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Image list format

Number of images

  • Qwen3.5-Omni series: A minimum of 2 images and a maximum of 2048 images

  • Qwen3-Omni-Flash: A minimum of 2 images and a maximum of 128 images

  • Qwen-Omni-Turbo: A minimum of 4 images and a maximum of 80 images

OpenAI compatible

import os
from openai import OpenAI

# Create the OpenAI client. If DASHSCOPE_API_KEY is not set in the environment, pass
# your Model Studio API key directly instead: api_key="sk-xxx"
# API keys for the Singapore and Beijing regions are different.
# Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # Base URL of the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# The video is supplied as an ordered list of image URLs.
frame_urls = [
    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg",
]
video_part = {"type": "video", "video": frame_urls}
question_part = {"type": "text", "text": "Describe the process shown in this video"}

completion = client.chat.completions.create(
    # For Qwen3-Omni-Flash, run in non-thinking mode.
    model="qwen3.5-omni-plus",
    messages=[{"role": "user", "content": [video_part, question_part]}],
    # Output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # Streaming is mandatory; a request without stream=True fails.
    stream=True,
    stream_options={"include_usage": True},
)

# Print every delta; a chunk without choices is printed as its usage statistics.
for chunk in completion:
    print(chunk.choices[0].delta if chunk.choices else chunk.usage)
import OpenAI from "openai";

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});


const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [{
        role: "user",
        content: [
            {
                type: "video",
                video: [
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
                    "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg"
                ]
            },
            {
                type: "text",
                text: "Describe the process shown in this video"
            }
        ]
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following URL is for the China (Beijing) region. URLs vary by region.
# === Delete this comment before execution ===

curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": [
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg",
                        "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg"
                    ]
                },
                {
                    "type": "text",
                    "text": "Describe the process shown in this video"
                }
            ]
        }
    ],
    "stream": true,
    "stream_options": {
        "include_usage": true
    },
    "modalities": ["text", "audio"],
    "audio": {
        "voice": "Tina",
        "format": "wav"
    }
}'

Audio and text input

  • Number of files:

    • Qwen3.5-Omni series: Up to 2048 files using public URLs and up to 250 files using Base64 encoding.

    • Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Only one file is allowed.

  • File size:

    • Using public URLs:

      • Qwen3.5-Omni series: Up to 2 GB

      • Qwen3-Omni-Flash: Up to 100 MB

      • Qwen-Omni-Turbo: Up to 10 MB

    • Using Base64 encoding: The encoded Base64 string must be smaller than 10 MB

  • Duration limits:

    • Qwen3.5-Omni series: Up to 3 hours

    • Qwen3-Omni-Flash: Up to 20 minutes

    • Qwen-Omni-Turbo: Up to 3 minutes

  • File formats: AMR, WAV, 3GP, 3GPP, AAC, and MP3.

This example uses a public audio URL. To use a local file, see Send local files with Base64 encoding. Only streaming output is supported for the call.

OpenAI compatible

import os
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Send one user message that combines an audio clip (public URL) with a text question.
completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

# Print the incremental delta of each chunk. A chunk with no choices is
# printed via its usage field instead.
for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";

// Initialize the OpenAI client
const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey:"sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "input_audio",
                "input_audio": { "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav", "format": "wav" },
            },
            { "type": "text", "text": "What is this audio about" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following URL is for the China (Beijing) region. URLs vary by region.
# === Delete this comment before execution ===

curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "input_audio",
          "input_audio": {
            "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20250211/tixcef/cherry.wav",
            "format": "wav"
          }
        },
        {
          "type": "text",
          "text": "What is this audio about"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Image and text input

The Qwen-Omni models support multiple images per request. Image requirements:

  • Number of images:

    • Using a public URL: Up to 2048 images

    • Using Base64 encoding: Up to 250 images

  • Image size:

    • Using public URLs:

      • Qwen3.5-Omni series: Each image file must not exceed 20 MB

      • Qwen3-Omni-Flash and Qwen-Omni-Turbo series: Each image file must not exceed 10 MB

    • Using Base64 encoding: The encoded Base64 string must be smaller than 10 MB.

  • Both width and height must exceed 10 pixels. The aspect ratio must not exceed 200:1 or 1:200.

  • Supported image types: See Image and video understanding.

This example uses a public image URL. To use a local file, see Input Base64-encoded local file. Streaming output is required.

OpenAI compatible

import os
from openai import OpenAI

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
                    },
                },
                {"type": "text", "text": "What scene is depicted in the image?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={
        "include_usage": True
    }
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";

const openai = new OpenAI(
    {
        // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
        // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
        apiKey: process.env.DASHSCOPE_API_KEY,
        // The following base URL is for the China (Beijing) region. URLs vary by region.
        baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
    }
);
const completion = await openai.chat.completions.create({
    // For Qwen3-Omni-Flash, run in non-thinking mode.
    model: "qwen3.5-omni-plus", 
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "image_url",
                "image_url": { "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg" },
            },
            { "type": "text", "text": "What scene is depicted in the image?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following URL is for the China (Beijing) region. URLs vary by region.
# === Delete this comment before execution ===

curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "image_url",
          "image_url": {
            "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
          }
        },
        {
          "type": "text",
          "text": "What scene is depicted in the image?"
        }
      ]
    }
  ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text","audio"],
    "audio":{"voice":"Tina","format":"wav"}
}'

Web search

The Qwen3.5-Omni series supports web search to retrieve real-time information and perform reasoning.

  • Web search is supported only in the Qwen3.5-Omni series. The search_strategy parameter only accepts agent.

  • For billing, see the agent policy in Billing.

To enable web search, set enable_search to true and search_strategy to agent:

OpenAI compatible

# Prerequisites:
# pip install openai

import os
from openai import OpenAI

# Initialize the client
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Send request (with web search enabled)
try:
    completion = client.chat.completions.create(
        model="qwen3.5-omni-plus",
        messages=[{
            "role": "user", 
            "content": "What is today's date and day of the week, and what important holidays are there today?"
        }],
        stream=True,
        stream_options={"include_usage": True},
        # Enable web search
        extra_body={
            "enable_search": True   
        }
    )
    
    print("Model response (with real-time information):")
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")
    print()
    
except Exception as e:
    print(f"Request failed:{e}")
// Prerequisites:
// npm install openai

import OpenAI from "openai";

// Initialize the client
const openai = new OpenAI({
    apiKey: process.env.DASHSCOPE_API_KEY,
    baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
});

// Send request (with web search enabled)
const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",
    messages: [{
        "role": "user",
        "content": "What is today's date and day of the week, and what important holidays are there today?"
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    // Enable web search.
    // extra_body is an option of the Python SDK only; the Node.js SDK sends this
    // object as the request body, so non-standard parameters go at the top level
    // (the same place the curl request puts "enable_search").
    enable_search: true
});

console.log("Model response (with real-time information):");

// Stream the text fragments to stdout as they arrive; chunks without choices are skipped.
for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        if (chunk.choices[0].delta.content) {
            process.stdout.write(chunk.choices[0].delta.content);
        }
    }
}
// Terminate the streamed output with a newline
console.log();
curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3.5-omni-plus",
    "messages": [
        {
            "role": "user", 
            "content": "What is today's date and day of the week, and what important holidays are there today?"
        }
    ],
    "stream": true,
    "stream_options": {
        "include_usage": true
    },
    "enable_search": true
}'

Enable/disable thinking mode

In the Qwen-Omni series, only the Qwen3-Omni-Flash model is a hybrid thinking model. You can use the enable_thinking parameter to enable or disable the thinking mode:

  • true

  • false (default)

In thinking mode, audio output is not supported.

OpenAI compatible

import os
from openai import OpenAI

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3-omni-flash",
    messages=[{"role": "user", "content": "Who are you?"}],
    
    # Enable or disable thinking mode. Audio output is not supported in thinking mode. Qwen-Omni-Turbo does not support enable_thinking.
    extra_body={'enable_thinking': True},
    
    # Set the output modality. Two options are supported in non-thinking mode: ["text","audio"] and ["text"]. Only ["text"] is supported in thinking mode.
    modalities=["text"],
    
    # Set the voice. The audio parameter is not supported in thinking mode.
    # audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";

const openai = new OpenAI({
    // If no environment variable is set, replace the next line with your Model Studio API key: apiKey:"sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY,
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

const completion = await openai.chat.completions.create({
    model: "qwen3-omni-flash",
    messages: [
        { role: "user", content: "Who are you?" }
    ],
    // stream must be set to true, otherwise an error occurs.
    stream: true,
    stream_options: {
        include_usage: true
    },
    // Enable or disable thinking mode. Audio output is not supported in thinking mode. Qwen-Omni-Turbo does not support enable_thinking.
    // extra_body is an option of the Python SDK only; the Node.js SDK sends this
    // object as the request body, so enable_thinking goes at the top level
    // (the same place the curl request puts it).
    enable_thinking: true,
    // Set the output modality. Two options are supported in non-thinking mode: ["text","audio"] and ["text"]. Only ["text"] is supported in thinking mode.
    modalities: ["text"],
    // Set the voice. The audio parameter is not supported in thinking mode.
    // audio: { voice: "Tina", format: "wav" }
});

// Print the incremental delta of each chunk; a chunk with no choices is
// printed via its usage field instead.
for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following URL is for the China (Beijing) region. URLs vary by region.
# === Delete this comment before execution ===

curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
    "model": "qwen3-omni-flash",
    "messages": [
        {
            "role": "user",
            "content": "Who are you?"
        }
    ],
    "stream":true,
    "stream_options":{
        "include_usage":true
    },
    "modalities":["text"],
    "enable_thinking": true
}'

Response

data: {"choices":[{"delta":{"content":null,"role":"assistant","reasoning_content":""},"index":0,"logprobs":null,"finish_reason":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"finish_reason":null,"logprobs":null,"delta":{"content":null,"reasoning_content":"Hmm"},"index":0}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"delta":{"content":null,"reasoning_content":","},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
......
data: {"choices":[{"delta":{"content":"Tell me"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"delta":{"content":"!"},"finish_reason":null,"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[{"finish_reason":"stop","delta":{"content":"","reasoning_content":null},"index":0,"logprobs":null}],"object":"chat.completion.chunk","usage":null,"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}
data: {"choices":[],"object":"chat.completion.chunk","usage":{"prompt_tokens":11,"completion_tokens":363,"total_tokens":374,"completion_tokens_details":{"reasoning_tokens":195,"text_tokens":168},"prompt_tokens_details":{"text_tokens":11}},"created":1757937336,"system_fingerprint":null,"model":"qwen3-omni-flash","id":"chatcmpl-ce3d6fe5-e717-4b7e-8b40-3aef12288d4c"}

Multi-turn conversation

When using Qwen-Omni models for multi-turn conversations, note the following:

  • Assistant Message

    Assistant messages in the messages array can contain only text data.

  • User Message

    A user message can contain text and one other modality. In multi-turn conversations, you can input different modalities in different user messages.

OpenAI compatible

import os
from openai import OpenAI

# Initialize the OpenAI client
client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3",
                        "format": "mp3",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "This audio says: Welcome to Alibaba Cloud"}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "Tell me about this company."}],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text"],
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3",
                        "format": "mp3",
                    },
                },
                { "type": "text", "text": "What is this audio about" },
            ],
        },
        {
            "role": "assistant",
            "content": [{ "type": "text", "text": "This audio says: Welcome to Alibaba Cloud" }],
        },
        {
            "role": "user",
            "content": [{ "type": "text", "text": "Tell me about this company." }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text"]
});


for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}
# ======= Important note =======
# API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
# The following URL is for the China (Beijing) region. URLs vary by region.
# === Delete this comment before execution ===

# Multi-turn request: audio + text question, the assistant's earlier text reply, then a text follow-up.
curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
  "model": "qwen3.5-omni-plus",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "input_audio",
          "input_audio": {
            "data": "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3",
            "format": "mp3"
          }
        },
        {
          "type": "text",
          "text": "What is this audio about"
        }
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "type": "text",
          "text": "This audio says: Welcome to Alibaba Cloud"
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "Tell me about this company."
        }
      ]
    }
  ],
  "stream": true,
  "stream_options": {
    "include_usage": true
  },
  "modalities": ["text"]
}'

Parsing output Base64-encoded audio data

Qwen-Omni models output audio as streaming Base64-encoded data. During generation, maintain a string variable and append the Base64-encoded data from each returned chunk. After generation completes, Base64-decode the complete string to get the audio file. Alternatively, decode and play each chunk in real time.

# Installation instructions for pyaudio:
# APPLE Mac OS X
#   brew install portaudio
#   pip install pyaudio
# Debian/Ubuntu
#   sudo apt-get install python-pyaudio python3-pyaudio
#   or
#   pip install pyaudio
# CentOS
#   sudo yum install -y portaudio portaudio-devel && pip install pyaudio
# Microsoft Windows
#   python -m pip install pyaudio

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

# Initialize the OpenAI client
client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[{"role": "user", "content": "Who are you?"}],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

# Method 1: Decode after generation is complete
# Accumulate the Base64 audio fragments from every chunk, then decode them in one pass.
audio_string = ""
for chunk in completion:
    if chunk.choices:
        delta = chunk.choices[0].delta
        # Not every delta has an audio payload, and an audio payload may lack
        # the "data" key, so look both up defensively instead of relying on a
        # broad try/except.
        audio = getattr(delta, "audio", None)
        audio_data = audio.get("data") if audio else None
        if audio_data:
            audio_string += audio_data
        elif delta.content:
            # No audio data in this chunk: show its text content, if any
            print(delta.content)
    else:
        # A chunk with no choices is printed via its usage field
        print(chunk.usage)

# Only decode and write a file when audio was actually received
if audio_string:
    wav_bytes = base64.b64decode(audio_string)
    # The decoded bytes are read as 16-bit integer samples and written at 24000 Hz
    audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
    sf.write("audio_assistant_py.wav", audio_np, samplerate=24000)
else:
    print("No audio data was returned; nothing was written.")

# Method 2: Decode while generating (comment out the code for Method 1 to use Method 2)
# # Initialize PyAudio
# import pyaudio
# import time
# p = pyaudio.PyAudio()
# # Create an audio stream
# stream = p.open(format=pyaudio.paInt16,
#                 channels=1,
#                 rate=24000,
#                 output=True)

# for chunk in completion:
#     if chunk.choices:
#         if hasattr(chunk.choices[0].delta, "audio"):
#             try:
#                 audio_string = chunk.choices[0].delta.audio["data"]
#                 wav_bytes = base64.b64decode(audio_string)
#                 audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
#                 # Play the audio data directly
#                 stream.write(audio_np.tobytes())
#             except Exception as e:
#                 print(chunk.choices[0].delta.content)

# time.sleep(0.8)
# # Clean up resources
# stream.stop_stream()
# stream.close()
# p.terminate()
// Before running:
// For Windows/Mac/Linux:
// 1. Ensure Node.js version >= 14 is installed.
// 2. Run the following command to install necessary dependencies:
//    npm install openai wav
// 
// To use the real-time playback feature (Method 2), you also need:
// Windows:
//    npm install speaker
// Mac:
//    brew install portaudio
//    npm install speaker
// Linux (Ubuntu/Debian):
//    sudo apt-get install libasound2-dev
//    npm install speaker

import OpenAI from "openai";

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey:"sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": "Who are you?"
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

// Method 1: Decode after generation is complete
// Requires installation: npm install wav
import { createWriteStream } from 'node:fs';  // node:fs is a built-in Node.js module, no installation required
import { Writer } from 'wav';

/**
 * Wrap Base64-encoded PCM audio in a WAV container and save it to disk.
 *
 * The WAV stream is written as 24 kHz, mono, 16-bit. Failures are logged
 * rather than rethrown, so the returned promise always resolves.
 *
 * @param {string} audioString - Base64-encoded PCM audio collected from the streamed chunks.
 * @param {string} audioPath - Destination path of the WAV file.
 * @returns {Promise<void>} Resolves once the file has been flushed, or after a failure was logged.
 */
async function convertAudio(audioString, audioPath) {
    // Nothing was streamed back: do not create a header-only file and report success.
    if (!audioString) {
        console.warn('No audio data received; skipping WAV export.');
        return;
    }

    try {
        // Decode the Base64 string into a Buffer of raw PCM samples
        const pcmBuffer = Buffer.from(audioString, 'base64');
        // Create a WAV file write stream
        const writer = new Writer({
            sampleRate: 24000,  // Sample rate
            channels: 1,        // Mono
            bitDepth: 16        // 16-bit depth
        });
        const outputStream = createWriteStream(audioPath);

        // Register completion/error handlers before any data flows so that a
        // failure on either stream rejects the promise.
        const written = new Promise((resolve, reject) => {
            outputStream.on('finish', resolve);
            outputStream.on('error', reject);
            writer.on('error', reject);
        });

        // Pipe the WAV stream into the file, write the PCM data and end writing
        writer.pipe(outputStream);
        writer.end(pcmBuffer);

        // 'finish' fires only after all data has been flushed, so no extra
        // fixed delay is needed.
        await written;

        console.log(`Audio file successfully saved as ${audioPath}`);
    } catch (error) {
        console.error('An error occurred during processing:', error);
    }
}

let audioString = "";
for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        if (chunk.choices[0].delta.audio) {
            if (chunk.choices[0].delta.audio["data"]) {
                audioString += chunk.choices[0].delta.audio["data"];
            }
        }
    } else {
        console.log(chunk.usage);
    }
}
// Execute the conversion
convertAudio(audioString, "audio_assistant_mjs.wav");


// Method 2: Generate and play in real time
// Install necessary components according to your system's instructions above.
// import Speaker from 'speaker'; // Import the audio playback library

// // Create a speaker instance (configuration matches WAV file parameters)
// const speaker = new Speaker({
//     sampleRate: 24000,  // Sample rate
//     channels: 1,        // Number of sound channels
//     bitDepth: 16,       // Bit depth
//     signed: true        // Signed PCM
// });
// for await (const chunk of completion) {
//     if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
//         if (chunk.choices[0].delta.audio) {
//             if (chunk.choices[0].delta.audio["data"]) {
//                 const pcmBuffer = Buffer.from(chunk.choices[0].delta.audio.data, 'base64');
//                 // Write directly to the speaker for playback
//                 speaker.write(pcmBuffer);
//             }
//         }
//     } else {
//         console.log(chunk.usage);
//     }
// }
// speaker.on('finish', () => console.log('Playback complete'));
// speaker.end(); // Call based on the actual end of the API stream

Input a Base64-encoded local file

When using Base64 encoding to send files, the encoded Base64 string must be smaller than 10 MB.

Images

This example uses the locally saved file eagle.png.

import os
from openai import OpenAI
import base64

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


# Base64 encoding format
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


base64_image = encode_image("eagle.png")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus",# For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": "What scene is depicted in the image?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

/**
 * Read a local file and return its contents as a Base64 string.
 * @param {string} imagePath - Path of the file to encode.
 * @returns {string} Base64-encoded file contents.
 */
const encodeImage = (imagePath) => readFileSync(imagePath).toString('base64');
const base64Image = encodeImage("eagle.png")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "image_url",
                "image_url": { "url": `data:image/png;base64,${base64Image}` },
            },
            { "type": "text", "text": "What scene is depicted in the image?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Audio

This example uses the locally saved file welcome.mp3.

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf
import requests

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


def encode_audio(audio_path):
    """Return the contents of the file at ``audio_path`` as a Base64 string."""
    with open(audio_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


base64_audio = encode_audio("welcome.mp3")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": f"data:;base64,{base64_audio}",
                        "format": "mp3",
                    },
                },
                {"type": "text", "text": "What is this audio about"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

/**
 * Read a local file and return its contents as a Base64 string.
 * @param {string} audioPath - Path of the file to encode.
 * @returns {string} Base64-encoded file contents.
 */
const encodeAudio = (audioPath) => readFileSync(audioPath).toString('base64');
const base64Audio = encodeAudio("welcome.mp3")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "input_audio",
                "input_audio": { "data": `data:;base64,${base64Audio}`, "format": "mp3" },
            },
            { "type": "text", "text": "What is this audio about" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Video

Video file

This example uses the locally saved file spring_mountain.mp4.

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


# Base64 encoding format
def encode_video(video_path):
    """Return the contents of the file at ``video_path`` as a Base64 string."""
    with open(video_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


base64_video = encode_video("spring_mountain.mp4")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus", # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:;base64,{base64_video}"},
                },
                {"type": "text", "text": "What is she singing?"},
            ],
        },
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI(
    {
        // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
        apiKey: process.env.DASHSCOPE_API_KEY,
        baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
    }
);

/**
 * Read a local file and return its contents as a Base64 string.
 * @param {string} videoPath - Path of the file to encode.
 * @returns {string} Base64-encoded file contents.
 */
const encodeVideo = (videoPath) => readFileSync(videoPath).toString('base64');
const base64Video = encodeVideo("spring_mountain.mp4")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [
        {
            "role": "user",
            "content": [{
                "type": "video_url",
                "video_url": { "url": `data:;base64,${base64Video}` },
            },
            { "type": "text", "text": "What is she singing?" }]
        }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

Image list

This example uses the locally saved files football1.jpg, football2.jpg, football3.jpg, and football4.jpg.

import os
from openai import OpenAI
import base64
import numpy as np
import soundfile as sf

client = OpenAI(
    # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
    # API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    # The following base URL is for the China (Beijing) region. URLs vary by region.
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


# Base64 encoding format
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


base64_image_1 = encode_image("football1.jpg")
base64_image_2 = encode_image("football2.jpg")
base64_image_3 = encode_image("football3.jpg")
base64_image_4 = encode_image("football4.jpg")

completion = client.chat.completions.create(
    model="qwen3.5-omni-plus",  # For Qwen3-Omni-Flash, run in non-thinking mode.
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": [
                        f"data:image/jpeg;base64,{base64_image_1}",
                        f"data:image/jpeg;base64,{base64_image_2}",
                        f"data:image/jpeg;base64,{base64_image_3}",
                        f"data:image/jpeg;base64,{base64_image_4}",
                    ],
                },
                {"type": "text", "text": "Describe the process shown in this video"},
            ],
        }
    ],
    # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
    modalities=["text", "audio"],
    audio={"voice": "Tina", "format": "wav"},
    # stream must be set to True, otherwise an error occurs.
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in completion:
    if chunk.choices:
        print(chunk.choices[0].delta)
    else:
        print(chunk.usage)
import OpenAI from "openai";
import { readFileSync } from 'fs';

const openai = new OpenAI({
     // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
    // API keys for the Singapore and Beijing regions are different. Get an API key: https://www.alibabacloud.com/help/en/model-studio/get-api-key
    apiKey: process.env.DASHSCOPE_API_KEY, 
    // The following base URL is for the China (Beijing) region. URLs vary by region.
    baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
});

/**
 * Read a local file and return its contents as a Base64 string.
 * @param {string} imagePath - Path of the file to encode.
 * @returns {string} Base64-encoded file contents.
 */
const encodeImage = (imagePath) => readFileSync(imagePath).toString('base64');
const base64Image1 = encodeImage("football1.jpg")
const base64Image2 = encodeImage("football2.jpg")
const base64Image3 = encodeImage("football3.jpg")
const base64Image4 = encodeImage("football4.jpg")

const completion = await openai.chat.completions.create({
    model: "qwen3.5-omni-plus",  // For Qwen3-Omni-Flash, run in non-thinking mode.
    messages: [{
        role: "user",
        content: [
            {
                type: "video",
                video: [
                    `data:image/jpeg;base64,${base64Image1}`,
                    `data:image/jpeg;base64,${base64Image2}`,
                    `data:image/jpeg;base64,${base64Image3}`,
                    `data:image/jpeg;base64,${base64Image4}`
                ]
            },
            {
                type: "text",
                text: "Describe the process shown in this video"
            }
        ]
    }],
    stream: true,
    stream_options: {
        include_usage: true
    },
    modalities: ["text", "audio"],
    audio: { voice: "Tina", format: "wav" }
});

for await (const chunk of completion) {
    if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
        console.log(chunk.choices[0].delta);
    } else {
        console.log(chunk.usage);
    }
}

API reference

For input and output parameters, see OpenAI-compatible Chat.

Billing and rate limits

Billing rules

Qwen-Omni is billed based on tokens consumed across modalities (audio, image, and video). Check billing details in the console.

Token conversion rules for audio, images, and videos

Audio

  • Qwen3.5-Omni series:

    • Input audio formula: Total tokens = Audio duration (seconds) * 7

    • Output audio formula: Total tokens = Audio duration (seconds) * 12.5

  • Qwen3-Omni-Flash: For both input and output audio, Total tokens = Audio duration (seconds) * 12.5

  • Qwen-Omni-Turbo: For both input and output audio, Total tokens = Audio duration (seconds) * 25

If the audio duration is less than 1 second, it is counted as 1 second.

Images

  • Qwen3.5-Omni series and Qwen3-Omni-Flash: 1 token per 32x32 pixels

  • Qwen-Omni-Turbo model: 1 token per 28x28 pixels

For the Qwen3.5-Omni series, each image requires a minimum of 24 tokens; for other models, the minimum is 4 tokens. The default maximum is 1280 tokens. The Qwen3.5-Omni series supports the vl_high_resolution_images parameter to increase the maximum to 16384 tokens (Qwen-Omni-Turbo and Qwen3-Omni-Flash do not support this parameter). Use the following code to estimate the token count for a single image:

import math
from PIL import Image  # pip install Pillow

# ============ Model configuration (modify as needed) ============

# Image factor: 32 for Qwen3.5-Omni series and Qwen3-Omni-Flash; 28 for Qwen-Omni-Turbo
IMAGE_FACTOR = 32

# Min tokens: 24 for Qwen3.5-Omni series; 4 for Qwen-Omni-Turbo and Qwen3-Omni-Flash
MIN_TOKENS = 24

# High-resolution mode (Qwen3.5-Omni series only; not supported by Qwen-Omni-Turbo or Qwen3-Omni-Flash)
# True  → max tokens = 16384
# False → max tokens = 1280 (default)
VL_HIGH_RESOLUTION_IMAGES = False

# ============ Pixel range (auto-calculated from above) ============

MIN_PIXELS = MIN_TOKENS * IMAGE_FACTOR * IMAGE_FACTOR
MAX_PIXELS = (16384 if VL_HIGH_RESOLUTION_IMAGES else 1280) * IMAGE_FACTOR * IMAGE_FACTOR


def smart_resize(height, width, factor=IMAGE_FACTOR,
                 min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS):
    """Snap an image size to multiples of ``factor`` and keep its area within bounds.

    Both sides are first rounded to the nearest multiple of ``factor`` (never
    below ``factor`` itself). If the resulting area exceeds ``max_pixels`` the
    original size is scaled down and floored to multiples of ``factor``; if it
    is below ``min_pixels`` the original size is scaled up and ceiled.

    Returns:
        tuple: ``(height, width)`` of the resized image.
    """
    snapped_h = max(factor, round(height / factor) * factor)
    snapped_w = max(factor, round(width / factor) * factor)
    area = snapped_h * snapped_w

    if area > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down.
        shrink = math.sqrt((height * width) / max_pixels)
        return (math.floor(height / shrink / factor) * factor,
                math.floor(width / shrink / factor) * factor)

    if area < min_pixels:
        # Too small: enlarge both sides by the same ratio, rounding up.
        grow = math.sqrt(min_pixels / (height * width))
        return (math.ceil(height * grow / factor) * factor,
                math.ceil(width * grow / factor) * factor)

    return snapped_h, snapped_w


if __name__ == "__main__":
    image = Image.open("xxx/test.jpg")
    print(f"Original size: {image.width}x{image.height}")

    resized_h, resized_w = smart_resize(image.height, image.width)
    token = int(resized_h * resized_w / (IMAGE_FACTOR * IMAGE_FACTOR)) + 2
    print(f"Resized: {resized_w}x{resized_h}, tokens: {token}")

Video

Tokens for video files are divided into video_tokens and audio_tokens.

  • video_tokens

    The calculation is complex. See the following code:

    # pip install opencv-python
    import math
    import cv2
    
    # ============ Model configuration (modify as needed) ============
    
    # Image factor: 32 for Qwen3.5-Omni series and Qwen3-Omni-Flash; 28 for Qwen-Omni-Turbo
    IMAGE_FACTOR = 32
    
    FRAME_FACTOR = 2
    FPS = 2
    MAX_RATIO = 200
    
    # Min pixels per video frame
    VIDEO_MIN_PIXELS = 64 * IMAGE_FACTOR * IMAGE_FACTOR
    
    # Max pixels per video frame
    # Qwen3.5-Omni series: 640 * 32 * 32
    # Qwen3-Omni-Flash: 768 * 32 * 32
    # Qwen-Omni-Turbo: 768 * 28 * 28
    VIDEO_MAX_PIXELS = 640 * IMAGE_FACTOR * IMAGE_FACTOR
    
    # Min extracted frames: 2 for Qwen3.5-Omni series and Qwen3-Omni-Flash; 4 for Qwen-Omni-Turbo
    FPS_MIN_FRAMES = 2
    
    # Max extracted frames: 2048 for Qwen3.5-Omni series; 128 for Qwen3-Omni-Flash; 80 for Qwen-Omni-Turbo
    FPS_MAX_FRAMES = 2048
    
    # Max total pixels for video input
    # Qwen3.5-Omni series: 180224 * 32 * 32
    # Qwen3-Omni-Flash: 16384 * 32 * 32
    # Qwen-Omni-Turbo: 16384 * 28 * 28
    VIDEO_TOTAL_PIXELS = 180224 * IMAGE_FACTOR * IMAGE_FACTOR
    
    
    # ============ Core functions ============
    
    def get_video_info(video_path):
        """Read basic video info: height, width, total frames, fps.

        Args:
            video_path: Path of the video, passed to ``cv2.VideoCapture``.

        Returns:
            tuple: ``(height, width, total_frames, fps)``; the first three are
            converted to ``int``, ``fps`` is returned as reported by OpenCV.

        Raises:
            IOError: If the video cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            # Fail early instead of returning all-zero properties for an
            # unreadable file.
            if not cap.isOpened():
                raise IOError(f"Cannot open video: {video_path}")
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            return height, width, total_frames, fps
        finally:
            # Always release the capture handle, even when reading fails.
            cap.release()
    
    
    def smart_nframes(total_frames, video_fps):
        """Calculate the number of frames to extract based on video duration and fps.

        The duration is ``total_frames / video_fps``. A trailing fraction of a
        second that does not exceed ``1 / FPS`` is dropped before sampling at
        ``FPS`` frames per second. The result is clamped between the minimum
        (``FPS_MIN_FRAMES`` aligned up to ``FRAME_FACTOR``) and the maximum
        (``FPS_MAX_FRAMES`` or the frame count, aligned down to
        ``FRAME_FACTOR``), and never exceeds the adjusted frame count.

        Args:
            total_frames: Number of frames in the video.
            video_fps: Native frame rate of the video.

        Returns:
            int: Number of frames to extract.

        Raises:
            ValueError: If ``video_fps`` is not positive, or if the computed
                count falls outside ``[FRAME_FACTOR, total_frames]``.
        """
        # A missing or zero frame rate would otherwise surface as a
        # ZeroDivisionError further down.
        if not video_fps or video_fps <= 0:
            raise ValueError(f"video_fps must be positive, got {video_fps}")

        min_frames = math.ceil(FPS_MIN_FRAMES / FRAME_FACTOR) * FRAME_FACTOR
        max_frames = min(FPS_MAX_FRAMES, total_frames) // FRAME_FACTOR * FRAME_FACTOR

        duration = total_frames / video_fps
        if duration - int(duration) > (1 / FPS):
            total_frames = math.ceil(duration * video_fps)
        else:
            # Drop the trailing partial second when it is too short to sample.
            total_frames = math.ceil(int(duration) * video_fps)

        nframes = total_frames / video_fps * FPS
        nframes = int(min(max(nframes, min_frames), max_frames, total_frames))
        if not (FRAME_FACTOR <= nframes <= total_frames):
            raise ValueError(f"nframes should in [{FRAME_FACTOR}, {total_frames}], got {nframes}")
        return nframes
    
    
    def smart_resize(height, width, nframes, factor=IMAGE_FACTOR):
        """Compute the per-frame size used for token counting.

        The per-frame pixel cap is the smaller of ``VIDEO_MAX_PIXELS`` and the
        share of ``VIDEO_TOTAL_PIXELS`` available per ``FRAME_FACTOR`` frames,
        but never below 105% of ``VIDEO_MIN_PIXELS``. Both sides are aligned to
        multiples of ``factor``.

        Returns:
            tuple: ``(height, width)`` of the resized frame.

        Raises:
            ValueError: If the aspect ratio exceeds ``MAX_RATIO``.
        """
        per_frame_budget = VIDEO_TOTAL_PIXELS / nframes * FRAME_FACTOR
        lower_bound = int(VIDEO_MIN_PIXELS * 1.05)
        max_pixels = max(min(VIDEO_MAX_PIXELS, per_frame_budget), lower_bound)

        long_side = max(height, width)
        short_side = min(height, width)
        if long_side / short_side > MAX_RATIO:
            raise ValueError(f"aspect ratio exceeds {MAX_RATIO}")

        snapped_h = max(factor, round(height / factor) * factor)
        snapped_w = max(factor, round(width / factor) * factor)
        area = snapped_h * snapped_w

        if area > max_pixels:
            # Too large: shrink both sides by the same ratio, rounding down.
            shrink = math.sqrt((height * width) / max_pixels)
            return (math.floor(height / shrink / factor) * factor,
                    math.floor(width / shrink / factor) * factor)

        if area < VIDEO_MIN_PIXELS:
            # Too small: enlarge both sides by the same ratio, rounding up.
            grow = math.sqrt(VIDEO_MIN_PIXELS / (height * width))
            return (math.ceil(height * grow / factor) * factor,
                    math.ceil(width * grow / factor) * factor)

        return snapped_h, snapped_w
    
    
    # ============ Calculate tokens ============
    
    if __name__ == "__main__":
        video_path = "spring_mountain.mp4"
    
        height, width, total_frames, video_fps = get_video_info(video_path)
        print(f"Video info: {width}x{height}, {total_frames} frames, {video_fps:.1f} fps")
    
        nframes = smart_nframes(total_frames, video_fps)
        resized_h, resized_w = smart_resize(height, width, nframes)
    
        video_tokens = int(
            math.ceil(nframes / FPS) * resized_h / IMAGE_FACTOR * resized_w / IMAGE_FACTOR
        ) + 2
        print(f"Extracted frames: {nframes}, resized: {resized_w}x{resized_h}, video_tokens: {video_tokens}")
  • audio_tokens

    • Qwen3.5-Omni series:

      • Input audio: Total tokens = Audio duration (seconds) * 7

      • Output audio: Total tokens = Audio duration (seconds) * 12.5

    • Qwen3-Omni-Flash: For both input and output audio, Total tokens = Audio duration (seconds) * 12.5

    • Qwen-Omni-Turbo: For both input and output audio, Total tokens = Audio duration (seconds) * 25

    Audio with a duration of less than 1 second is calculated as 1 second.

Free quota

To claim, query, or use your free quota, see Free quota for new users.

Rate limits

For rate limiting rules and FAQ, see Rate limiting.

FAQ

Q: How do I set a role for the Qwen-Omni-Turbo model?

A: The Qwen-Omni-Turbo model does not support setting a System Message when the output modality includes audio. Even if you specify a role, such as "You are XXX...", in the System Message, the model's self-perception remains as Qwen.

  • Method 1 (recommended): The Qwen3-Omni-Flash model now supports setting a System Message. We recommend switching to this model series.

  • Method 2: Manually add a user message and an assistant message for role setting at the beginning of the messages array. This effectively sets a role for the Qwen-Omni model.

    Sample code for role setting

    OpenAI compatible

    import os
    from openai import OpenAI
    
    client = OpenAI(
        # If no environment variable is set, replace the next line with your Model Studio API key: api_key="sk-xxx"
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    
    completion = client.chat.completions.create(
        model="qwen-omni-turbo",
        messages=[
            {"role": "user", "content": "You are a shopping mall salesperson, and your products include mobile phones, computers, and refrigerators"},
            {"role": "assistant", "content": "Okay, I remember your setting."},
            {"role": "user", "content": "Who are you?"},
        ],
        # Set the output modality. Two options are currently supported: ["text","audio"] and ["text"]
        modalities=["text", "audio"],
        audio={"voice": "Tina", "format": "wav"},
        # stream must be set to True, otherwise an error occurs.
        stream=True,
        stream_options={"include_usage": True},
    )
    
    for chunk in completion:
        if chunk.choices:
            print(chunk.choices[0].delta)
        else:
            print(chunk.usage)
    import OpenAI from "openai";
    
    const openai = new OpenAI(
        {
            // If no environment variable is set, replace the next line with your Model Studio API key: apiKey: "sk-xxx"
            apiKey: process.env.DASHSCOPE_API_KEY,
            baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
        }
    );
    const completion = await openai.chat.completions.create({
        model: "qwen-omni-turbo",
        messages: [
            { role: "user", content: "You are a shopping mall salesperson, and your products include mobile phones, computers, and refrigerators" },
            { role: "assistant", content: "Okay, I remember your setting." },
            { role: "user", content: "Who are you?" }
        ],
        stream: true,
        stream_options: {
            include_usage: true
        },
        modalities: ["text", "audio"],
        audio: { voice: "Tina", format: "wav" }
    });
    
    for await (const chunk of completion) {
        if (Array.isArray(chunk.choices) && chunk.choices.length > 0) {
            console.log(chunk.choices[0].delta);
        } else {
            console.log(chunk.usage);
        }
    }
    curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
    -H "Authorization: Bearer $DASHSCOPE_API_KEY" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "qwen-omni-turbo",
        "messages": [
            {
                "role": "user", 
                "content": "You are a shopping mall salesperson, and your products include mobile phones, computers, and refrigerators"
            },
            {
                "role": "assistant", 
                "content": "Okay, I remember your setting."
            },
            {
                "role": "user", 
                "content": "Who are you?"
            }
        ],
        "stream":true,
        "stream_options":{
            "include_usage":true
        },
        "modalities":["text","audio"],
        "audio":{"voice":"Tina","format":"wav"}
    }'
    

Error codes

If the model call fails and returns an error message, see Error codes for resolution.

Voice list

For a list of voices for the Qwen-Omni model, see Voice list.