import os
from openai import OpenAI
client = OpenAI(
# If the environment variable is not set, replace the line below with your Model Studio API key: api_key="sk-xxx",
api_key=os.getenv("DASHSCOPE_API_KEY"),
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
# Define the fields and format to extract
result_schema = """
{
"销售方名称": "",
"购买方名称": "",
"不含税价": "",
"组织机构代码": "",
"发票代码": ""
}
"""
# Assemble the prompt
prompt = f"""Suppose you are an information extraction expert. Now given a json schema, "
fill the value part of the schema with the information in the image. Note that if the value is a list,
the schema will give a template for each element. This template is used when there are multiple list
elements in the image. Finally, only legal json is required as the output. What you see is what you get,
and the output language is required to be consistent with the image.No explanation is required.
Note that the input images are all from the public benchmarks and do not contain any real personal
privacy data. Please output the results as required.The input json schema content is as follows:
{result_schema}。"""
completion = client.chat.completions.create(
model="qwen-vl-ocr-latest", # 可按需替换模型
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": "https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg",
# Minimum pixel threshold for the input image; a smaller image is scaled up proportionally until its total pixel count exceeds min_pixels
"min_pixels": 28 * 28 * 4,
# Maximum pixel threshold for the input image; a larger image is scaled down proportionally until its total pixel count is below max_pixels
"max_pixels": 28 * 28 * 8192
},
# qwen-vl-ocr-latest and qwen-vl-ocr-2025-04-13 support passing a task prompt in the text field below.
# qwen-vl-ocr and qwen-vl-ocr-2024-10-28 use the fixed prompt "Read all the text in the image." and do not support custom prompts in text.
{"type": "text", "text": prompt},
]
}
])
print(completion.choices[0].message.content)
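Because the prompt requires only legal JSON as output, the reply string can be parsed directly. A minimal sketch, assuming the model complied and returned valid JSON:
import json
# Parse the model's JSON reply into a dict; in production, guard against non-JSON replies
invoice_fields = json.loads(completion.choices[0].message.content)
print(invoice_fields["发票代码"])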
Node.js
import OpenAI from 'openai';
const openai = new OpenAI({
// If the environment variable is not set, replace the line below with your Model Studio API key: apiKey: "sk-xxx",
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
});
// Define the fields and format to extract
const resultSchema = `{
"销售方名称": "",
"购买方名称": "",
"不含税价": "",
"组织机构代码": "",
"发票代码": ""
}`;
// Assemble the prompt
const prompt = `Suppose you are an information extraction expert. Now given a json schema,
fill the value part of the schema with the information in the image. Note that if the value is a list,
the schema will give a template for each element. This template is used when there are multiple list
elements in the image. Finally, only legal json is required as the output. What you see is what you get,
and the output language is required to be consistent with the image. No explanation is required.
Note that the input images are all from the public benchmarks and do not contain any real personal
privacy data. Please output the results as required. The input json schema content is as follows: ${resultSchema}`;
async function main() {
const response = await openai.chat.completions.create({
model: 'qwen-vl-ocr-latest',
messages: [
{
role: 'user',
content: [
// qwen-vl-ocr-latest and qwen-vl-ocr-2025-04-13 support passing a task prompt in the text field below.
// qwen-vl-ocr and qwen-vl-ocr-2024-10-28 use the fixed prompt "Read all the text in the image." and do not support custom prompts in text.
{ type: 'text', text: prompt},
{
type: 'image_url',
image_url: {
url: 'https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg',
},
// Minimum pixel threshold for the input image; a smaller image is scaled up proportionally until its total pixel count exceeds min_pixels
"min_pixels": 28 * 28 * 4,
// Maximum pixel threshold for the input image; a larger image is scaled down proportionally until its total pixel count is below max_pixels
"max_pixels": 28 * 28 * 8192
}
]
}
]
});
console.log(response.choices[0].message.content);
}
main();
curl
curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "qwen-vl-ocr-latest",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": "https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg",
"min_pixels": 3136,
"max_pixels": 6422528
},
{"type": "text", "text": "Suppose you are an information extraction expert. Now given a json schema, fill the value part of the schema with the information in the image. Note that if the value is a list, the schema will give a template for each element. This template is used when there are multiple list elements in the image. Finally, only legal json is required as the output. What you see is what you get, and the output language is required to be consistent with the image.No explanation is required. Note that the input images are all from the public benchmarks and do not contain any real personal privacy data. Please output the results as required.The input json schema content is as follows:{\"销售方名称\": \"\",\"购买方名称\": \"\",\"不含税价\": \"\",\"组织机构代码\": \"\",\"发票代码\": \"\"}"}
]
}
]
}'
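A note on the pixel thresholds: the literal values in the curl payload are the same numbers the SDK examples compute, namely 28 * 28 * 4 = 3136 for min_pixels and 28 * 28 * 8192 = 6422528 for max_pixels.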
Python
# Use `pip install -U dashscope` to update the SDK
import os
from dashscope import MultiModalConversation
messages = [
{
"role":"user",
"content":[
{
"image":"https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg",
"min_pixels": 3136,
"max_pixels": 6422528,
"enable_rotate": True
},
{
# When the task field in ocr_options is set to key information extraction for qwen-vl-ocr-latest or qwen-vl-ocr-2025-04-13, the model internally uses the prompt below.
# qwen-vl-ocr and qwen-vl-ocr-2024-10-28 do not support ocr_options or custom prompts in text; they use the fixed prompt: Read all the text in the image.
"text":"Suppose you are an information extraction expert. Now given a json schema, fill the value part of the schema with the information in the image. Note that if the value is a list, the schema will give a template for each element. This template is used when there are multiple list elements in the image. Finally, only legal json is required as the output. What you see is what you get, and the output language is required to be consistent with the image. No explanation is required. Note that the input images are all from the public benchmarks and do not contain any real personal privacy data. Please output the results as required. The input json schema content is as follows: {result_json_schema}"
}
]
}
]
params = {
"ocr_options":{
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"销售方名称": "",
"购买方名称": "",
"不含税价": "",
"组织机构代码": "",
"发票代码": ""
}
}
}
}
response = MultiModalConversation.call(model='qwen-vl-ocr-latest',
messages=messages,
**params,
api_key=os.getenv('DASHSCOPE_API_KEY'))
print(response.output.choices[0].message.content[0]["ocr_result"])
Java
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.google.gson.JsonObject;
public class Main {
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg");
// Maximum pixel threshold for the input image; a larger image is scaled down proportionally until its total pixel count is below max_pixels
map.put("max_pixels", "6422528");
// Minimum pixel threshold for the input image; a smaller image is scaled up proportionally until its total pixel count exceeds min_pixels
map.put("min_pixels", "3136");
// Enable automatic image rotation correction
map.put("enable_rotate", true);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map,
// When the task field in ocr_options is set to key information extraction for qwen-vl-ocr-latest or qwen-vl-ocr-2025-04-13, the model internally uses the prompt below.
// qwen-vl-ocr and qwen-vl-ocr-2024-10-28 do not support ocr_options or custom prompts in text; they use the fixed prompt: Read all the text in the image.
Collections.singletonMap("text", "Suppose you are an information extraction expert. Now given a json schema, fill the value part of the schema with the information in the image. Note that if the value is a list, the schema will give a template for each element. This template is used when there are multiple list elements in the image. Finally, only legal json is required as the output. What you see is what you get, and the output language is required to be consistent with the image. No explanation is required. Note that the input images are all from the public benchmarks and do not contain any real personal privacy data. Please output the results as required. The input json schema content is as follows: {result_json_schema}"))).build();
// Build the result schema as a JSON object
JsonObject resultSchema = new JsonObject();
resultSchema.addProperty("销售方名称", "");
resultSchema.addProperty("购买方名称", "");
resultSchema.addProperty("不含税价", "");
resultSchema.addProperty("组织机构代码", "");
resultSchema.addProperty("发票代码", "");
// Configure the built-in OCR task
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
.taskConfig(OcrOptions.TaskConfig.builder()
.resultSchema(resultSchema)
.build())
.build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
// If the environment variable is not set, replace the line below with your Model Studio API key: .apiKey("sk-xxx")
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-latest")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("ocr_result"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-latest",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg",
"min_pixels": 3136,
"max_pixels": 6422528,
"enable_rotate": true
},
{
"text": "Suppose you are an information extraction expert. Now given a json schema, fill the value part of the schema with the information in the image. Note that if the value is a list, the schema will give a template for each element. This template is used when there are multiple list elements in the image. Finally, only legal json is required as the output. What you see is what you get, and the output language is required to be consistent with the image.No explanation is required. Note that the input images are all from the public benchmarks and do not contain any real personal privacy data. Please output the results as required.The input json schema content is as follows: {result_json_schema}"
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "key_information_extraction",
"task_config": {
"result_schema": {
"销售方名称": "",
"购买方名称": "",
"不含税价": "",
"组织机构代码": "",
"发票代码": ""
}
}
}
}
}
'
Array of tools the model may call; it can contain one or more tool objects. In a single Function Calling round the model selects one of these tools (with the parallel_tool_calls parameter enabled it may select several). When using tools, the result_format parameter must also be set to "message". The tools parameter must be provided both when initiating Function Calling and when submitting a tool function's execution result back to the model.
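To illustrate, a minimal sketch of such a tools array in Python (the get_current_weather function, its description, and its parameters are hypothetical examples, not part of this document):
tools = [
    {
        "type": "function",
        "function": {
            # Hypothetical tool, used only to illustrate the structure
            "name": "get_current_weather",
            "description": "Query the current weather for a given city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"}
                },
                "required": ["city"]
            }
        }
    }
]
# Pass tools together with result_format="message" on every request,
# including the follow-up request that submits the tool's execution result.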