获取训练任务的详情。
调试
您可以在OpenAPI Explorer中直接运行该接口,免去您计算签名的困扰。运行成功后,OpenAPI Explorer可以自动生成SDK代码示例。
授权信息
当前API暂无授权信息透出。
请求语法
GET /api/v1/trainingjobs/{TrainingJobId} HTTP/1.1
请求参数
| 名称 | 类型 | 必填 | 描述 | 示例值 |
|---|---|---|---|---|
| TrainingJobId | string | 是 | 训练任务 ID。 | train129f212o89d |
返回参数
示例
正常返回示例
JSON格式
{
"OutputChannels": [
{
"Name": "model",
"OutputUri": "oss://test-bucket.oss-cn-hangzhou-internal.aliyuncs.com/path/to/output/model/",
"DatasetId": "d-8o0hh35po15ejcdq2p",
"VersionName": "v1"
}
],
"Status": "Running",
"HyperParameters": [
{
"Name": "learning_rate",
"Value": 0.0001
}
],
"TrainingJobName": "qwen_llm",
"TrainingJobId": "traini6hhxiq69eo",
"Scheduler": {
"MaxRunningTimeInSeconds": 0,
"MaxRunningTimeInMinutes": 100
},
"ReasonMessage": "None",
"WorkspaceId": 86995,
"ComputeResource": {
"EcsCount": 1,
"EcsSpec": "ecs.gn5-c8g1.2xlarge",
"ResourceId": "quotam670lixikcl",
"ResourceName": "quota",
"InstanceCount": 1,
"InstanceSpec": {
"CPU": 8,
"Memory": 32,
"SharedMemory": 32,
"GPU": 1,
"GPUType": "V100"
},
"UseSpotInstance": true,
"SpotSpec": {
"SpotStrategy": "SpotWithPriceLimit",
"SpotDiscountLimit": 0.9
}
},
"Labels": [
{
"Key": "CreatedBy",
"Value": "QuickStart"
}
],
"AlgorithmProvider": "pai",
"InputChannels": [
{
"DatasetId": "d-475megosidivjfgfq6",
"Name": "model",
"InputUri": "oss://test-bucket.oss-cn-hangzhou-internal.aliyuncs.com/path/to/input/model/",
"Options": "ossAppendable=true",
"VersionName": "v1"
}
],
"AlgorithmName": "llm_training",
"ReasonCode": "TrainingJobSucceed",
"GmtModifiedTime": "2024-07-10T11:49:47Z",
"StatusTransitions": [
{
"EndTime": "2024-07-10T11:49:47Z",
"ReasonCode": "TrainingJobSucceed",
"ReasonMessage": "KubeDL job runs successfully",
"StartTime": "2024-07-10T11:49:47Z",
"Status": "Creating"
}
],
"TrainingJobDescription": "Qwen2大语言模型训练。",
"UserId": 123456789,
"AlgorithmVersion": "v0.0.1",
"LatestMetrics": [
{
"Name": "loss",
"Timestamp": "2024-07-10T11:49:47Z",
"Value": 0.11
}
],
"GmtCreateTime": "2024-07-10T11:49:47Z",
"RequestId": "473469C7-AA6F-4DC5-B3DB-A3DC0DE3C83E",
"Instances": [
{
"Name": "train1oug3yehan4-master-0",
"Role": "master",
"Status": "Succeeded"
}
],
"AlgorithmId": "algo-xsldfvu1334",
"TrainingJobUrl": "https://pai.console.aliyun.com/?regionId=cn-hangzhou&workspaceId=1234#/training/jobs/train1ouyadsl8n4",
"RoleArn": "acs:ram::{accountID}:role/{roleName}",
"AlgorithmSpec": {
"OutputChannels": [
{
"Description": "模型输出。",
"Required": true,
"Properties": {},
"Name": "Model",
"SupportedChannelTypes": [
"oss"
]
}
],
"HyperParameters": [
{
"DefaultValue": 0,
"Type": "Integer",
"Description": "参数定义了训练的batch_size。",
"Required": true,
"Name": "batch_size",
"Range": {
"Enum": [
0
],
"MinLength": 1,
"MaxLength": 30,
"Minimum": 0,
"Maximum": 10,
"ExclusiveMinimum": true,
"ExclusiveMaximum": true,
"Pattern": "^\\+?[1-9][0-9]*$"
},
"DisplayName": "Batch Size"
}
],
"JobType": "TFJob",
"Command": [
"python train.py"
],
"MetricDefinitions": [
{
"Description": "训练损失函数。",
"Regex": ".*train:loss=([-+]?[0-9]*\\\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*",
"Name": "loss"
}
],
"InputChannels": [
{
"Description": "模型输出。",
"Required": true,
"Properties": {},
"Name": "Model",
"SupportedChannelTypes": [
"oss"
]
}
],
"SupportsDistributedTraining": true,
"Image": "registry.cn-shanghai.aliyuncs.com/pai-training/kmeans:v1.0.0",
"SupportedInstanceTypes": [
"ecs.g6.2xlarge"
],
"ComputeResource": {
"Policy": {
"Version": "V1",
"Value": "[{\\\"MinTotalCount\\\":1,\\\"MaxTotalCount\\\":4,\\\"RolePolicies\\\":{\\\"chief\\\":{\\\"Count\\\":1},\\\"ps\\\":{\\\"Count\\\":1},\\\"worker\\\":{\\\"Percentage\\\":100}}},{\\\"MinTotalCount\\\":5,\\\"MaxTotalCount\\\":10,\\\"RolePolicies\\\":{\\\"chief\\\":{\\\"Count\\\":1},\\\"ps\\\":{\\\"Percentage\\\":20},\\\"worker\\\":{\\\"Percentage\\\":80}}},{\\\"MinTotalCount\\\":11,\\\"RolePolicies\\\":{\\\"chief\\\":{\\\"Count\\\":1},\\\"ps\\\":{\\\"Percentage\\\":40},\\\"worker\\\":{\\\"Percentage\\\":60}}}]"
}
},
"CodeDir": {
"LocationType": "OSS",
"LocationValue": {
"test": "test",
"test2": 1
}
},
"Customization": {
"CodeDir": true
},
"ResourceRequirements": [
{
"Key": "SupportedMachineTypes",
"Operator": "in",
"Values": [
"ecs.gn5-c4g1.xlarge"
]
}
],
"ProgressDefinitions": {
"OverallProgress": {
"Description": "training progress",
"Regex": "^[0-9]+([.][0-9]+){0,1}$"
},
"RemainingTime": {
"Description": "training remaining time",
"Regex": "^[0-9]+([.][0-9]+){0,1}$"
}
}
},
"IsTempAlgo": true,
"LatestProgress": {
"OverallProgress": {
"Timestamp": "2023-07-04T13:20:18Z",
"Value": 0.75
},
"RemainingTime": {
"Timestamp": "2023-07-04T13:20:18Z",
"Value": 3600
}
},
"UserVpc": {
"VpcId": "vpc-abcdef****",
"SwitchId": "vs-abcdef****",
"SecurityGroupId": "sg-abcdef****",
"ExtendedCIDRs": [
"192.168.0.1/24"
]
},
"OutputModel": {
"Uri": "oss://test-bucket.oss-cn-hangzhou-internal.aliyuncs.com/path/to/model/output/",
"OutputChannelName": "model"
},
"Settings": {
"BusinessUserId": 166924,
"Caller": "SilkFlow",
"Tags": {
"key": ""
},
"PipelineId": "pid-123456",
"EnableTideResource": true,
"EnableErrorMonitoringInAIMaster": false,
"ErrorMonitoringArgs": "--enable-log-hang-detection true",
"EnableRDMA": true,
"EnableOssAppend": true,
"OversoldType": "AcceptQuotaOverSold",
"AdvancedSettings": {
"test": "test",
"test2": 1
},
"Driver": "535.54.03",
"EnableSanityCheck": true,
"SanityCheckArgs": "--sanity-check-timing=AfterJobFaultTolerant --sanity-check-timeout-ops=MarkJobFai",
"JobReservedMinutes": 30,
"JobReservedPolicy": "Always"
},
"ExperimentConfig": {
"ExperimentId": "exp-ds9aefia90v",
"ExperimentName": "large_language_model_train"
},
"Duration": 7200,
"Environments": {
"key": "debug=true"
},
"PythonRequirements": [
"torch>=2.2.0"
],
"Priority": 0,
"AssignNodeSpec": {
"EnableAssignNode": true,
"NodeNames": "",
"AntiAffinityNodeNames": ""
}
}
错误码
访问错误中心查看更多错误码。
变更历史
| 变更时间 | 变更内容概要 | 操作 |
|---|---|---|
| 2025-11-25 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-11-18 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-10-17 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-08-07 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-07-19 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-05-09 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-03-14 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2024-02-29 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-12-26 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-12-07 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-09-01 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-08-15 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-05-24 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2023-04-20 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2022-09-14 | OpenAPI 返回结构发生变更 | 查看变更详情 |
| 2022-09-14 | OpenAPI 返回结构发生变更 | 查看变更详情 |
