您可以通过Python SDK提交PAI-DLC任务。本文介绍如何通过Python SDK提交使用公共DLC资源组或专有DLC资源组的训练任务,核心步骤包括安装Python SDK、创建并提交任务。

前提条件

背景信息

关于Python SDK更多接口的说明文档,请参见API参考。

如果您需要通过界面提交PAI-DLC公共资源组或专有资源组任务,请参见提交任务(通过Training页面)。此外,您也可以登录PAI-DSW探索者版,一键体验本文涉及的操作。

步骤一:安装Python SDK

  • 安装AI工作空间SDK。
    pip install https://sdk-portal-us-prod.oss-accelerate.aliyuncs.com/downloads/u-d6f710a0-efc7-44b8-af12-fb4d8f9a9907-python-aiworkspace.zip
  • 安装DLC SDK。
    # 安装python2 sdk。
    pip install https://sdk-portal-cluster-prod.oss-cn-zhangjiakou.aliyuncs.com/downloads/u-46a0db30-e48a-4e12-8c87-b3a86321f3e2-python2-pai-dlc.zip
    # 安装python3 sdk。
    pip install https://sdk-portal-cluster-prod.oss-cn-zhangjiakou.aliyuncs.com/downloads/u-3536038a-3de7-4f2e-9379-0cb309d29355-python-pai-dlc.zip

步骤二:创建并提交任务

  • 使用公共资源组创建并提交任务。
    创建任务时,需要使用当前阿里云账号对应的AccessKey信息。关于如何获取AccessKey,请参见获取AccessKey。创建并提交任务的具体调用代码如下所示。
    #!/usr/bin/env python3
    
    from __future__ import print_function
    
    import json
    import time
    
    from alibabacloud_tea_openapi.models import Config
    
    from alibabacloud_pai_dlc20201203.client import Client as DLCClient
    from alibabacloud_pai_dlc20201203.models import (
        ListImagesRequest,
        ListCodeSourcesRequest,
        ListJobsRequest,
        ListEcsSpecsRequest,
        CreateJobRequest
    )
    
    from alibabacloud_aiworkspace20210204.client import Client as AIWorkspaceClient
    from alibabacloud_aiworkspace20210204.models import (
        ListWorkspacesRequest,
        CreateDatasetRequest,
        ListDatasetsRequest,
        ListImagesRequest,
        ListCodeSourcesRequest
    )
    
    
    def create_nas_dataset(client, region, workspace_id, name,
                           nas_id, nas_path, mount_path):
        """Create a NAS dataset and return its dataset ID.

        Args:
            client: Object exposing ``create_dataset(request)``; the
                AI workspace client in this script.
            region: Region string embedded in the NAS URI.
            workspace_id: Workspace the dataset is created in.
            name: Dataset name.
            nas_id: NAS identifier embedded in the URI.
            nas_path: Path appended to the NAS URI.
            mount_path: Value stored under ``mountPath`` in the dataset options.

        Returns:
            ``dataset_id`` taken from the response body.
        """
        nas_uri = f'nas://{nas_id}.{region}{nas_path}'
        mount_options = json.dumps({'mountPath': mount_path})
        request = CreateDatasetRequest(
            workspace_id=workspace_id,
            name=name,
            data_type='COMMON',
            data_source_type='NAS',
            property='DIRECTORY',
            uri=nas_uri,
            accessibility='PRIVATE',
            source_type='USER',
            options=mount_options,
        )
        created = client.create_dataset(request)
        return created.body.dataset_id
    
    
    def create_oss_dataset(client, region, workspace_id, name,
                           oss_bucket, oss_endpoint, oss_path, mount_path):
        """Create an OSS dataset and return its dataset ID.

        Args:
            client: Object exposing ``create_dataset(request)``; the
                AI workspace client in this script.
            region: Accepted for symmetry with ``create_nas_dataset``; it is
                not used when building the OSS URI.
            workspace_id: Workspace the dataset is created in.
            name: Dataset name.
            oss_bucket: Bucket name embedded in the OSS URI.
            oss_endpoint: Endpoint embedded in the OSS URI.
            oss_path: Path appended to the OSS URI.
            mount_path: Value stored under ``mountPath`` in the dataset options.

        Returns:
            ``dataset_id`` taken from the response body.
        """
        oss_uri = f'oss://{oss_bucket}.{oss_endpoint}{oss_path}'
        mount_options = json.dumps({'mountPath': mount_path})
        request = CreateDatasetRequest(
            workspace_id=workspace_id,
            name=name,
            data_type='COMMON',
            data_source_type='OSS',
            property='DIRECTORY',
            uri=oss_uri,
            accessibility='PRIVATE',
            source_type='USER',
            options=mount_options,
        )
        created = client.create_dataset(request)
        return created.body.dataset_id
    
    
    
    def wait_for_job_to_terminate(client, job_id, timeout=None, poll_interval=5):
        """Poll a job until it reaches a terminal status.

        Args:
            client: Object exposing ``get_job(job_id)`` whose response has a
                ``body.status`` attribute (the DLC client in this script).
            job_id: ID of the job to poll.
            timeout: Optional maximum number of seconds to wait. ``None``
                (the default) waits indefinitely, matching the previous
                behavior.
            poll_interval: Seconds to sleep between two polls.

        Returns:
            The terminal status string: 'Succeeded', 'Failed' or 'Stopped'.

        Raises:
            TimeoutError: If ``timeout`` is set and elapses before the job
                reaches a terminal status.
        """
        terminal_statuses = ('Succeeded', 'Failed', 'Stopped')
        # A monotonic clock keeps the deadline immune to wall-clock changes.
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            job = client.get_job(job_id).body
            print('job is {}'.format(job.status))
            if job.status in terminal_statuses:
                return job.status
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    'job {} did not terminate within {} seconds'.format(
                        job_id, timeout))
            time.sleep(poll_interval)
    
    
    def main():
        """Submit a sample TFJob using the public resource group, then list jobs.

        The steps are: create the AI workspace and DLC clients, look up the
        workspace, pick an image, find or create the NAS dataset, list code
        sources and ECS specs, create the job, wait for it to terminate and
        finally list the jobs of the workspace.

        Raises:
            RuntimeError: If no workspace, image or ECS spec is returned.
        """
        region_id = 'cn-beijing'
        # Placeholders: replace with the AccessKey pair of your own account.
        access_key_id = '**'
        access_key_secret = '**'

        # 1. Create the clients.
        workspace_client = AIWorkspaceClient(
            Config(access_key_id=access_key_id,
                   access_key_secret=access_key_secret,
                   region_id=region_id,
                   endpoint='aiworkspace.{}.aliyuncs.com'.format(region_id)))

        dlc_client = DLCClient(
            Config(access_key_id=access_key_id,
                   access_key_secret=access_key_secret,
                   region_id=region_id,
                   endpoint='pai-dlc.{}.aliyuncs.com'.format(region_id)))

        print('------- Workspaces -----------')
        # Look up the workspace by name.
        workspaces = workspace_client.list_workspaces(ListWorkspacesRequest(
            page_number=1, page_size=1, workspace_name='dlc03',
            module_list='PAI'
        ))
        for workspace in workspaces.body.workspaces:
            print(workspace.workspace_name, workspace.workspace_id,
                  workspace.status, workspace.creator)

        if not workspaces.body.workspaces:
            raise RuntimeError('found no workspaces')

        workspace_id = workspaces.body.workspaces[0].workspace_id

        print('------- Images ------------')
        # List the images matching the requested labels.
        images = workspace_client.list_images(ListImagesRequest(
            labels=','.join(['system.supported.dlc=true',
                             'system.framework=Tensorflow 1.15',
                             'system.pythonVersion=3.6',
                             'system.chipType=CPU'])))
        for image in images.body.images:
            print(json.dumps(image.to_map(), indent=2))

        # Fail with a clear message instead of an IndexError on images[0].
        if not images.body.images:
            raise RuntimeError('found no images')

        image_uri = images.body.images[0].image_uri

        print('------- Datasets ----------')
        # Look up the dataset by name.
        datasets = workspace_client.list_datasets(ListDatasetsRequest(
            workspace_id=workspace_id,
            name='example-nas-data', properties='DIRECTORY'))
        for dataset in datasets.body.datasets:
            print(dataset.name, dataset.dataset_id, dataset.uri, dataset.options)

        if not datasets.body.datasets:
            # The dataset does not exist yet, so create it.
            dataset_id = create_nas_dataset(
                client=workspace_client,
                region=region_id,
                workspace_id=workspace_id,
                name='example-nas-data',
                nas_id='**',
                nas_path='/',
                mount_path='/mnt/data/example-nas')
            print('create dataset with id: {}'.format(dataset_id))
        else:
            dataset_id = datasets.body.datasets[0].dataset_id

        print('------- Code Sources ----------')
        # List the code sources of the workspace.
        code_sources = workspace_client.list_code_sources(ListCodeSourcesRequest(
            workspace_id=workspace_id))
        for code_source in code_sources.body.code_sources:
            print(code_source.display_name, code_source.code_source_id,
                  code_source.code_repo)

        print('-------- ECS SPECS ----------')
        # List the DLC node specs, sorted by memory in ascending order.
        ecs_specs = dlc_client.list_ecs_specs(
            ListEcsSpecsRequest(page_size=100, sort_by='Memory', order='asc'))
        for spec in ecs_specs.body.ecs_specs:
            # The original printed spec.memory twice; the GPU count was intended.
            print(spec.instance_type, spec.cpu, spec.memory, spec.gpu,
                  spec.gpu_type)

        # Fail with a clear message instead of an IndexError on ecs_specs[0].
        if not ecs_specs.body.ecs_specs:
            raise RuntimeError('found no ecs specs')

        print('-------- Create Job ----------')
        # Create the DLC job on the first spec of the memory-ascending list.
        create_job_resp = dlc_client.create_job(CreateJobRequest().from_map({
            'WorkspaceId': workspace_id,
            'DisplayName': 'sample-dlc-job',
            'JobType': 'TFJob',
            'JobSpecs': [
                {
                    "Type": "Worker",
                    "Image": image_uri,
                    "PodCount": 1,
                    "EcsSpec": ecs_specs.body.ecs_specs[0].instance_type,
                    "UseSpotInstance": False,
                },
            ],
            "UserCommand": "echo 'Hello World' && ls -R /mnt/data/ && sleep 30 && echo 'DONE'",
            'DataSources': [
                {
                    "DataSourceId": dataset_id,
                },
            ],
        }))
        job_id = create_job_resp.body.job_id

        wait_for_job_to_terminate(dlc_client, job_id)

        print('-------- List Jobs ----------')
        # List the DLC jobs of the workspace.
        jobs = dlc_client.list_jobs(ListJobsRequest(
            workspace_id=workspace_id,
            page_number=1,
            page_size=10,
        ))
        for job in jobs.body.jobs:
            print(job.display_name, job.job_id, job.workspace_name,
                  job.status, job.job_type)
    
    
    # Run the sample only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        main()
                    
  • 使用专有资源组创建并提交任务。
    1. 登录PAI控制台。
    2. 按照下图操作指引,在工作空间列表页面查看您所在的工作空间ID。(图:查询工作空间ID)
    3. 按照下图操作指引,在PAI-DLC页面查看您的专有资源组的资源组ID。(图:专有资源组ID)
    4. 使用以下代码创建并提交任务。
      from alibabacloud_pai_dlc20201203.client import Client
      from alibabacloud_tea_openapi.models import Config
      from alibabacloud_pai_dlc20201203.models import (
          CreateJobRequest,
          JobSpec,
          ResourceConfig,
      )

      # Initialize a client for the DLC API.
      region = 'cn-beijing'
      client = Client(
          Config(
              access_key_id='<替换成您自己的AccessKey>',
              access_key_secret='<替换成您自己的AccessKeySecret>',
              region_id=region,
              endpoint=f'pai-dlc.{region}.aliyuncs.com'
          )
      )

      # Declare the resources of the job. Pick an image from the public image
      # list in the documentation, or pass the address of your own image.
      spec = JobSpec(
          type='Worker',
          image='registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/tensorflow-training:1.15-cpu-py36-ubuntu18.04',
          pod_count=1,
          # ResourceConfig must be imported above; the original omitted it.
          resource_config=ResourceConfig(cpu='1', memory='2Gi')
      )

      # Declare what the job runs.
      req = CreateJobRequest(
          resource_id='<替换成您自己的资源组ID>',
          workspace_id='<替换成您自己的WorkspaceID>',
          display_name='sample-dlc-job',
          job_type='TFJob',
          job_specs=[spec],
          user_command='echo "Hello World"'
      )

      # Submit the job.
      response = client.create_job(req)
      # Read the job ID from the response.
      job_id = response.body.job_id

      # Query the job status.
      job = client.get_job(job_id).body
      print('job status:', job.status)

      # Show the command the job executes; a bare expression prints nothing
      # when run as a script.
      print('user command:', job.user_command)