本文介绍如何进行PyODPS的数据缩放。

前提条件

请提前开通MaxCompute和DataWorks服务,并在DataWorks完成业务流程的创建。本例使用DataWorks简单模式。

操作步骤

  1. 准备测试数据。
    1. 登录DataWorks控制台,单击数据开发 > 业务流程 > 新建表 ,以DDL模式创建表pytable。新建表单击生成表结构 > 提交到生成环境,完成创建表。
      建表语句如下。
      CREATE TABLE `pytable` (
        `name` string,
        `id` bigint,
        `fid` double
      ) ;
    2. 将以下测试数据保存为pytable.txt文件。
      name1, 4, 5.3
      name2, 2, 3.5
      name2, 3, 1.5
      name1, 4, 4.2
      name1, 3, 2.2
      name1, 3, 4.1
    3. 将pytable.txt的数据导入到表pytable中。单击表pytable,右键选择导入数据,在数据导入向导中上传pytable.txt文件完成数据导入。导入数据
  2. 新建PyODPS节点。单击业务流程 > 数据开发,右键选择数据开发节点 > PyODPS,并将节点命名为数据缩放。新建节点.png
    数据缩放的代码如下。
    #数据缩放
    df = DataFrame(o.get_table('pytable'))
    
    #使用min_max_scale方法进行归一化
    print df.min_max_scale(columns=['fid']).head()
    
    #min_max_scale还支持使用feature_range参数指定输出值的范围。例如,需要使输出值在 (-1, 1) 范围内
    print df.min_max_scale(columns=['fid'],feature_range=(-1,1)).head()
    
    #如果需要保留原始值,可以使用preserve参数。此时,缩放后的数据将会以新增列的形式追加到数据中,列名默认为原列名追加_scaled后缀,该后缀可使用suffix参数更改
    print df.min_max_scale(columns=['fid'],preserve=True).head()
    
    #使用group分组
    print df.min_max_scale(columns=['fid'],group=['name']).head()
    
    #min_max_scale也支持使用group参数指定一个或多个分组列,在分组列中分别取最值进行缩放
    print df.std_scale(columns=['fid']).head()
  3. 单击运行按钮,运行该节点。运行节点.png
  4. 运行日志中查看运行结果。
    完整的运行结果如下。
    Sql compiled:
    CREATE TABLE tmp_pyodps_59cb5533_6dc3_4b64_9312_b2da29d70cee LIFECYCLE 1 AS
    SELECT /*+mapjoin(t5)*/ t2.`name`, t2.`id`, (IF((t5.`fid_max_1570691133` - t5.`fid_min_1570691133`) == 0, 0, (t2.`fid` - t5.`fid_min_1570691133`) / (t5.`fid_max_1570691133` - t5.`fid_min_1570691133`)) * 1) + 0 AS `fid`
    FROM (
      SELECT t1.`name`, t1.`id`, t1.`fid`, 1 AS `idx_col_1570691133`
      FROM WB_BestPractice_dev.`pytable` t1
    ) t2
    INNER JOIN
      (
        SELECT 1 AS `idx_col_1570691133`, MIN(t4.`fid`) AS `fid_min_1570691133`, MAX(t4.`fid`) AS `fid_max_1570691133`
        FROM (
          SELECT t3.`fid`
          FROM WB_BestPractice_dev.`pytable` t3
        ) t4
      ) t5
    ON t2.`idx_col_1570691133` == t5.`idx_col_1570691133`
    Instance ID: 2019101007053495gl8by72m
    
        name  id       fid
    0  name1   4  1.000000
    1  name2   2  0.526316
    2  name2   3  0.000000
    3  name1   4  0.710526
    4  name1   3  0.184211
    5  name1   3  0.684211
    Sql compiled:
    CREATE TABLE tmp_pyodps_88a24967_3bdc_41ce_85d7_4bbd23e4ce01 LIFECYCLE 1 AS
    SELECT /*+mapjoin(t5)*/ t2.`name`, t2.`id`, (IF((t5.`fid_max_1570691139` - t5.`fid_min_1570691139`) == 0, 0, (t2.`fid` - t5.`fid_min_1570691139`) / (t5.`fid_max_1570691139` - t5.`fid_min_1570691139`)) * 2) + -1 AS `fid`
    FROM (
      SELECT t1.`name`, t1.`id`, t1.`fid`, 1 AS `idx_col_1570691139`
      FROM WB_BestPractice_dev.`pytable` t1
    ) t2
    INNER JOIN
      (
        SELECT 1 AS `idx_col_1570691139`, MIN(t4.`fid`) AS `fid_min_1570691139`, MAX(t4.`fid`) AS `fid_max_1570691139`
        FROM (
          SELECT t3.`fid`
          FROM WB_BestPractice_dev.`pytable` t3
        ) t4
      ) t5
    ON t2.`idx_col_1570691139` == t5.`idx_col_1570691139`
    Instance ID: 20191010070539772gjo56292
    
        name  id       fid
    0  name1   4  1.000000
    1  name2   2  0.052632
    2  name2   3 -1.000000
    3  name1   4  0.421053
    4  name1   3 -0.631579
    5  name1   3  0.368421
    Sql compiled:
    CREATE TABLE tmp_pyodps_439117fc_9ef7_4086_899d_a5bf77d653e5 LIFECYCLE 1 AS
    SELECT /*+mapjoin(t5)*/ t2.`name`, t2.`id`, t2.`fid`, (IF((t5.`fid_max_1570691146` - t5.`fid_min_1570691146`) == 0, 0, (t2.`fid` - t5.`fid_min_1570691146`) / (t5.`fid_max_1570691146` - t5.`fid_min_1570691146`)) * 1) + 0 AS `fid_scaled`
    FROM (
      SELECT t1.`name`, t1.`id`, t1.`fid`, 1 AS `idx_col_1570691146`
      FROM WB_BestPractice_dev.`pytable` t1
    ) t2
    INNER JOIN
      (
        SELECT 1 AS `idx_col_1570691146`, MIN(t4.`fid`) AS `fid_min_1570691146`, MAX(t4.`fid`) AS `fid_max_1570691146`
        FROM (
          SELECT t3.`fid`
          FROM WB_BestPractice_dev.`pytable` t3
        ) t4
      ) t5
    ON t2.`idx_col_1570691146` == t5.`idx_col_1570691146`
    
    Instance ID: 20191010070546769g0c14f72
    
        name  id  fid  fid_scaled
    0  name1   4  5.3    1.000000
    1  name2   2  3.5    0.526316
    2  name2   3  1.5    0.000000
    3  name1   4  4.2    0.710526
    4  name1   3  2.2    0.184211
    5  name1   3  4.1    0.684211
    Sql compiled:
    CREATE TABLE tmp_pyodps_d3839b4b_1087_4d52_91f5_52763b72f272 LIFECYCLE 1 AS
    SELECT /*+mapjoin(t3)*/ t1.`name`, t1.`id`, (IF((t3.`fid_max_1570691151` - t3.`fid_min_1570691151`) == 0, 0, (t1.`fid` - t3.`fid_min_1570691151`) / (t3.`fid_max_1570691151` - t3.`fid_min_1570691151`)) * 1) + 0 AS `fid`
    FROM WB_BestPractice_dev.`pytable` t1
    INNER JOIN
      (
        SELECT t2.`name`, MAX(t2.`fid`) AS `fid_max_1570691151`, MIN(t2.`fid`) AS `fid_min_1570691151`
        FROM WB_BestPractice_dev.`pytable` t2
        GROUP BY t2.`name`
      ) t3
    ON t1.`name` == t3.`name`
    Instance ID: 20191010070551756gtf14f72
    
    
        name  id       fid
    0  name1   4  1.000000
    1  name2   2  1.000000
    2  name2   3  0.000000
    3  name1   4  0.645161
    4  name1   3  0.000000
    5  name1   3  0.612903
    Sql compiled:
    CREATE TABLE tmp_pyodps_1ea6e5b4_129f_4d1e_a6a7_410e08d77ae6 LIFECYCLE 1 AS
    SELECT /*+mapjoin(t5)*/ t2.`name`, t2.`id`, IF(t5.`fid_std_1570691157` == 0, 0, (t2.`fid` - t5.`fid_mean_1570691157`) / t5.`fid_std_1570691157`) AS `fid`
    FROM (
      SELECT t1.`name`, t1.`id`, t1.`fid`, 1 AS `idx_col_1570691157`
      FROM WB_BestPractice_dev.`pytable` t1
    ) t2
    INNER JOIN
      (
        SELECT 1 AS `idx_col_1570691157`, AVG(t4.`fid`) AS `fid_mean_1570691157`, STDDEV(t4.`fid`) AS `fid_std_1570691157`
        FROM (
          SELECT t3.`fid`
          FROM WB_BestPractice_dev.`pytable` t3
        ) t4
      ) t5
    ON t2.`idx_col_1570691157` == t5.`idx_col_1570691157`
    Instance ID: 20191010070557788gdl14f72
    
        name  id       fid
    0  name1   4  1.436467
    1  name2   2  0.026118
    2  name2   3 -1.540938
    3  name1   4  0.574587
    4  name1   3 -0.992468
    5  name1   3  0.496234