本文为您介绍如何进行PyODPS的列运算。
前提条件
请提前完成如下操作:
在DataWorks上完成业务流程创建,本例使用DataWorks简单模式。详情请参见创建业务流程。
操作步骤
创建表并导入数据。
下载鸢尾花数据集iris.data,重命名为iris.csv。
创建表pyodps_iris并上传数据集iris.csv。操作方法请参见建表并上传数据。
建表语句如下。
CREATE TABLE if not exists pyodps_iris ( sepallength DOUBLE comment '片长度(cm)', sepalwidth DOUBLE comment '片宽度(cm)', petallength DOUBLE comment '瓣长度(cm)', petalwidth DOUBLE comment '瓣宽度(cm)', name STRING comment '种类' );
- 登录DataWorks控制台。
在左侧导航栏上单击工作空间列表。
选择操作列中的 。
在数据开发页面,右键单击已经创建的业务流程,选择 。
在新建节点对话框,输入节点名称,并单击确认。
- 在PyODPS节点输入示例代码。
示例代码如下。
from odps import DataFrame import numpy as np import pandas as pd iris = DataFrame(o.get_table('pyodps_iris')) #判断是否为null。 print iris.sepallength.isnull().head(5) #逻辑判断。 print (iris.sepallength > 5).ifelse('gt5','lte5').rename('cmp5').head(5) #多条件判断。 print iris.sepallength.switch(4.9,'eq4.9',5.0,'eq5.0',default='noeq').rename('equalness').head(5) from odps.df import switch print switch(iris.sepallength == 4.9,'eq4.9',iris.sepallength == 5.0,'eq5.0',default='noeq').rename('equalness').head(5) #修该数据集某一列的一部分值。 iris[iris.sepallength > 5,'cmp5'] = 'gt5' iris[iris.sepallength <=5,'cmp5'] = 'lte5' print iris.head(5) #数学计算。 print (iris.sepallength * 10).log().head(5) fields = [iris.sepallength,(iris.sepallength /2).rename('sepallength/2'),(iris.sepallength ** 2).rename('sepallength的平方')] print iris[fields].head(5) print (iris.sepallength < 5).head(5) #集合类型相关操作。 data = {'id': [1,2], 'a': [['a1','b1'],['c1']], 'b': [{'a2': 0, 'b2': 1, 'c2': 2},{'d2': 3, 'e2': 4}]} df = pd.DataFrame(data) print df df1 = DataFrame(df, unknown_as_string=True, as_type={'a': 'list<string>','b' : 'dict<string,int64>'}) print df1.dtypes print df1.head() print df1[df1.id,df1.a[0],df1.b.len()].head() print df1.a.explode().head() print df1.a.explode(pos=True).head() print df1.b.explode().head() print df1.b.explode(['key','value']).head() #explode与其他并列多行输出。 print df1[df1.id,df1.a.explode()].head() #isin,notin,cut。 #isin用于判断Sequence里的元素是否在某个集合元素里;notin反之。 print iris.sepallength.isin([4.9,5.1]).rename('sepallength').head() #cut提供离散化的操作,可以将Sequence的数据拆成几个区段。 print iris.sepallength.cut(range(6),labels=['0-1','1-2','2-3','3-4','4-5']).rename('sepallength_cut').head(5) #include_under和include_over可以分别包括向下和向上的区间。 labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-'] iris.sepallength.cut(range(6), labels=labels, include_over=True).rename('sepallength_cut').head(5)
- 单击运行。
- 在运行日志中查看结果。
运行结果如下。
Executing user script with PyODPS 0.8.0 Sql compiled: CREATE TABLE tmp_pyodps_32e39a9a_8ae6_4815_b465_4b9dfa3bf1b9 LIFECYCLE 1 AS SELECT t1.`sepallength` IS NULL AS `sepallength` FROM DQC_0221_dev.`pyodps_iris` t1 sepallength 0 False 1 False 2 False 3 False 4 False Sql compiled: CREATE TABLE tmp_pyodps_d684aded_8ad8_4119_8558_1fddd774dd3f LIFECYCLE 1 AS SELECT IF(t1.`sepallength` > 5, 'gt5', 'lte5') AS `cmp5` FROM DQC_0221_dev.`pyodps_iris` t1 cmp5 0 lte5 1 lte5 2 lte5 3 lte5 4 gt5 Sql compiled: CREATE TABLE tmp_pyodps_cfc5bdb9_6d3e_4711_8acc_913b29b96c9a LIFECYCLE 1 AS SELECT CASE t1.`sepallength` WHEN 4.9 THEN 'eq4.9' WHEN 5.0 THEN 'eq5.0' ELSE 'noeq' END AS `equalness` FROM DQC_0221_dev.`pyodps_iris` t1 equalness 0 eq4.9 1 noeq 2 noeq 3 eq5.0 4 noeq Sql compiled: CREATE TABLE tmp_pyodps_d1fd70b4_2ad8_4dc9_838e_f566f7f5841e LIFECYCLE 1 AS SELECT CASE WHEN t1.`sepallength` == 4.9 THEN 'eq4.9' WHEN t1.`sepallength` == 5.0 THEN 'eq5.0' ELSE 'noeq' END AS `equalness` FROM DQC_0221_dev.`pyodps_iris` t1 equalness 0 eq4.9 1 noeq 2 noeq 3 eq5.0 4 noeq Sql compiled: CREATE TABLE tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553 LIFECYCLE 1 AS SELECT t1.`sepallength`, t1.`sepalwidth`, t1.`petallength`, t1.`petalwidth`, t1.`name`, IF(t1.`sepallength` <= 5, 'lte5', IF(t1.`sepallength` > 5, 'gt5', CAST(NULL AS STRING))) AS `cmp5` FROM DQC_0221_dev.`pyodps_iris` t1 sepallength sepalwidth petallength petalwidth name cmp5 0 4.9 3.0 1.4 0.2 Iris-setosa lte5 1 4.7 3.2 1.3 0.2 Iris-setosa lte5 2 4.6 3.1 1.5 0.2 Iris-setosa lte5 3 5.0 3.6 1.4 0.2 Iris-setosa lte5 4 5.4 3.9 1.7 0.4 Iris-setosa gt5 Sql compiled: CREATE TABLE tmp_pyodps_4bfb0aa0_7ec5_4f35_9480_d36baf2b8079 LIFECYCLE 1 AS SELECT LN(t1.`sepallength` * 10) AS `sepallength` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1 sepallength 0 3.891820 1 3.850148 2 3.828641 3 3.912023 4 3.988984 Sql compiled: CREATE TABLE tmp_pyodps_deb6fa45_fbb1_41d2_8174_76d35eebea9c LIFECYCLE 1 AS SELECT t1.`sepallength`, t1.`sepallength` / 2 AS `sepallength/2`, POW(t1.`sepallength`, 2) AS `sepallength的平方` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1 sepallength sepallength/2 sepallength的平方 0 4.9 2.45 24.01 1 4.7 2.35 22.09 2 4.6 2.30 21.16 3 5.0 2.50 25.00 4 5.4 2.70 29.16 Sql compiled: CREATE TABLE tmp_pyodps_fa326546_c5e8_466f_915d_e783b7dcacdf LIFECYCLE 1 AS SELECT t1.`sepallength` < 5 AS `sepallength` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1 sepallength 0 True 1 True 2 True 3 False 4 False a b id 0 [a1, b1] {u'c2': 2, u'a2': 0, u'b2': 1} 1 1 [c1] {u'd2': 3, u'e2': 4} 2 odps.Schema { a list<string> b dict<string,int64> id int64 } a b id 0 [a1, b1] {u'c2': 2, u'a2': 0, u'b2': 1} 1 1 [c1] {u'd2': 3, u'e2': 4} 2 id a b 0 1 a1 3 1 2 c1 2 a 0 a1 1 b1 2 c1 a_pos a 0 0 a1 1 1 b1 2 0 c1 b_key b_value 0 c2 2 1 a2 0 2 b2 1 3 d2 3 4 e2 4 key value 0 c2 2 1 a2 0 2 b2 1 3 d2 3 4 e2 4 id a 0 1 a1 1 1 b1 2 2 c1 Sql compiled: CREATE TABLE tmp_pyodps_fb28d9b9_fa45_4712_a564_dd751641286e LIFECYCLE 1 AS SELECT t1.`sepallength` IN (4.9, 5.1) AS `sepallength` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1 sepallength 0 True 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 True 9 False 10 False 11 False 12 False 13 False 14 False 15 False 16 True 17 False 18 True 19 False 20 True 21 False 22 True 23 False 24 False 25 False 26 False 27 False 28 False 29 False 30 False 31 False 32 False 33 True 34 False 35 False 36 True 37 False 38 True 39 False 40 False 41 False 42 False 43 True 44 False 45 True 46 False 47 False 48 False 49 False 50 False 51 False 52 False 53 False 54 False 55 False 56 True 57 False 58 False 59 False Sql compiled: CREATE TABLE tmp_pyodps_1f85e8e1_a9f0_473f_ac0e_d526fdd80dc4 LIFECYCLE 1 AS SELECT CASE WHEN (0 < t1.`sepallength`) AND (t1.`sepallength` <= 1) THEN '0-1' WHEN (1 < t1.`sepallength`) AND (t1.`sepallength` <= 2) THEN '1-2' WHEN (2 < t1.`sepallength`) AND (t1.`sepallength` <= 3) THEN '2-3' WHEN (3 < t1.`sepallength`) AND (t1.`sepallength` <= 4) THEN '3-4' WHEN (4 < t1.`sepallength`) AND (t1.`sepallength` <= 5) THEN '4-5' END AS `sepallength_cut` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1 sepallength_cut 0 4-5 1 4-5 2 4-5 3 4-5 4 None Sql compiled: CREATE TABLE tmp_pyodps_080a677e_904b_4edc_a961_160f8c664d7f LIFECYCLE 1 AS SELECT CASE WHEN (0 < t1.`sepallength`) AND (t1.`sepallength` <= 1) THEN '0-1' WHEN (1 < t1.`sepallength`) AND (t1.`sepallength` <= 2) THEN '1-2' WHEN (2 < t1.`sepallength`) AND (t1.`sepallength` <= 3) THEN '2-3' WHEN (3 < t1.`sepallength`) AND (t1.`sepallength` <= 4) THEN '3-4' WHEN (4 < t1.`sepallength`) AND (t1.`sepallength` <= 5) THEN '4-5' WHEN 5 < t1.`sepallength` THEN '5-' END AS `sepallength_cut` FROM DQC_0221_dev.`tmp_pyodps_d9e5d64e_1f21_4ffb_993f_70d3f36b6553` t1
文档内容是否对您有帮助?