使用ANALYZE命令收集表数据统计信息以优化查询性能-云原生大数据计算服务 MaxCompute-阿里云-云原生大数据计算服务 MaxCompute(MaxCompute)-阿里云帮助中心

ANALYZE命令用于收集表数据统计信息来获取元数据，为优化查询性能提供帮助。

命令格式

非分区表。

analyze table <table_name> compute statistics for columns [(<col_name> [,<col_name> ...])];

分区表。

analyze table <table_name> partition(<pt_spec>) compute statistics for columns (<column_name>[, <column_name>]);

您也可以根据需要使用如下删除指定列或所有列的统计值：

删所有列级统计值：

ANALYZE TABLE <table_name> DELETE STATISTICS FOR COLUMNS;

删指定列级统计值：

ANALYZE TABLE <table_name> DELETE STATISTICS FOR COLUMNS (column_name>[, <column_name>]);

参数说明

table_name：表名称。
column_name：指定收集统计信息的列名称。
pt_spec：指定收集统计信息分区表的分区信息。

使用示例

对非分区表的指定列或全部列收集统计信息。

-- 创建一张非分区表analyze2_test
create table if not exists analyze2_test
 (tinyint1 tinyint,
  smallint1 smallint,
  int1 int,
  bigint1 bigint,
  double1 double,
  decimal1 decimal,
  decimal2 decimal(20,10),
  string1 string,
  varchar1 varchar(10),
  boolean1 boolean,
  timestamp1 timestamp,
  datetime1 datetime )
 lifecycle 30;
--向表中插入数据
insert overwrite table analyze2_test select * from values 
(1Y, 20S, 4, 8L, 123452.3, 12.4, 52.5, 'str1', 'str21', false, timestamp '2018-09-17 00:00:00', datetime '2018-09-17 00:59:59'),
(10Y, 2S, 7, 11111118L, 67892.3, 22.4, 42.5, 'str12', 'str200', true, timestamp '2018-09-17 00:00:00', datetime '2018-09-16 00:59:59'),
(20Y, 7S, 4, 2222228L, 12.3, 2.4, 2.57, 'str123', 'str2', false, timestamp '2018-09-18 00:00:00', datetime '2018-09-17 00:59:59'),
(null, null, null, null, null, null, null, null, null, null, null , null) 
as t(tinyint1, smallint1, int1, bigint1, double1, decimal1, decimal2, string1, varchar1, boolean1, timestamp1, datetime1);

--收集tinyint1列的统计信息
analyze table analyze2_test compute statistics for columns (tinyint1); 

--收集smallint1、string1、boolean1和timestamp1列的统计信息
analyze table analyze2_test compute statistics for columns (smallint1, string1, boolean1, timestamp1);

--收集全部列的统计信息
analyze table analyze2_test compute statistics for columns;

执行show statistic命令测试统计信息收集结果，命令示例如下。

--测试tinyint1列的统计信息收集结果。
show statistic analyze2_test columns (tinyint1);

--测试smallint1、string1、boolean1和timestamp1列的统计信息收集结果。
show statistic analyze2_test columns (smallint1, string1, boolean1, timestamp1);

--测试全部列的统计信息收集结果。
show statistic analyze2_test columns;

返回结果如下。

--tinyint1列的统计信息收集结果。
ID = 20201126085225150gnqo****
tinyint1:MaxValue:      20                   --对应max。
tinyint1:DistinctNum:   4.0                  --对应ndv。
tinyint1:MinValue:      1                    --对应min。
tinyint1:NullNum:       1.0                  --对应nNulls。
tinyint1:TopK:  {1=1.0, 10=1.0, 20=1.0}      --对应topK。10=1.0表示列值10出现的频次为1。topK最多显示前20个最高频次的值。

--smallint1、string1、boolean1和timestamp1列的统计信息收集结果。
ID = 20201126091636149gxgf****
smallint1:MaxValue:     20
smallint1:DistinctNum:  4.0
smallint1:MinValue:     2
smallint1:NullNum:      1.0
smallint1:TopK:         {2=1.0, 7=1.0, 20=1.0}

string1:MaxLength       6.0                  --对应maxColLen。
string1:AvgLength:      3.0                  --对应avgColLen。
string1:DistinctNum:    4.0
string1:NullNum:        1.0
string1:TopK:   {str1=1.0, str12=1.0, str123=1.0}

boolean1:DistinctNum:   3.0
boolean1:NullNum:       1.0
boolean1:TopK:  {false=2.0, true=1.0}

timestamp1:DistinctNum:         3.0
timestamp1:NullNum:     1.0
timestamp1:TopK:        {2018-09-17 00:00:00.0=2.0, 2018-09-18 00:00:00.0=1.0}

--全部列的统计信息收集结果。
ID = 20201126092022636gzm1****
tinyint1:MaxValue:      20
tinyint1:DistinctNum:   4.0
tinyint1:MinValue:      1
tinyint1:NullNum:       1.0
tinyint1:TopK:  {1=1.0, 10=1.0, 20=1.0}

smallint1:MaxValue:     20
smallint1:DistinctNum:  4.0
smallint1:MinValue:     2
smallint1:NullNum:      1.0
smallint1:TopK:         {2=1.0, 7=1.0, 20=1.0}

int1:MaxValue:  7
int1:DistinctNum:       3.0
int1:MinValue:  4
int1:NullNum:   1.0
int1:TopK:      {4=2.0, 7=1.0}

bigint1:MaxValue:       11111118
bigint1:DistinctNum:    4.0
bigint1:MinValue:       8
bigint1:NullNum:        1.0
bigint1:TopK:   {8=1.0, 2222228=1.0, 11111118=1.0}

double1:MaxValue:       123452.3
double1:DistinctNum:    4.0
double1:MinValue:       12.3
double1:NullNum:        1.0
double1:TopK:   {12.3=1.0, 67892.3=1.0, 123452.3=1.0}

decimal1:MaxValue:      22.4
decimal1:DistinctNum:   4.0
decimal1:MinValue:      2.4
decimal1:NullNum:       1.0
decimal1:TopK:  {2.4=1.0, 12.4=1.0, 22.4=1.0}

decimal2:MaxValue:      52.5
decimal2:DistinctNum:   4.0
decimal2:MinValue:      2.57
decimal2:NullNum:       1.0
decimal2:TopK:  {2.57=1.0, 42.5=1.0, 52.5=1.0}

string1:MaxLength       6.0
string1:AvgLength:      3.0
string1:DistinctNum:    4.0
string1:NullNum:        1.0
string1:TopK:   {str1=1.0, str12=1.0, str123=1.0}

varchar1:MaxLength      6.0
varchar1:AvgLength:     3.0
varchar1:DistinctNum:   4.0
varchar1:NullNum:       1.0
varchar1:TopK:  {str2=1.0, str200=1.0, str21=1.0}

boolean1:DistinctNum:   3.0
boolean1:NullNum:       1.0
boolean1:TopK:  {false=2.0, true=1.0}

timestamp1:DistinctNum:         3.0
timestamp1:NullNum:     1.0
timestamp1:TopK:        {2018-09-17 00:00:00.0=2.0, 2018-09-18 00:00:00.0=1.0}

datetime1:DistinctNum:  3.0
datetime1:NullNum:      1.0
datetime1:TopK:         {1537117199000=2.0, 1537030799000=1.0}

对分区表的指定分区收集统计信息。

--创建分区表srcpart_test
create table if not exists srcpart_test 
  (key string,
    value string) 
partitioned by (ds string, hr string) 
lifecycle 30;

--向表中插入数据
insert into table srcpart_test partition(ds='20201220', hr='11') values 
('123', 'val_123'), ('76', 'val_76'), ('447', 'val_447'), ('1234', 'val_1234');
insert into table srcpart_test partition(ds='20201220', hr='12') values
 ('3', 'val_3'), ('12331', 'val_12331'), ('42', 'val_42'), ('12', 'val_12');
insert into table srcpart_test partition(ds='20201221', hr='11') values
 ('543', 'val_543'), ('2', 'val_2'), ('4', 'val_4'), ('9', 'val_9');
insert into table srcpart_test partition(ds='20201221', hr='12') values
 ('23', 'val_23'), ('56', 'val_56'), ('4111', 'val_4111'), ('12333', 'val_12333');

--收集指定分区的统计信息
analyze table srcpart_test partition(ds='20201221') compute statistics for columns (key , value);

执行show statistic命令测试统计信息收集结果，命令示例如下。

show statistic srcpart_test partition (ds='20201221') columns (key , value);

返回结果如下。

ID = 20210105121800689g28p****
(ds=20201221,hr=11) key:MaxLength       3.0
(ds=20201221,hr=11) key:AvgLength:      1.0
(ds=20201221,hr=11) key:DistinctNum:    4.0
(ds=20201221,hr=11) key:NullNum:        0.0
(ds=20201221,hr=11) key:TopK:   {2=1.0, 4=1.0, 543=1.0, 9=1.0}

(ds=20201221,hr=11) value:MaxLength     7.0
(ds=20201221,hr=11) value:AvgLength:    5.0
(ds=20201221,hr=11) value:DistinctNum:  4.0
(ds=20201221,hr=11) value:NullNum:      0.0
(ds=20201221,hr=11) value:TopK:         {val_2=1.0, val_4=1.0, val_543=1.0, val_9=1.0}

(ds=20201221,hr=12) key:MaxLength       5.0
(ds=20201221,hr=12) key:AvgLength:      3.0
(ds=20201221,hr=12) key:DistinctNum:    4.0
(ds=20201221,hr=12) key:NullNum:        0.0
(ds=20201221,hr=12) key:TopK:   {12333=1.0, 23=1.0, 4111=1.0, 56=1.0}

(ds=20201221,hr=12) value:MaxLength     9.0
(ds=20201221,hr=12) value:AvgLength:    7.0
(ds=20201221,hr=12) value:DistinctNum:  4.0
(ds=20201221,hr=12) value:NullNum:      0.0
(ds=20201221,hr=12) value:TopK:         {val_12333=1.0, val_23=1.0, val_4111=1.0, val_56=1.0}