这篇文档介绍DLA支持的文件格式。

​CREATE EXTERNAL TABLE IF NOT EXISTS test_avro (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS AVRO
LOCATION 'oss://bucket001/datasets/test/test_avro/';​

ORC

​CREATE EXTERNAL TABLE test_orc (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS ORC
LOCATION 'oss://bucket001/datasets/test/test_orc';​

Parquet

​CREATE EXTERNAL TABLE test_parquet_hive_serde (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS PARQUET
LOCATION 'oss://bucket001/datasets/test/test_parquet_hive_serde';​

RcFile

​CREATE EXTERNAL TABLE test_rcfile (
    L_ORDERKEY INT,
    L_PARTKEY INT,
    L_SUPPKEY INT,
    L_LINENUMBER INT,
    L_QUANTITY DOUBLE,
    L_EXTENDEDPRICE DOUBLE,
    L_DISCOUNT DOUBLE,
    L_TAX DOUBLE,
    L_RETURNFLAG STRING,
    L_LINESTATUS STRING,
    L_SHIPDATE DATE,
    L_COMMITDATE DATE,
    L_RECEIPTDATE DATE,
    L_SHIPINSTRUCT STRING,
    L_SHIPMODE STRING,
    L_COMMENT STRING
)
STORED AS RCFILE
LOCATION 'oss://bucket001/datasets/test/test_rcfile';​

TextFile: Normal

​CREATE EXTERNAL TABLE IF NOT EXISTS test_text_null (
    URL STRING,
    TITLE STRING
)
STORED AS TEXTFILE
LOCATION 'oss://bucket001/datasets/test/test_null/textfile/';​

TextFile: OpenCSV

在使用OpenCSVSerde时需要注意以下几点:

  • 用户可以为行的字段指定字段分隔符、字段内容引用符号和转义字符,例如:WITH SERDEPROPERTIES (“separatorChar” = “,”, “quoteChar” = “`”, “escapeChar” = “\” );
  • 不支持字段内嵌入的行分割符;
  • 所有字段定义STRING类型;
  • 其他数据类型的处理,可以在SQL中使用函数进行转换。
​CREATE EXTERNAL TABLE test_opencsv (
  id STRING,
  name STRING,
  location STRING,
  create_date STRING,
  create_timestamp STRING,
  longitude STRING,
  latitude STRING
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties(
"separatorChar"=",",
"quoteChar"="\"",
"escapeChar"="\\"
)
STORED AS TEXTFILE 
LOCATION 'oss://bucket001/datasets/test/test_opencsv';​

TextFile: MultiDelimitSerDe

此文件格式的主要特点是对于TextFile里面列分隔符可以使用多个字符来做分隔。

​CREATE EXTERNAL TABLE test_csv_multidelimit (
  id STRING,
  name STRING,
  location STRING,
  create_date STRING,
  create_timestamp STRING,
  longitude STRING,
  latitude STRING
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
with serdeproperties(
"field.delim"="||"
)
STORED AS TEXTFILE 
LOCATION 'oss://bucket001/datasets/test/test_multidelimiter';​

RegexSerDe

此文件格式的主要使用场景是对一些没有固定格式、需要借助正则表达式来对数据中的字段进行扣取。

​CREATE EXTERNAL TABLE IF NOT EXISTS test_regex (
  host STRING,
  identity STRING,
  `user` STRING,
  time STRING,
  request STRING,
  status STRING,
  size INT,
  referer STRING,
  agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?"
)
STORED AS TEXTFILE
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/test_regex';​

JSON: org.apache.hadoop.hive.serde2.JsonSerDe

​CREATE EXTERNAL TABLE IF NOT EXISTS `customer_case_jiahe`.`single_latin1_broken` (
    `id` int ,
    `name` string ,
    `age` int 
)
STORED AS JSON
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/customer_case/jiahe/single_latin1_encode_broken_record.log';​

JSON: org.apache.hive.hcatalog.data.JsonSerDe

​CREATE external TABLE json_table_1 (
   docid string,
   user_1 struct<
               id:INT,
               username:string,
               name:string,
               shippingaddress:struct<
                                      address1:string,
                                      address2:string,
                                      city:string,
                                      state:string
                                      >,
               orders:array<
                            struct<
                                 itemid:INT,
                                  orderdate:string
                                  >
                              >
               >
   )
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/test/json/hcatalog_serde/table_1/test_json.json';​

JSON EsriJsonSerDe

DLA支持Esri ArcGIS的地理JSON数据文件的SerDe处理,关于这种地理JSON数据格式说明,可以参考:https://github.com/Esri/spatial-framework-for-hadoop/wiki/JSON-Formats

​CREATE EXTERNAL TABLE IF NOT EXISTS california_counties
 (
 Name string,
 BoundaryShape binary
 )
ROW FORMAT SERDE 'com.esri.hadoop.hive.serde.EsriJsonSerDe'
STORED AS INPUTFORMAT 'com.esri.json.hadoop.EnclosedEsriJsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'oss://oss-cn-beijing-for-openanalytics-test/datasets/geospatial/california-counties/'​