HDFS 存储格式测试对比
数据格式
存储格式 | TEXTFILE | SequenceFile | RCfile | Avro | Parquet | ORC |
---|---|---|---|---|---|---|
数据大小 | 65G | 67G | 61.5G | 68.2G | 28.9G | 8.3G |
导入 Hive 耗时 | 286.319 seconds | 118.45 seconds | 106.212 seconds | 163.988 seconds | 136.663 seconds | 130.186 seconds |
-- Delimited-text table: one record per line, fields separated by tabs.
-- No COLLECTION ITEMS TERMINATED BY clause is given, so the ARRAY columns
-- use Hive's default collection-item delimiter.
CREATE TABLE IF NOT EXISTS prod_purchased_txt (
    uid                     STRING,
    event_time              BIGINT,
    touch_point_id          STRING,
    et_city                 STRING,
    et_city_tier            STRING,
    et_has_gifts            BOOLEAN,
    et_has_points           BOOLEAN,
    et_order_discount       FLOAT,
    et_order_list_value     FLOAT,
    et_order_quantity       FLOAT,
    et_order_type           STRING,
    et_order_value          FLOAT,
    et_pay_mode             STRING,
    -- NOTE(review): the *_list columns are presumably parallel per-product
    -- arrays (same length, same ordering) -- confirm against the data.
    et_prod_brand_list      ARRAY<STRING>,
    et_prod_cate_name_list  ARRAY<STRING>,
    et_prod_id_list         ARRAY<STRING>,
    et_prod_list_price_list ARRAY<FLOAT>,
    et_prod_name_list       ARRAY<STRING>,
    et_prod_quantity_list   ARRAY<FLOAT>,
    et_province             STRING,
    et_purchase_date_type   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- ORC-backed table with the same 21 data columns as prod_purchased_txt,
-- plus a process_date partition column.
-- Fix: the statement keyword was misspelled ("REATE"), so it could not parse.
CREATE TABLE IF NOT EXISTS prod_purchased_orc (
    uid                     STRING,
    event_time              BIGINT,
    touch_point_id          STRING,
    et_city                 STRING,
    et_city_tier            STRING,
    et_has_gifts            BOOLEAN,
    et_has_points           BOOLEAN,
    et_order_discount       FLOAT,
    et_order_list_value     FLOAT,
    et_order_quantity       FLOAT,
    et_order_type           STRING,
    et_order_value          FLOAT,
    et_pay_mode             STRING,
    et_prod_brand_list      ARRAY<STRING>,
    et_prod_cate_name_list  ARRAY<STRING>,
    et_prod_id_list         ARRAY<STRING>,
    et_prod_list_price_list ARRAY<FLOAT>,
    et_prod_name_list       ARRAY<STRING>,
    et_prod_quantity_list   ARRAY<FLOAT>,
    et_province             STRING,
    et_purchase_date_type   STRING
)
PARTITIONED BY (process_date STRING)
-- NOTE(review): ORC is a binary columnar format, so the text delimiters below
-- presumably have no effect on how rows are stored. They are kept unchanged
-- here; confirm on the target Hive version before removing them.
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS ORC;
======================================
-- Storage-format clauses, one per format compared in these notes.
-- These are fragments, not standalone statements. Presumably each one is
-- substituted for the final STORED AS line of a CREATE TABLE statement to
-- build the table for that format -- confirm against the scripts actually run.
STORED AS SequenceFile;
STORED AS RCfile ;
STORED AS Avro ;
STORED AS parquetfile ;
STORED AS ORC ;
只有 TEXTFILE 表可以用 load data 直接导入文本文件（load 只是复制/移动文件，不做格式转换）；其他格式的表可以通过 insert into ... select 从文本表导入数据
-- Load the local text file into the delimited-text table.
LOAD DATA LOCAL INPATH '/home/hadoop/prod_purchased.txt'
INTO TABLE prod_purchased_txt;
-- Copy every row of prod_purchased_txt into the '2019-06-01' partition of
-- prod_purchased_sq (presumably the SequenceFile variant; its DDL is not
-- shown in these notes).
-- The columns are listed explicitly, in the declaration order of
-- prod_purchased_txt, instead of SELECT *. The same values are written, but
-- the statement no longer changes meaning if a column is later added to or
-- reordered in the source table.
INSERT INTO TABLE prod_purchased_sq PARTITION (process_date = '2019-06-01')
SELECT
    uid,
    event_time,
    touch_point_id,
    et_city,
    et_city_tier,
    et_has_gifts,
    et_has_points,
    et_order_discount,
    et_order_list_value,
    et_order_quantity,
    et_order_type,
    et_order_value,
    et_pay_mode,
    et_prod_brand_list,
    et_prod_cate_name_list,
    et_prod_id_list,
    et_prod_list_price_list,
    et_prod_name_list,
    et_prod_quantity_list,
    et_province,
    et_purchase_date_type
FROM prod_purchased_txt;