hive学习笔记
1.简单的 wordcount
-- Word count over t2.sentence.
-- split() breaks each sentence on a single space and explode() emits one row
-- per resulting element; the outer query counts the rows for each word.
with words as (
    select explode(split(sentence, ' ')) as word
    from t2
)
select
    word,
    count(1)
from words
group by word;
将t2表中的sentence列按空格拆分成单词,统计每个单词出现的次数
-- Word count over t2.sentence, most frequent words first.
-- The CTE yields one row per space-separated token; the outer query counts
-- rows per word as n and sorts by n in descending order.
with words as (
    select explode(split(sentence, ' ')) as word
    from t2
)
select
    word,
    count(1) as n
from words
group by word
order by n desc;
按单词出现次数(n)降序排列;order by是全局排序,只会产生一个reduce
2.建表,内部表,外部表
-- Managed (internal) table t3 with a single string column, partitioned by dt.
-- The field delimiter is '\n'; presumably this keeps each input line whole in
-- `sentence` -- confirm against the data being loaded.
create table t3 (
    sentence string
)
partitioned by (dt string)
row format delimited fields terminated by '\n';
-- Load a file from the local filesystem into managed table t3.
-- t3 is partitioned by dt, so the target partition is named explicitly
-- (Hive releases before 3.0 reject a load into a partitioned table without it).
-- '201912' is an example value; replace it with the partition you want to fill.
load data local inpath '本地路径' into table t3 partition (dt='201912');
-- External table t2 defined over the HDFS directory /file.
-- No data is copied: the table points at that location and the files under it
-- are read as text, one `sentence` value per line.
create external table t2 (
    sentence string
)
row format delimited fields terminated by '\n'
stored as textfile
location '/file';
查看表中的分区
-- List the partitions of a table; `tablename` is a placeholder for the real table name.
show partitions tablename;
插入分区数据
-- Fill partition dt='201911' of t3 with 100 rows taken from t2.
-- `insert overwrite` replaces whatever the partition already holds.
-- No ordering is specified, so which 100 rows are picked is not defined.
insert overwrite table t3 partition (dt='201911')
select sentence
from t2
limit 100;
分区筛选数据
-- Partition filter: return the rows of t3 whose dt partition lies between
-- '201911' and '201912', both ends included.
-- dt is a string column, so the bounds are compared as strings.
select
    sentence,
    dt
from t3
where dt between '201911' and '201912';
表的分桶,建立4个桶的表
-- Bucketed table t1: rows are distributed into 4 buckets by user_id.
-- hive.enforce.bucketing is switched on first so that later inserts honour the
-- declared bucket count (NOTE(review): recent Hive releases reportedly always
-- enforce bucketing and no longer need this setting -- confirm for the version in use).
set hive.enforce.bucketing = true;
create table t1 (
    user_id int,
    item_id string,
    rating string
)
clustered by (user_id)
into 4 buckets;
分桶取样1/4
-- Bucket sampling: read bucket 1 of 4 (bucketed on user_id) from t1,
-- i.e. roughly a quarter of the table when user_id values hash evenly.
select
    user_id,
    item_id,
    rating
from t1 tablesample (bucket 1 out of 4 on user_id);
桶中取样建表t2
-- Materialise the bucket sample (bucket 1 of 4 on user_id) of t1 as table t2.
-- NOTE(review): an external table named t2 is created earlier in these notes;
-- this statement fails while that table still exists -- drop it or pick another name.
create table t2 as
select
    user_id,
    item_id,
    rating
from t1 tablesample (bucket 1 out of 4 on user_id);