hive学习笔记1

hive学习笔记
1.简单的 wordcount

-- Word count: split every sentence of t2 on single spaces, emit one row per
-- resulting token, and count how many times each token occurs.
select
    word,
    count(1)
from t2
lateral view explode(split(sentence, ' ')) t as word
group by word;

将 t2 表中的 sentence 列按空格切分成单词,并统计每个单词出现的次数

-- Word count sorted by frequency, most frequent word first.
with words as (
    -- one row per space-separated token of t2.sentence
    select explode(split(sentence, ' ')) as word
    from t2
)
select
    word,
    count(1) as n
from words
group by word
order by n desc;

按单词出现次数倒序排列;order by 是全局排序,只会使用一个 reducer

2.建表,内部表,外部表

-- Managed (internal) table with a single data column, partitioned by dt.
-- The field delimiter is set to '\n'; presumably this keeps each input line
-- intact as one `sentence` value -- confirm against the input files.
create table t3 (
    sentence string
)
partitioned by (dt string)
row format delimited fields terminated by '\n';

-- Load a file from the local filesystem into the managed table.
-- '本地路径' is a placeholder for the real local path.
-- t3 is partitioned, so the target partition is named explicitly; adjust the
-- dt value to the partition the file belongs to.
load data local inpath '本地路径' into table t3 partition (dt = '201911');

-- External table over the HDFS directory /file.
-- Nothing is copied or imported: the table definition only points at the
-- location, and queries read the text files that are already there.
create external table t2 (
    sentence string
)
row format delimited fields terminated by '\n'
stored as textfile
location '/file';

查看表中的分区

-- List the partitions of a table; replace tablename with the real table name.
show partitions tablename;

插入分区数据

-- Replace the contents of partition dt='201911' of t3 with 100 rows taken
-- from t2. There is no ORDER BY, so which 100 rows are chosen is not
-- deterministic.
insert overwrite table t3 partition (dt = '201911')
select sentence
from t2
limit 100;

分区筛选数据

-- Rows of t3 whose partition key dt lies between '201911' and '201912',
-- both ends inclusive. dt is a string, so the comparison is lexicographic;
-- that matches chronological order only while every value uses the same
-- fixed-width yyyymm format.
select
    sentence,
    dt
from t3
where dt between '201911' and '201912';

表的分桶,建立4个桶的表

-- Ask Hive to honour the table's bucket definition when writing data.
-- NOTE(review): this property is understood to exist only in Hive 1.x
-- (newer releases always enforce bucketing) -- confirm for the target version.
set hive.enforce.bucketing = true;

-- Table whose rows are distributed into 4 buckets by user_id.
create table t1 (
    user_id int,
    item_id string,
    rating  string
)
clustered by (user_id)
into 4 buckets;

分桶取样1/4

-- Sample bucket 1 of 4 from t1, bucketing on user_id (about a quarter of
-- the rows when user_id values are evenly spread).
select
    user_id,
    item_id,
    rating
from t1 tablesample (bucket 1 out of 4 on user_id);

桶中取样建表t2

-- Create t2 from a sample of t1: bucket 1 of 4, bucketing on user_id.
-- NOTE(review): an external table named t2 is also created earlier in these
-- notes; this statement fails if t2 already exists, so drop it first or pick
-- another name.
create table t2 as
select
    user_id,
    item_id,
    rating
from t1 tablesample (bucket 1 out of 4 on user_id);
发布了3 篇原创文章 · 获赞 0 · 访问量 84

猜你喜欢

转载自blog.csdn.net/qq_38599944/article/details/85638489