1. 基础语法
- 查看分区
-- table_name is a placeholder for the partitioned table to inspect.
show partitions table_name
- 查看表结构
describe table
- 字符串转日期
cast(string_field as date)
- 字符串转时间
cast(string_field as timestamp)
2. 求累计频数
-- Running (cumulative) count of overdue records by number of overdue days.
-- The subquery already returns exactly one row per ioverduedays, so the outer
-- query only adds the running total; no second GROUP BY / nested SUM is needed.
-- The ROWS frame accumulates from the first row through the current row.
-- The final ORDER BY makes the output order deterministic; an ORDER BY inside
-- the subquery would not guarantee the order of the outer result.
SELECT ioverduedays,
       overdue_num,
       SUM(overdue_num) OVER (
           ORDER BY ioverduedays
           ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_amount
FROM (
    SELECT ioverduedays,
           count(*) AS overdue_num
    FROM parquet_myd.aa_ld_channel_feature_tmp
    WHERE overdue_status = 'yes_overdue'
      AND irepaystatus = 30
    GROUP BY ioverduedays) AS t
ORDER BY ioverduedays ASC
3. 分组排序
-- Number the rows of each user_id in ascending gmt_create order, starting at 1.
-- table_a is a placeholder table name.
SELECT
row_number() OVER (PARTITION BY user_id ORDER BY gmt_create) AS rownum
FROM table_a
4. hive调优
-- Session-level Hive settings.
-- Include column names as a header row in CLI query output.
set hive.cli.print.header=true;
-- Enable parallel execution of job stages, using up to 16 threads.
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=16;
-- Request 8 reduce tasks per job.
set mapred.reduce.tasks=8;
5. hive解析json
json:
{
"status": {
"person": {
"name": false
}
}
}
## 取出key为name对应的value
select get_json_object(content,'$.status.person.name') from test limit 1;
## 对json的要求:必须是标准json形式,且键名和字符串值都用双引号("")引起来
6. 正则抽取字符串
## para1:待解析字符串,para2:正则表达式,para3:要返回的捕获组序号(0表示整个匹配),默认为1
## 索引为1
SELECT regexp_extract('"学位":"博士","入学时间":"2018-05-20"', '"学位":"(.*)","入学(.*)"', 1)
## 索引为2
SELECT regexp_extract('"学位":"博士","入学时间":"2018-05-20"', '"学位":"(.*)","入学(.*)"', 2) -- returns: 时间":"2018-05-20
7. null转其他
## 如果为空值,则置0
SELECT NVL(field,0)
8. 时间和日期处理函数
1) 求某时间处于当年的第几月
## 衍生特征的时候总归会用到吧
SELECT month('2003-03-15 01:22:33')
2) 求某时间处于当年的第几周
SELECT weekofyear('2003-03-15 01:22:33')
3) 求某个时间处于星期几
## 注意返回值的对应关系:周日->周六 对应 1->7(基准日1900-01-07是周日)
SELECT pmod(datediff('2018-06-03','1900-01-07'),7) +1
4) 求某时间处于当月的第几天
SELECT day('2003-03-15 01:22:33')
5) 求某日期距离当月月底的天数
SELECT datediff(last_day('2018-06-01'),'2018-06-01')
6) 求某时间所在小时时刻
SELECT hour('2018-06-01 12:00:00')
9. hive 同步到impala
## 在impala中执行
invalidate metadata parquet_myd.tmp_education_user_0416
10. 指定分隔符拆分字符串
## hive
## 返回结果:["10","11","12"]
SELECT split("10,11,12",",")
## 返回结果:10
SELECT split("10,11,12",",")[0]
## impala中
## 返回结果:10(split_part的序号从1开始,而hive数组下标从0开始)
SELECT split_part("10.11.12",".",1)
11.