hive常用sql

1、创建一张外部分区表(分区需要另外添加)

-- External, partitioned, pipe-delimited text table `ext_base_click`.
-- No LOCATION is given and no partitions exist after creation: each
-- (`date`, `hour`) partition has to be attached separately with
-- ALTER TABLE ... ADD PARTITION ... LOCATION ...
-- `timestamp` and `date` are HiveQL keywords, so they stay back-quoted.
-- ROW FORMAT DELIMITED + STORED AS TEXTFILE is the hand-written equivalent of
-- LazySimpleSerDe with field.delim='|', line.delim='\n' over
-- TextInputFormat / HiveIgnoreKeyTextOutputFormat.
CREATE EXTERNAL TABLE `ext_base_click` (
  `reqid`     string,
  `adslotid`  string,
  `ip`        string,
  `timestamp` string,
  `adgroupid` string,
  `lbs`       string
)
PARTITIONED BY (
  `date` string,
  `hour` int
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

2、创建一张外部表(非分区表,需通过 LOCATION 指定数据存储位置)

-- External, unpartitioned, pipe-delimited text table `b_base_adx`.
-- The data is read directly from the HDFS directory named in LOCATION, so the
-- table is queryable as soon as it is created (no ADD PARTITION step).
-- ROW FORMAT DELIMITED + STORED AS TEXTFILE is the hand-written equivalent of
-- LazySimpleSerDe with field.delim='|', line.delim='\n' over
-- TextInputFormat / HiveIgnoreKeyTextOutputFormat.
CREATE EXTERNAL TABLE `b_base_adx` (
  `id`          string,
  `enname`      string,
  `zhname`      string,
  `description` string,
  `supportdeal` string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 'hdfs://tercel/user/mapred2/hive_basetable/b_base_adx';

3、表字段的修改(注意锁表问题、ORC文件表字段的修改、向量化查询)

-- Add a new column at the end of the table's column list.
-- NOTE(review): on a partitioned table, existing partitions keep their old
-- column metadata unless CASCADE is appended (Hive 1.1+) - confirm which
-- behavior is wanted before running this against a partitioned table.
ALTER TABLE dsp.int_optimize_bidrequest ADD COLUMNS (app_id string);
-- Rename a column (the type must be restated) and place it after column_name.
ALTER TABLE ext_preprocess_click CHANGE col_old_name col_new_name string AFTER column_name;
-- Replace the whole column list: columns not listed here are dropped from the
-- table definition.
ALTER TABLE ext_base_conversion REPLACE COLUMNS (reqid string, adslotid string, adx string, deviceid string);
-- Inspect the locks held on a table, then release them.
SHOW LOCKS your_table;
UNLOCK TABLE your_table;

4、hive常用语句

-- Load an HDFS file into a table. Without the LOCAL keyword the path is an
-- HDFS path and the source file is moved into the table's directory.
LOAD DATA INPATH '/user/hebin/data/media/deviceid_10w/deviceid_10w.txt'
INTO TABLE zhtsh_tmp.deviceid_10w;
-- Check whether a string consists only of digits. The predicate is on a
-- constant, so this returns 1 for every row of base_media_baidu when it holds.
SELECT 1 FROM base_media_baidu WHERE '123456' RLIKE '^\\d+$';
-- Attach a partition to an external table, pointing it at existing HDFS data.
-- `date` is back-quoted because it is a HiveQL keyword.
-- NOTE(review): $day is not expanded by Hive itself; presumably the calling
-- shell script substitutes it - confirm, and keep it in sync with the
-- partition's `date` value.
ALTER TABLE media_stats ADD PARTITION (`date` = '2018-02-02', hour = 5)
LOCATION 'hdfs://tercel/user/userplatform/DAILY/$day/media-stats';
-- Drop a partition; IF EXISTS makes the statement a no-op when it is absent.
ALTER TABLE dsp.ext_preprocess_bidrequest DROP IF EXISTS PARTITION (`date` = '2018-01-05', hour = 5);
-- Register a UDF: put the jar on the session classpath, then bind a
-- session-scoped function name to the implementing class.
ADD JAR hdfs://tercel/user/mapred2/myudf/hive/hiveudf.jar;
CREATE TEMPORARY FUNCTION md5 AS 'com.Md5';

5、分割字符串,并将拆分结果展开为多行(lateral view explode,一行转多行)

-- Split adslotallowedsize on '_' and emit one output row per piece.
-- split() returns an array; LATERAL VIEW explode() joins each array element
-- back to its source row under the column alias adslotallowedsize_.
-- `date` is back-quoted because it is a HiveQL keyword.
SELECT adslotallowedsize_
FROM dsp_media.b_report_flow_daily
LATERAL VIEW explode(split(adslotallowedsize, '_')) sizes AS adslotallowedsize_
WHERE `date` = '2018-03-07';

6、select除了某些字段之外的剩余所有字段

-- Select every column except name, id and pwd.
-- With quoted-identifier support switched off, a back-quoted name in the
-- SELECT list is interpreted as a regular expression over column names; the
-- pattern below matches every column name other than the three listed.
SET hive.support.quoted.identifiers=None;
SELECT
  `(name|id|pwd)?+.+`
FROM tableName;

7、不同的adx中的曝光top3的机型

-- Top 3 brandmodelid values by impression row count within each adx, for the
-- single partition date = '2017-07-03', hour = 4.
WITH model_impressions AS (
  -- One row per (adx, brandmodelid) with its impression row count.
  SELECT
    adx,
    brandmodelid,
    COUNT(1) AS c
  FROM dsp.ext_base_impression
  WHERE `date` = '2017-07-03'  -- back-quoted: `date` is a HiveQL keyword
    AND hour = 4
  GROUP BY adx, brandmodelid
),
ranked AS (
  -- Rank models inside each adx by descending count. adx is constant within
  -- its own partition, so it is not needed as a sort key; brandmodelid is a
  -- tie-breaker that makes the ranking deterministic when counts are equal.
  SELECT
    adx,
    brandmodelid,
    c,
    ROW_NUMBER() OVER (PARTITION BY adx ORDER BY c DESC, brandmodelid) AS ord
  FROM model_impressions
)
SELECT
  adx,
  brandmodelid,
  c,
  ord
FROM ranked
WHERE ord <= 3;

8、设备信息的组内占比

-- Per (source, sdk_version) row count for day = 20180426, together with the
-- total for the row's source and the row's share of that total.
-- sdk_version is reduced to the part before the first '|'.
SELECT
  a.source AS source,
  a.sdk_version AS sdk_version,
  a.device_info_count AS device_info_count,
  SUM(a.device_info_count) OVER (PARTITION BY a.source) AS group_device_info_count,
  -- Share of this sdk_version within its source ('/' yields a double in Hive).
  a.device_info_count / SUM(a.device_info_count) OVER (PARTITION BY a.source) AS group_device_info_ratio
FROM (
  SELECT
    source,
    split(sdk_version, '\\|')[0] AS sdk_version,
    COUNT(1) AS device_info_count
  FROM stg.stg_dev_device_info_d
  WHERE day = 20180426
  GROUP BY source, split(sdk_version, '\\|')[0]
) AS a
-- Sorting is done on the final result: an ORDER BY inside the derived table
-- does not guarantee the order of the outer query's rows.
ORDER BY CAST(source AS int), sdk_version;

猜你喜欢

转载自blog.csdn.net/u010010664/article/details/79375481
今日推荐