Daily incremental extraction of business database data into Hive with Sqoop (scheduled by Airflow)

MySQL -> Hive

Step 1: create the Hive table

Create the table in the data_works database, so that its location matches the Sqoop --target-dir used in the next step:

create table data_works.airplane(
ID int,
ID_scard string,
station_from string,
station_to string,
plane_no string,
time_takeoff string
)
row format delimited fields terminated by ',';
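
A quick sanity check, assuming the hive CLI and an HDFS client are on the PATH: confirm the table exists and that its Location line matches the --target-dir Sqoop will write to.

hive -e "describe formatted data_works.airplane;"            # columns plus the table's HDFS Location
hdfs dfs -ls /user/hive/warehouse/data_works.db/airplane     # empty until the first Sqoop run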


Step 2: wrap the import in a .sh script
First, define the Sqoop job once. The job metastore persists --last-value between runs, so each execution appends only the rows whose ID is greater than the last imported value:

sqoop job --create sqoop_job -- import \
--connect jdbc:mysql://node3:3306/policedb \
--table airplane \
--username root \
--password-file /sqoop/mysql.pwd \
--target-dir /user/hive/warehouse/data_works.db/airplane \
--num-mappers 1 \
--fields-terminated-by "," \
--incremental append \
--check-column ID \
--last-value 5000

The .sh script that gets scheduled then simply executes the saved job:

#!/bin/bash
echo "Importing the MySQL airplane table into Hive"
sqoop job --exec sqoop_job
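
Sqoop reads --password-file from HDFS, and the file must not end in a newline (echo -n avoids that). A minimal one-time setup sketch, where 'root_password' is a placeholder for the real MySQL password; the two job commands afterwards are useful for verifying the saved job and watching its last value advance:

echo -n 'root_password' | hdfs dfs -put - /sqoop/mysql.pwd   # 'root_password' is a placeholder
hdfs dfs -chmod 400 /sqoop/mysql.pwd                         # restrict read access to the owner

sqoop job --list                  # saved jobs; sqoop_job should appear
sqoop job --show sqoop_job        # prints the job definition, including incremental.last.value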

Step 3: schedule it with an Airflow Python DAG, running every 3 minutes
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 9, 2, 14, 40),
    'retries': 3,
    'retry_delay': timedelta(seconds=5)
}

dag = DAG('sqoophive',
    default_args=default_args,
    schedule_interval=timedelta(minutes=3),
    catchup=False)  # do not backfill every 3-minute interval since start_date

sqoophive = BashOperator(
    task_id='sqoophive',
    dag=dag,
    # the saved Sqoop job tracks --last-value itself, so no date argument is needed;
    # set -e fails the task (triggering retries) if sqoop exits non-zero
    bash_command='set -e; sqoop job --exec sqoop_job'
)
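
To deploy, drop the file into the DAGs folder and smoke-test the task once by hand. A sketch assuming an Airflow 1.x install with the default ~/airflow home; sqoophive.py is a hypothetical filename for the DAG file above:

cp sqoophive.py ~/airflow/dags/               # sqoophive.py is a placeholder name
airflow list_dags                             # 'sqoophive' should now be listed
airflow test sqoophive sqoophive 2019-09-02   # run the task once, outside the scheduler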

Reposted from blog.csdn.net/shujuelin/article/details/100284806