Cleaning up HDFS data regularly

Some common ways to clean up HDFS data:

Rebalance data across the DataNodes (a balancer sketch follows this list)
Clean up temporary directories and log directory files
Drop historical partitions of fully-reloaded partitioned tables (a second sketch follows this list)
Store data in compressed formats such as LZO or ORC
Clean up or archive cold historical data
Scale out by adding more DataNodes
Attached below is a script that automatically cleans up directories of expired files
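
For the rebalancing item in the list, a minimal sketch: the balancer ships with Hadoop and moves blocks between DataNodes until each node's disk usage is within a threshold of the cluster average. The threshold value of 10 percent below is only an example, not a value from the original post.

# Rebalance blocks across DataNodes; stop when each node's usage is
# within 10 percentage points of the cluster average (example value).
hdfs balancer -threshold 10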
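For dropping historical partitions and archiving cold data, a hedged sketch; the table name dw.orders_full, the partition value, and the paths are hypothetical examples, and the archive step assumes the Hadoop Archive (HAR) tool that ships with Hadoop:

# Drop a historical partition of a fully-reloaded Hive table
# (table name and partition value are hypothetical).
hive -e "ALTER TABLE dw.orders_full DROP IF EXISTS PARTITION (dt='2019-01-01');"

# Pack a cold directory into a Hadoop archive to cut the file count,
# then remove the original directory (paths are hypothetical).
hadoop archive -archiveName logs_2019.har -p /data/cold 2019 /data/archive
hadoop fs -rm -r -skipTrash /data/cold/2019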

#!/bin/bash
source ~/.bash_profile



# Directories to check (there can be more than one), loaded into an array
yarn_log_dir=/app-logs/spark/logs
spark_log_dir=/spark-history
spark2_log_dir=/spark2-history
mr_log_dir=/mr-history/done/$(date +"%Y/%m" -d "-1 days")

array_check=($yarn_log_dir $mr_log_dir)

# Number of days after which files expire
expire_days=14

# Current timestamp (seconds since the epoch)
today_timestamp=$(date -d "$(date +"%Y-%m-%d %H:%M")" +%s)

# Func: delete files whose modification time is older than the cutoff
removeOutDate(){
    # hadoop fs -ls output fields: permissions replicas user group size date time path
    hadoop fs -ls $1 > temp_list.txt
    cat temp_list.txt | while read quanxian temp user group size day hour filepath
    do
        current_file_time="$day $hour"
        current_file_timestamp=$(date -d "$current_file_time" +%s)
        # Remove entries older than expire_days
        if [ $(($today_timestamp - $current_file_timestamp)) -ge $(($expire_days*24*60*60)) ];then
            echo "$day $hour $size $filepath"
            hadoop fs -rm -r -skipTrash $filepath > /dev/null 2>&1
        fi
    done
}

# Func: run the cleanup
execute(){
    echo -e "\n\n"
    echo "$(date +'%Y-%m-%d %H:%M:%S') start to remove outdated files in hdfs"
    echo "$(date +'%Y-%m-%d %H:%M:%S') today is: $(date +"%Y-%m-%d %H:%M:%S")"

    for i in ${array_check[@]}
    do
        echo "$(date +'%Y-%m-%d %H:%M:%S') processing filepath: $i"
        removeOutDate $i
        echo -e "\n"
    done

    echo "$(date +'%Y-%m-%d %H:%M:%S') remove outdated files in hdfs finished"
    echo -e "\n\n"

    # Remove the temporary listing file
    rm -f temp_list.txt
}

# begin
execute
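
To run the script on a schedule, one option is a cron entry; the script path and log path below are hypothetical examples, not part of the original post:

# Run the HDFS cleanup every day at 02:00 (paths are examples).
0 2 * * * /opt/scripts/clean_hdfs_expired.sh >> /var/log/clean_hdfs_expired.log 2>&1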
