搜索关键字脚本

#!/bin/bash
#
# Usage: this.sh [dir]
# Result: [dir]/result/filelist (files which contain keys)
# Default dir: /home
#
# Three passes are made over [dir]:
#   1. files whose NAME contains one of the keywords;
#   2. office documents that unzip successfully and whose extracted content
#      matches a keyword;
#   3. PDF files whose text (via pdftotext) matches a keyword.
# Every hit is appended, one path per line, to [dir]/result/filelist.
# Must be run as root. unzip and pdftotext are looked up in PATH; a pass whose
# tool is missing is skipped with a warning.

# Keywords are '|'-separated so the same string can be used directly as an
# extended-regex alternation when grepping file content.
keys="型谱|新品|秘密|机密|绝密|涉密|计算机模块研制|处理器适配研改|处理机系统研制|CPU IP核|内场实验|外场实验|嵌入64位CPU|国产化基础软硬件平台|虚拟化|装备承制|许可证|武器装备|承制资格|资格证书"
types="doc docx xls xlsx ppt pptx"
FindDIR=""

# check user
if [ "$UID" -ne 0 ]; then
    echo "Please run $0 with root user!" >&2
    exit 1
fi

# find dir
if [ $# -eq 0 ]; then
    FindDIR=/home
else
    FindDIR=$1
    if [ ! -d "$FindDIR" ]; then
        echo "$FindDIR is not a director!" >&2
        exit 1
    fi
fi

# find would parse a start directory beginning with '-' as an expression,
# so anchor such a path to the current directory.
case $FindDIR in
    -*) FindDIR="./${FindDIR}" ;;
esac

ResultDIR="${FindDIR}/result"
FileList="${ResultDIR}/filelist"

# Start every run from an empty result directory.
rm -rf -- "${ResultDIR:?}"
mkdir -p -- "$ResultDIR" || exit 1

# Find file name contain "keys"
# Split on '|' only, so a keyword that itself contains a space
# (e.g. "CPU IP核") stays one search term.
IFS='|' read -r -a key_list <<< "$keys"
for key in "${key_list[@]}"; do
    find "$FindDIR" -name "*${key}*" >> "$FileList"
done

# file content contain "keys"
#grep -E "($keys)" -l -r "$FindDIR" >> "$FileList"

# find document(office) file contain "keys"
# The candidate list is collected completely BEFORE anything is extracted, so
# find never walks into the extraction directory. NUL-delimited output keeps
# paths with spaces or other odd characters intact.
doc_files=()
for t in $types; do
    while IFS= read -r -d '' file; do
        doc_files+=("$file")
    done < <(find "$FindDIR" -name "*.${t}" -print0)
done

tmp_dir="${ResultDIR}/tmp"
if command -v unzip > /dev/null 2>&1; then
    for file in "${doc_files[@]}"; do
        # Files that are not zip archives make unzip fail; tmp_dir is then
        # absent, grep fails, and the file is simply not reported.
        # -o and stdin from /dev/null keep unzip from waiting on a prompt.
        unzip -o -q "$file" -d "$tmp_dir" > /dev/null 2>&1 < /dev/null
        if grep -E -r -q -- "($keys)" "$tmp_dir" 2> /dev/null; then
            printf '%s\n' "$file" >> "$FileList"
        fi
        rm -rf -- "$tmp_dir"
    done
else
    echo "unzip not found, office documents are not searched" >&2
fi

# for pdf file
pdf_files=()
while IFS= read -r -d '' file; do
    pdf_files+=("$file")
done < <(find "$FindDIR" -name "*.pdf" -print0)

tmp_txt="${ResultDIR}/tmp.txt"
if command -v pdftotext > /dev/null 2>&1; then
    for file in "${pdf_files[@]}"; do
        pdftotext "$file" "$tmp_txt" 2> /dev/null
        if grep -E -q -- "($keys)" "$tmp_txt" 2> /dev/null; then
            printf '%s\n' "$file" >> "$FileList"
        fi
        rm -f -- "$tmp_txt"
    done
else
    echo "pdftotext not found, pdf files are not searched" >&2
fi

# Result
echo "Result: ${ResultDIR}/filelist!"



搜索 Word 文档脚本

#!/bin/bash
#
# Usage: this.sh [dir]
# Result: [dir]/result/filelist (files which contain keys)
# Default dir: /home
#
# Converts every *.doc / *.docx file under [dir] to text with antiword and
# records the ones whose text matches a keyword (case-insensitively).
# All *.ppt / *.pptx files are appended to the same list without being
# searched. Must be run as root; antiword is installed through yum when it is
# not already available.

# Keywords are '|'-separated so the string can be used directly as an
# extended-regex alternation. There must be no trailing '|': an empty
# alternative would match every line.
keys="型谱|新品|秘密|机密|绝密|涉密|计算机模块研制|处理器适配研改|处理机系统研制|CPU IP核|内场实验|外场实验|嵌入64位CPU|国产化基础软硬件平台|虚拟化|装备承制|许可证|武器装备|承制资格|资格证书"
types="doc docx"
FindDIR=""

# check user
if [ "$UID" -ne 0 ]; then
    echo "Please run $0 with root user!" >&2
    exit 1
fi

# find dir
if [ $# -eq 0 ]; then
    FindDIR=/home
else
    FindDIR=$1
    if [ ! -d "$FindDIR" ]; then
        echo "$FindDIR is not a director!" >&2
        exit 1
    fi
fi

# find would parse a start directory beginning with '-' as an expression,
# so anchor such a path to the current directory.
case $FindDIR in
    -*) FindDIR="./${FindDIR}" ;;
esac

ResultDIR="${FindDIR}/result"
FileList="${ResultDIR}/filelist"

# Start every run from an empty result directory.
rm -rf -- "${ResultDIR:?}"
mkdir -p -- "$ResultDIR" || exit 1

# Install antiword only when it is missing, so repeated runs do not hit yum.
if ! command -v antiword > /dev/null 2>&1; then
    yum install -y antiword
fi

# file content contain "keys"
#grep -E "($keys)" -l -r "$FindDIR" >> "$FileList"

# find document(office) file contain "keys"
# NUL-delimited find output keeps paths with spaces or other odd characters
# intact; the list is gathered first and scanned afterwards.
doc_files=()
for t in $types; do
    while IFS= read -r -d '' file; do
        doc_files+=("$file")
    done < <(find "$FindDIR" -name "*.${t}" -print0)
done

if command -v antiword > /dev/null 2>&1; then
    for file in "${doc_files[@]}"; do
        # NOTE(review): antiword appears to target the legacy binary .doc
        # format; .docx files will probably produce no text here - confirm.
        # The pipeline status is grep's, so a file is listed only on a match.
        if antiword "$file" | grep -E -i -q -- "($keys)"; then
            printf '%s\n' "$file" >> "$FileList"
        fi
    done
else
    echo "antiword not found, word documents are not searched" >&2
fi

# for ppt file
# Presentations are not content-searched: every one found is listed.
find "$FindDIR" -name "*.ppt" >> "$FileList"
find "$FindDIR" -name "*.pptx" >> "$FileList"

# Result
echo "Result: ${ResultDIR}/filelist!"

猜你喜欢

转载自blog.csdn.net/Lq19880521/article/details/79100262