wget下载页面脚本

首先是配置文件:

[wusx@centos245 gather]$ cat config 
start=359822
end=550000
threads=15
output=/home/wusx/gather/output
log=/home/wusx/gather/_wget.log

 然后是下载脚本(wget.sh)的内容:

[wusx@centos245 gather]$ cat wget.sh 
#!/bin/bash
# Download a numbered range of pages with wget, several at a time.
#
# Settings come from the config file below (one key=value per line):
#   start   - first page number to fetch; rewritten after every launch so an
#             interrupted run can be resumed from where it stopped
#   end     - page number at which to stop (this number itself is not fetched)
#   threads - maximum number of wget processes running at once
#   output  - directory that receives the pages, saved as <output>/<n>.info
#   log     - file to which every requested URL is appended

set -u

filename=/home/wusx/gather/config

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ -r "$filename" ]] || die "cannot read config file: $filename"

# Parse the config as data instead of eval-ing it, so nothing in the file can
# be executed as shell code. Only the five known keys are accepted.
start='' end='' threads='' output='' log=''
while IFS='=' read -r key value || [[ -n "$key" ]]; do
  # Drop trailing whitespace (including a stray CR from DOS line endings).
  value=${value%"${value##*[![:space:]]}"}
  case "$key" in
    start | end | threads | output | log) printf -v "$key" '%s' "$value" ;;
  esac
done < "$filename"

for name in start end threads; do
  [[ "${!name}" =~ ^[0-9]+$ ]] \
    || die "config: $name must be a non-negative integer (got '${!name}')"
done
[[ -n "$output" ]] || die "config: output is not set"
[[ -n "$log" ]] || die "config: log is not set"

# Force base 10 so a value with leading zeros is not read as octal.
i=$((10#$start))
max=$((10#$end))
Threads=$((10#$threads))

# A limit of 0 would make the throttle loop below wait forever.
((Threads >= 1)) || die "config: threads must be at least 1"

mkdir -p -- "$output" || die "cannot create output directory: $output"

# Checking the bound before launching means a re-run after completion
# (start >= end) downloads nothing instead of running past the end.
while ((i < max)); do
  url="http://www.abc.com/$i/cc.html"
  out="$output/$i.info"
  wget -q --user-agent='Baiduspider' -O "$out" "$url" > /dev/null &
  i=$((i + 1))

  # Write the next page number back to the config file so the run can resume.
  sed -i "s/^start=.*/start=$i/" "$filename" \
    || printf 'warning: could not update start in %s\n' "$filename" >&2
  printf '%s\n' "$url" >> "$log"

  # Throttle: count this shell's own running background jobs. Grepping the
  # ps output for the PID also matches unrelated processes whose line merely
  # contains the same digits.
  Running=$(jobs -pr | wc -l)
  while ((Running >= Threads)); do
    sleep 2
    Running=$(jobs -pr | wc -l)
  done
done

# Let the last downloads finish before the script exits.
wait
 

猜你喜欢

转载自wushexin.iteye.com/blog/1020293