首先配置文件:
[wusx@centos245 gather]$ cat config
start=359822
end=550000
threads=15
output=/home/wusx/gather/output
log=/home/wusx/gather/_wget.log
然后运行脚本:
[wusx@centos245 gather]$ cat wget.sh
#!/bin/bash
#
# Run the download tasks: fetch sequentially numbered pages with a bounded
# number of concurrent wget processes.
#
# Settings come from the config file, one key=value per line:
#   start   - first document number to fetch; rewritten after every launch
#             so an interrupted run can be resumed from where it stopped
#   end     - upper bound of the range (exclusive)
#   threads - maximum number of wget processes running at the same time
#   output  - directory that receives the <number>.info files
#   log     - file to which every requested URL is appended

set -u

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

filename=/home/wusx/gather/config
[[ -r "$filename" ]] || die "cannot read config file: $filename"

# Load only the expected keys. Values are stored as plain strings and never
# evaluated, so a stray or malicious config line cannot execute commands.
start='' end='' threads='' output='' log=''
while IFS='=' read -r key value || [[ -n "$key" ]]; do
  case "$key" in
    start | end | threads | output | log) printf -v "$key" '%s' "$value" ;;
  esac
done < "$filename"

for name in start end threads; do
  [[ "${!name}" =~ ^[0-9]+$ ]] \
    || die "config: '$name' must be a non-negative integer"
done
[[ -n "$output" ]] || die "config: 'output' is not set"
[[ -n "$log" ]] || die "config: 'log' is not set"

# 10# forces base 10 so a value with leading zeros is not read as octal.
i=$((10#$start))
max=$((10#$end))
max_jobs=$((10#$threads))
# With zero allowed jobs the throttle loop below would never finish.
((max_jobs >= 1)) || die "config: 'threads' must be at least 1"

mkdir -p -- "$output" || die "cannot create output directory: $output"

# Testing the bound before each launch means that re-running a finished
# range (start >= end) downloads nothing.
while ((i < max)); do
  url="http://www.abc.com/$i/cc.html"
  out="$output/$i.info"
  wget -q --user-agent='Baiduspider' -O "$out" "$url" > /dev/null &
  i=$((i + 1))

  # Write the next document number back to the config file. Only the digits
  # after "start=" are replaced, wherever that line sits in the file.
  sed -i "s/^start=[0-9]*/start=$i/" "$filename" \
    || die "cannot update 'start' in $filename"
  printf '%s\n' "$url" >> "$log"

  # Throttle: count this shell's own running background jobs instead of
  # grepping the system-wide process list.
  while (($(jobs -rp | wc -l) >= max_jobs)); do
    sleep 2
  done
done

# Let the last downloads finish before the script returns.
wait