# NOTE: This script targets only this one resource site (glass.umd.edu). The nested
# for-loops feel inefficient; I am a beginner — if anyone has better suggestions or
# an improved approach, please share them and I will gladly adopt them. Thanks!
from urllib import request
import re # NOTE(review): imported but apparently never used in this file
from lxml import etree
import requests
requests.adapters.DEFAULT_RETRIES = 5 # raise the connection retry count
s = requests.session()
s.keep_alive = False # close extra connections instead of keeping them alive
import os
# Root of the site being crawled, and its entry (download index) page.
basepath = "http://www.glass.umd.edu/"
path="http://www.glass.umd.edu/Download.html"
def getResponse(url):
    """Open *url* and return the ``HTTPResponse``, retrying until a request succeeds.

    A desktop-browser User-Agent is sent because some servers reject urllib's
    default one.  Fixes over the original: the bare ``except`` (which also
    swallowed KeyboardInterrupt, making Ctrl-C impossible) is narrowed to
    ``Exception``, the actual error is printed, and the request object is
    built once instead of on every retry.
    """
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    url_request = request.Request(url, headers=head)
    while True:
        try:
            # 30s timeout so a stalled connection cannot hang the crawl forever.
            return request.urlopen(url_request, timeout=30)
        except Exception as e:  # Exception (not bare) lets Ctrl-C / SystemExit through
            print("url_response出错了: %s" % e)
def get_file(data):
    """Walk the GLASS download listings up to three levels deep and download files.

    *data* is the HTML of the top-level Download page.  Level-1 directory links
    come from ``div.demo``; deeper levels are directory-index pages whose first
    ``td/a`` link is the parent-directory entry and is skipped.  Which
    extensions get downloaded depends on the depth, mirroring the site layout.

    Fixes over the original: ``url2`` is now joined with the same
    trailing-slash check the other joins use; ``[1:]`` replaces
    ``del lst[0]`` so an empty listing no longer raises IndexError; path
    joins are slash-safe (no-ops when hrefs already end with "/", as on
    typical index pages — presumably the common case here).
    """
    def _join(base, part):
        # Join URL/path pieces without dropping the separator.
        return base + part if base.endswith("/") else base + "/" + part

    html = etree.HTML(data)
    urllist = html.xpath("//div[@class='demo']/a/@href")
    for i in urllist:
        url = basepath + i
        # [1:] skips the parent-directory link and tolerates empty listings.
        for j in etree.HTML(getData(url)).xpath("//td/a/@href")[1:]:
            savepathname = i + "/" + j
            print("savepathname is %s" % savepathname)
            if ".tif" in j or ".txt" in j:
                # downLoad joins the file name onto the directory URL itself.
                downLoad(url, j, savepathname)
            else:
                url1 = _join(url, j)
                for z in etree.HTML(getData(url1)).xpath("//td/a/@href")[1:]:
                    savepathname1 = _join(savepathname, z)
                    print("savepathname1 is %s" % savepathname1)
                    if ".dat" in z or ".hdr" in z or ".hdf" in z or ".jpg" in z or ".xml" in z:
                        downLoad(url1, z, savepathname1)
                    else:
                        # BUG FIX: the original did `url1 + z` with no slash check here.
                        url2 = _join(url1, z)
                        for k in etree.HTML(getData(url2)).xpath("//td/a/@href")[1:]:
                            print("k is %s" % k)
                            if ".hdf" in k or ".jpg" in k or ".xml" in k:
                                downLoad(url2, k, savepathname1)
                            else:
                                print("找不到了,继续往下寻找。")
def downLoad(jpgUrl, name, savepathname):
    """Download one file and save it under ``E://url_D/<savepathname>/<name>``.

    ``jpgUrl`` is the URL of the *directory* containing the file — every call
    site in this script passes the listing URL plus the bare file name — so the
    real file URL is the slash-join of the two.  Retries up to ``maxTryNum``
    times; only the final failure is reported.

    BUG FIXES vs. the original:
    1. ``s.get(jpgUrl)`` fetched the directory listing itself and saved that
       HTML as the file; the file name is now joined onto the URL.
    2. A successful save did not leave the retry loop, so every file was
       downloaded ``maxTryNum`` (20!) times; we now return on success.
    3. The save path is joined with a separator so the file lands inside the
       directory that ``makedirs`` just created.
    """
    maxTryNum = 20
    savedir = "E://url_D/" + savepathname
    if not os.path.exists(savedir):
        os.makedirs(savedir)
        print(savedir)
    fileurl = jpgUrl + name if jpgUrl.endswith("/") else jpgUrl + "/" + name
    savefile = savedir + name if savedir.endswith("/") else savedir + "/" + name
    for tries in range(maxTryNum):
        try:
            print("开始保存")
            res = s.get(fileurl, timeout=30)
            with open(savefile, 'wb+') as f:
                f.write(res.content)
            print("保存成功")
            return  # success: do NOT keep re-downloading
        except Exception as e:
            if tries < (maxTryNum - 1):
                continue  # retry silently until the last attempt
            print("出现异常%s" % e)
def getData(path):
    """Fetch *path* over HTTP and return the response body decoded as UTF-8 text."""
    response = getResponse(path)  # HTTPResponse object (retries until it succeeds)
    body = response.read()
    return body.decode('utf-8')
def main():
    """Entry point: fetch the top-level Download page and crawl it for files."""
    get_file(getData(path))
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
    print("爬取完毕")  # "crawl finished"