On web crawling: downloading every file in all folders and subfolders under a download page

I wrote this only for this particular resource site. The nested for loops feel very inefficient; I'm just a beginner, so if anyone has better suggestions or ways to improve it, please point them out and I will gladly take them on board. Thanks! (A recursive alternative is sketched after the code below.)

  import os
  import re  # regular expressions (imported in the original, but not actually used below)
  from urllib import request
  from lxml import etree
  import requests

  requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
  s = requests.session()
  s.keep_alive = False  # try not to keep idle connections open
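  # Aside (not from the original post): a more explicit, per-session way to get
  # retries with requests is to mount an HTTPAdapter carrying an urllib3 Retry
  # policy, roughly:
  #   from requests.adapters import HTTPAdapter
  #   from urllib3.util.retry import Retry
  #   s.mount("http://", HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))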
  
  basepath = "http://www.glass.umd.edu/"
  path = "http://www.glass.umd.edu/Download.html"  # entry page listing the top-level folders

  def getResponse(url):
      # Request a page with urllib, retrying until it succeeds.
      head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}
      while True:
          try:
              url_request = request.Request(url, headers=head)
              url_response = request.urlopen(url_request, timeout=30)
              return url_response  # the HTTPResponse object
          except Exception as e:
              print("Request for %s failed (%s), retrying" % (url, e))
  
  def get_file(data):
      # Walk the folder tree behind the download page, at most three levels deep.
      html = etree.HTML(data)
      urllist = html.xpath("//div[@class='demo']/a/@href")  # top-level product folders
      for i in urllist:
          url = basepath + i
          data1 = getData(url)
          html1 = etree.HTML(data1)
          urllist1 = html1.xpath("//td/a/@href")
          del urllist1[0]  # drop the "Parent Directory" link
          for j in urllist1:
              savepathname = i + "/" + j
              print("savepathname is %s" % savepathname)
              if ".tif" in j or ".txt" in j:
                  # j is a file sitting directly under the top-level folder
                  downLoad(url, j, i)
              else:
                  # j is a subfolder: descend one level
                  if url[-1] == '/':
                      url1 = url + j
                  else:
                      url1 = url + "/" + j
                  data2 = getData(url1)
                  html2 = etree.HTML(data2)
                  urllist2 = html2.xpath("//td/a/@href")
                  del urllist2[0]  # drop the "Parent Directory" link
                  for z in urllist2:
                      savepathname1 = savepathname + z
                      print("savepathname1 is %s" % savepathname1)
                      if ".dat" in z or ".hdr" in z or ".hdf" in z or ".jpg" in z or ".xml" in z:
                          downLoad(url1, z, savepathname)
                      else:
                          # z is another subfolder: descend a second level
                          url2 = url1 + z
                          data3 = getData(url2)
                          html3 = etree.HTML(data3)
                          urllist3 = html3.xpath("//td/a/@href")
                          del urllist3[0]  # drop the "Parent Directory" link
                          for k in urllist3:
                              print("k is %s" % k)
                              if ".hdf" in k or ".jpg" in k or ".xml" in k:
                                  downLoad(url2, k, savepathname1)
                              else:
                                  print("No matching file here, keep looking further down.")
  
  def downLoad(dirUrl, name, savepathname):
      # Download one file (dirUrl + name) into E:/url_D/<savepathname>/, retrying on failure.
      if dirUrl.endswith('/'):
          fileUrl = dirUrl + name
      else:
          fileUrl = dirUrl + "/" + name
      savedir = "E:/url_D/" + savepathname
      if not os.path.exists(savedir):
          os.makedirs(savedir)
      print(savedir)
      maxTryNum = 20
      for tries in range(maxTryNum):
          try:
              print("saving %s" % fileUrl)
              res = s.get(fileUrl, timeout=30)
              with open(os.path.join(savedir, name), 'wb') as f:
                  f.write(res.content)
              print("saved successfully")
              break  # stop retrying once the file has been written
          except Exception as e:
              if tries < maxTryNum - 1:
                  continue
              print("Giving up on %s after %d tries: %s" % (fileUrl, maxTryNum, e))
  
        
  def getData(path):
      # Fetch a page and return its decoded HTML source.
      http_response = getResponse(path)  # HTTPResponse object returned by the request
      data = http_response.read().decode('utf-8')
      return data
  
  def main():
      data = getData(path)
      get_file(data)

  if __name__ == '__main__':
      main()
      print("Crawl finished")
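
On the nested-loops question from the top of the post: the three loops repeat the same "parse the index page, split links into files and folders" logic at every level, and they stop at a fixed depth. A recursive version handles any depth with one function. The sketch below reuses getData and downLoad from above and makes a few assumptions the original post does not confirm: every folder page lists its links under //td/a/@href, the first link is always the parent directory, and a link ending in "/" is a subfolder. Treat it as a starting point, not a drop-in replacement.

  # File extensions collected by the original script, merged into one tuple.
  FILE_EXTS = ('.tif', '.txt', '.dat', '.hdr', '.hdf', '.jpg', '.xml')

  def crawl(url, relpath=""):
      # Recursively walk one index page: download files, then descend into subfolders.
      html = etree.HTML(getData(url))
      links = html.xpath("//td/a/@href")[1:]  # assumption: first link is "Parent Directory"
      for href in links:
          if href.lower().endswith(FILE_EXTS):
              downLoad(url, href, relpath)  # reuse the helpers defined above
          elif href.endswith('/'):  # assumption: a trailing slash marks a subfolder
              crawl(url.rstrip('/') + '/' + href, relpath + href)

It would be started from each top-level folder on the download page, for example:

  for i in etree.HTML(getData(path)).xpath("//div[@class='demo']/a/@href"):
      crawl(basepath + i, i + "/")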

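A second small improvement, also just a sketch rather than part of the original code: res.content buffers the whole file in memory before writing it out, which hurts for large .hdf files. requests can stream the body to disk in chunks instead (downLoadStream is a hypothetical helper name):

  def downLoadStream(fileUrl, localpath):
      # Stream a large file to disk in 1 MB chunks instead of buffering it all in memory.
      with s.get(fileUrl, stream=True, timeout=30) as res:
          res.raise_for_status()
          with open(localpath, 'wb') as f:
              for chunk in res.iter_content(chunk_size=1024 * 1024):
                  f.write(chunk)
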

Reposted from www.cnblogs.com/xfhaixx/p/13187256.html