1.4.2 Python sitemap crawler (updated daily)

# -*- coding: utf-8 -*-
'''
Created on 2019-05-06

@author: 薛卫卫
'''

import urllib.request
import re

def download(url, user_agent="wswp", num_retries=2):
    """Download a URL, retrying up to num_retries times on 5xx server errors."""
    print("Downloading:", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.request.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # retry only for 5xx server errors; client errors (4xx) are not retried
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # keep the regular expression as-is; instead decode the bytes returned by urlopen().read()
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
        
crawl_sitemap("http://example.webscraping.com/sitemap.xml")

  


Reposted from www.cnblogs.com/xww115/p/10828446.html