- 在 Python 3.x 中学到爬虫时需要注意：不同于 Python 2.x，读取到的 html 数据是字节串（bytes），需要先进行解码
html = html.decode('utf-8')
:
import re
import urllib
import urllib.error
import urllib.request
def download(url, user_agent='XD', num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the resource to download.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more times to retry when the server answers
            with a 5xx HTTP status.

    Returns:
        The response body as ``bytes`` (callers must decode it themselves),
        or ``None`` if the download failed.
    """
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        # Pass the Request object (not the bare URL) so the custom
        # User-agent header is actually sent; ``with`` closes the response.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        # Only HTTPError carries a status code; plain URLError does not.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            # recursively retry 5xx HTTP errors
            return download(url, user_agent, num_retries - 1)
        return None
def crawl_sitemap(url):
    """Download the sitemap at *url* and then download every link it lists.

    Links are taken from the text between ``<loc>`` and ``</loc>`` tags.
    If the sitemap itself cannot be downloaded, nothing is crawled.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # download() returns None on failure; there is nothing to decode.
        return
    # download() returns bytes in Python 3, so decode before regex matching
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
# Script entry point: crawl the example sitemap only when run directly.
if __name__ == '__main__':
    start_url = 'http://www.baidu.com/sitemap.xml'
    crawl_sitemap(start_url)
在def crawl_sitemap(url):
中加入sitemap = sitemap.decode('utf-8')
进行解码操作