在开始抓取一个网站的信息时,有的网站会阻止此类爬虫抓取:使用 urllib 默认的 User-Agent 发起请求时,服务器会拒绝并返回 HTTP Error 403: Forbidden 错误。
这是问题代码:
from urllib import request


class Count():
    """Problem version: fetches the page without sending any request headers.

    Because no User-Agent is supplied, urllib's default one is used, and
    sites that block generic crawlers reject the request.
    """

    url = 'https://www.douyu.com/g_LOL'
    # headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    def __fetch_content(self):
        """Request the page URL directly, with no custom headers."""
        # req = request.Request(url=Count.url, headers=Count.headers)
        response = request.urlopen(Count.url)
        page_bytes = response.read()

    def go(self):
        """Entry point: trigger the fetch."""
        self.__fetch_content()


spider = Count()
spider.go()
处理过后的代码(参考网上的解决方案,适用于 Python 3):在请求中加入浏览器的 User-Agent 请求头,即可绕过该限制:
from urllib import request


class Count():
    """Fixed version: sends a browser-like User-Agent so the site serves the page.

    Some sites reject requests carrying urllib's default User-Agent with
    HTTP Error 403; wrapping the URL in a Request with explicit headers
    works around that.
    """

    url = 'https://www.douyu.com/g_LOL'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    def __fetch_content(self):
        """Download the page and return its raw bytes.

        Returns:
            bytes: the undecoded response body.
        """
        req = request.Request(url=Count.url, headers=Count.headers)
        # Use the response as a context manager so the underlying socket
        # is closed deterministically instead of being leaked.
        with request.urlopen(req) as response:
            htmls = response.read()
        # Return the content rather than silently discarding it, so callers
        # can actually use what was fetched. (Backward-compatible: previous
        # callers ignored the None return.)
        return htmls

    def go(self):
        """Entry point: fetch the page and return its raw bytes."""
        return self.__fetch_content()


spider = Count()
spider.go()