import re import urllib.request """ 1.获取排名前100的电影 2.每页电影的数量是25个 3.获得的html页面中,跟电影名相关的有是这行: <span class="title">肖申克的救赎</span> 如下这行跟上一行的结构类似,所以正则匹配的时候需要将此行去掉: <span class="title"> / The Shawshank Redemption</span> """ class SpiderDouBan(object):#在豆瓣上爬取数据 def __init__(self):#初始化 self.page = 0 self.base_url = "http://movie.douban.com/top250?start={page}&filter=&type=" self.result = [] self._index = 1 def get_page_html(self):#获取页面 url = self.base_url try: my_page = urllib.request.urlopen(url.format(page = self.page * 25)).read().decode("utf-8") except urllib.error.URLError as e: if hasattr(e, "code"): print ("The server couldn't fulfill the request.") print ("Error code: %s" % e.code) elif hasattr(e, "reason"): print ("We failed to reach a server. Please check your url and read the Reason") print ("Reason: %s" % e.reason) return my_page#返回页面 def get_index_and_name(self,my_page): tmp_result = [] items = re.findall(r'<span.*?class="title">(.*?)</span>', my_page, re.S) for item in items: if item.find(" ") == -1 : tmp_result.append("top" + str(self._index) + " " + item) self._index += 1 self.result.extend(tmp_result) def start_spider(self): while self.page <= 3: my_page = self.get_page_html() self.get_index_and_name(my_page) self.page += 1 def main(): spider = SpiderDouBan() spider.start_spider() for item in spider.result: print (item) if __name__ == '__main__': main()
#这是https://blog.csdn.net/bitcarmanlee/article/details/67661958的博主大佬写的,但他的是python2.7的版本,有些小地方要改改,借光啦