# Python 3.0 以上版本爬取豆瓣电影

import re
import urllib.request

"""
1.获取排名前100的电影
2.每页电影的数量是25个
3.获得的html页面中,跟电影名相关的是这行:
    <span class="title">肖申克的救赎</span>
  如下这行跟上一行的结构类似,所以正则匹配的时候需要将此行去掉:
    <span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
"""
class SpiderDouBan(object):
    """Scrape movie titles from the Douban "Top 250" listing.

    Pages are requested one at a time from ``base_url``; the ``start`` query
    parameter advances by ``PAGE_SIZE`` per page, so pages 0 through
    ``LAST_PAGE`` cover the first 100 movies.  Parsed titles accumulate in
    ``result`` as strings of the form ``"top<rank> <title>"``.
    """

    # Number of movies listed on one page of the site.
    PAGE_SIZE = 25
    # Index of the last page to fetch (pages 0..3 -> 4 pages -> 100 movies).
    LAST_PAGE = 3

    def __init__(self):
        """Set up the paging state and the empty result list."""
        self.page = 0
        self.base_url = "http://movie.douban.com/top250?start={page}&filter=&type="
        self.result = []
        # Rank assigned to the next title that gets recorded (1-based).
        self._index = 1

    def get_page_html(self):
        """Download the page selected by ``self.page`` and return its text.

        Returns:
            The response body decoded as UTF-8, or an empty string when the
            request fails with ``urllib.error.URLError`` (the failure is
            reported on stdout).
        """
        url = self.base_url.format(page=self.page * self.PAGE_SIZE)
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(url) as response:
                return response.read().decode("utf-8")
        except urllib.error.URLError as e:
            # HTTPError (a URLError subclass) carries ``code``; a plain
            # URLError only carries ``reason``.
            if hasattr(e, "code"):
                print ("The server couldn't fulfill the request.")
                print ("Error code: %s" % e.code)
            elif hasattr(e, "reason"):
                print ("We failed to reach a server. Please check your url and read the Reason")
                print ("Reason: %s" % e.reason)

        # Reached only after a failed request: hand back an empty page rather
        # than referencing a variable that was never assigned.
        return ""

    def get_index_and_name(self, my_page):
        """Extract the movie titles from *my_page* and append them to ``result``.

        Args:
            my_page: HTML text of one listing page.

        Every ``<span class="title">`` element is considered.  Spans whose
        text contains ``&nbsp`` hold the alternative (foreign-language) title
        of the preceding movie and are skipped so each movie is counted once.
        """
        items = re.findall(r'<span.*?class="title">(.*?)</span>', my_page, re.S)
        for item in items:
            if "&nbsp" in item:
                continue
            self.result.append("top" + str(self._index) + " " + item)
            self._index += 1

    def start_spider(self):
        """Fetch and parse pages until ``LAST_PAGE`` has been processed.

        Crawling stops at the first page that cannot be downloaded: carrying
        on would label the movies of later pages with the wrong rank.
        """
        while self.page <= self.LAST_PAGE:
            my_page = self.get_page_html()
            if not my_page:
                break
            self.get_index_and_name(my_page)
            self.page += 1

def main():
    """Run the Douban spider and print every collected title, one per line."""
    crawler = SpiderDouBan()
    crawler.start_spider()
    for entry in crawler.result:
        print(entry)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

#这是https://blog.csdn.net/bitcarmanlee/article/details/67661958的博主大佬写的,但他的是python2.7的版本,有些小地方要改改,借光啦

# 猜你喜欢

# 转载自 blog.csdn.net/sinat_38832964/article/details/81111929