到豆瓣爬取电影信息

初学python爬虫,于是自己爬了豆瓣的电影信息,直接上源码

import re
import requests
from bs4 import BeautifulSoup
import urllib
import os

class movie:
    """Scrape and print the title, rating, info blocks and short comments
    of one fixed Douban movie page."""

    def __init__(self):
        # Target page: a single hard-coded Douban movie subject.
        self.url = "https://movie.douban.com/subject/25933890/?tag=%E7%83%AD%E9%97%A8&from=gaia_video"
        # Pretend to be a desktop browser; Douban rejects the default
        # python-requests User-Agent.
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }

    def getpag(self):
        """Download the movie page and return its HTML as a str."""
        # BUG FIX: the second positional argument of requests.get() is
        # `params`, not the request headers — the original sent the UA dict
        # as query-string parameters. Pass it via the `headers` keyword.
        req = requests.get(self.url, headers=self.head)
        return req.content.decode('utf-8')

    def gettit(self, page):
        """Print the movie title and Douban rating found in *page*.

        Returns (title, rating) as strings; empty strings when a field
        is not found, instead of raising.
        """
        titles = re.findall(r'<span property="v:itemreviewed">(.+?)</span>', page)
        ratings = re.findall(r'<strong class="ll rating_num" property="v:average">(.+?)</strong>', page)
        # BUG FIX: the original printed str(list) (e.g. "['title']") — print
        # the first match itself, and tolerate pages with no match.
        title = titles[0] if titles else ''
        rating = ratings[0] if ratings else ''
        print(title, '\n')
        print("豆瓣评分:", rating, '\n')
        return title, rating

    def getinfo(self, page):
        """Print the text of every <div class="info"> block (cast/crew etc.)."""
        soup = BeautifulSoup(page, "lxml")
        for info in soup.find_all('div', 'info'):
            print(info.get_text())

    def getping(self, page):
        """Print commenter info and comment text from <div class="comment"> blocks."""
        soup = BeautifulSoup(page, "lxml")
        for comment in soup.find_all('div', 'comment'):
            # BUG FIX: the original chained `pin.fin` and
            # `.find_all('a').d_all(...)`, both of which raise
            # AttributeError. Walk the comment-info spans and the comment
            # paragraphs directly instead.
            for info in comment.find_all('span', class_='comment-info'):
                print(info.get_text())
            for para in comment.find_all('p'):
                print(para.get_text())

    def start(self):
        """Fetch the page once and run every extraction step on it."""
        page = self.getpag()
        self.gettit(page)
        self.getinfo(page)
        self.getping(page)
# Run only when executed as a script, not on import — the original fired a
# network request as a module-level side effect.
if __name__ == "__main__":
    movie().start()

爬取成功

我利用的是BeautifulSoup这个库,这个库可以将html代码按标签进行分类整理,还可以读取标签属性,详情可以自己搜索,对于爬虫来说非常强大

我的代码理念是利用BeautifulSoup,利用for循环一层一层地往下搜索,找到自己想要的数据

猜你喜欢

转载自www.cnblogs.com/hatkids/p/8973611.html