Crawling movie information from Douban

I have just started learning Python web crawling, so I practiced on Douban and scraped some movie information. Going straight to the source code:

import re
import requests
from bs4 import BeautifulSoup

class movie:

    def __init__(self):
        # Target page: a single Douban movie subject page
        self.url = "https://movie.douban.com/subject/25933890/?tag=%E7%83%AD%E9%97%A8&from=gaia_video"
        # Send a browser User-Agent so Douban does not reject the request
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }


    def getpag(self):
        # Fetch the page; the headers dict must be passed as a keyword argument,
        # otherwise requests would treat it as query parameters
        req = requests.get(self.url, headers=self.head)
        html = req.content.decode('utf-8')
        return html

    def gettit(self, page):
        # Regular expressions for the movie title and the average rating
        title = r'<span property="v:itemreviewed">(.+?)</span>'
        power = r'<strong class="ll rating_num" property="v:average">(.+?)</strong>'
        tit = re.findall(title, page)
        powe = re.findall(power, page)
        tit = str(tit)
        print(tit, '\n')
        print("Douban score:", powe, '\n')
    def getinfo(self, page):
        # The 'info' div holds director, writer, cast, genre and so on
        soup = BeautifulSoup(page, "lxml")
        infor = soup.find_all('div', 'info')
        for info in infor:
            print(info.get_text())
    def getping(self, page):
        # Each short comment sits in a 'comment' div: the commenter's name is
        # a link inside the 'comment-info' span, the comment text is in a <p> tag
        soup = BeautifulSoup(page, "lxml")
        ping = soup.find_all('div', 'comment')
        for pin in ping:
            pname = pin.find_all('span', class_='comment-info')
            for pnam in pname:
                pn = pnam.find_all('a')
                for p in pn:
                    print(p.get_text())
            arg = pin.find_all('p')
            for ar in arg:
                print(ar.get_text())

    def start(self):
        page = self.getpag()
        self.gettit(page)
        self.getinfo(page)
        self.getping(page)

movie().start()

The crawl ran successfully.

I am using the BeautifulSoup library. It can organize HTML code by tag and also read tag attributes; you can look up the details yourself. It is very powerful for crawlers.
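
As a quick illustration of what I mean (the snippet below is made up for the example, not taken from Douban), BeautifulSoup can locate a tag by name and class, read its text, and read its attributes:

from bs4 import BeautifulSoup

# A made-up HTML snippet just to show the API
sample = '<div class="info"><a href="https://example.com" class="name">Tom</a></div>'
soup = BeautifulSoup(sample, "lxml")

link = soup.find('a', class_='name')   # find the first matching tag
print(link.get_text())                 # tag text: Tom
print(link['href'])                    # tag attribute: https://example.com
print(soup.find('div')['class'])       # class is multi-valued, returned as a list: ['info']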

The idea behind my code is to use BeautifulSoup with nested for loops to search down layer by layer until you reach the data you want, as sketched below.
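
Here is a small sketch of that layer-by-layer idea, using a made-up HTML snippet with the same class names as the code above:

from bs4 import BeautifulSoup

# Made-up markup: outer comment blocks containing the name link and the comment text
html = '''
<div class="comment"><span class="comment-info"><a>user1</a></span><p>great movie</p></div>
<div class="comment"><span class="comment-info"><a>user2</a></span><p>not bad</p></div>
'''
soup = BeautifulSoup(html, "lxml")

for comment in soup.find_all('div', class_='comment'):             # layer 1: each comment block
    for info in comment.find_all('span', class_='comment-info'):   # layer 2: the commenter info span
        for name in info.find_all('a'):                            # layer 3: the name link
            print(name.get_text())
    for text in comment.find_all('p'):                             # the comment body
        print(text.get_text())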
