I have just started learning to write Python crawlers, so I tried crawling Douban for movie information. Here is the source code:
import re
import requests
from bs4 import BeautifulSoup
import urllib
import os


class movie:
    """Fetch a single Douban movie page and print data extracted from it.

    ``start`` downloads the page once and hands the HTML text to
    ``gettit``, ``getinfo`` and ``getping``, each of which prints what
    it finds to standard output.
    """

    def __init__(self):
        # Address of the page to download.
        self.url = "https://movie.douban.com/subject/25933890/?tag=%E7%83%AD%E9%97%A8&from=gaia_video"
        # Request headers carrying a desktop-browser User-Agent string.
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }

    def getpag(self):
        """Download ``self.url`` and return the body decoded as UTF-8 text.

        Returns:
            The response body as ``str``.

        Raises:
            requests.RequestException: if the HTTP request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # The second positional parameter of requests.get is ``params``
        # (query string), so the headers have to be passed by keyword to
        # actually be sent as HTTP headers.
        response = requests.get(self.url, headers=self.head)
        return response.content.decode('utf-8')

    def gettit(self, page):
        """Print the title and score matched in ``page`` by regular expression.

        Args:
            page: HTML text of the movie page.
        """
        title = r'<span property="v:itemreviewed">(.+?)</span>'
        power = r'<strong class="ll rating_num" property="v:average">(.+?)</strong>'
        tit = re.findall(title, page)
        powe = re.findall(power, page)
        # re.findall returns a list; the title is printed as the list's
        # string form and the score as the list itself.
        tit = str(tit)
        print(tit, ' \n ')
        print(" Douban score: ", powe, ' \n ')

    def getinfo(self, page):
        """Print the text of every ``<div class="info">`` element in ``page``.

        Args:
            page: HTML text of the movie page.
        """
        soup = BeautifulSoup(page, "lxml")
        for info in soup.find_all('div', 'info'):
            print(info.get_text())

    def getping(self, page):
        """Print link text and paragraph text of every ``<div class="comment">``.

        For each comment div, the text of every ``<a>`` inside its
        ``<span class="comment-info">`` elements is printed first
        (presumably the commenter's name -- confirm against the live
        page), followed by the text of every ``<p>`` in the div.

        Args:
            page: HTML text of the movie page.
        """
        soup = BeautifulSoup(page, "lxml")
        for comment in soup.find_all('div', 'comment'):
            # Search each span individually: find_all returns a result
            # set, which cannot itself be searched with find_all.
            for info_span in comment.find_all('span', class_='comment-info'):
                for link in info_span.find_all('a'):
                    print(link.get_text())
            for paragraph in comment.find_all('p'):
                print(paragraph.get_text())

    def start(self):
        """Download the page and run the three extraction steps in order."""
        page = self.getpag()
        self.gettit(page)
        self.getinfo(page)
        self.getping(page)


movie().start()
Crawled successfully
I am using the BeautifulSoup library. It can break the HTML code down by tag and can also read tag attributes; you can look up the details yourself. It is very powerful for writing crawlers.
The idea behind my code is to use BeautifulSoup together with for loops, searching down layer by layer to find the data you want.