1. 打开页面获取源代码
1.1 urllib模块
import urllib.request

# Fetch the movie page with the standard library and print its decoded text.
# NOTE(review): the site is reported to reject the default "Python-urllib"
# User-Agent with an error status - confirm; a browser-like value is sent here.
req = urllib.request.Request(
    'https://movie.douban.com/subject/3168101/?from=showing',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# The context manager closes the connection even if read()/decode() raises;
# the timeout keeps a stalled server from blocking the script forever.
with urllib.request.urlopen(req, timeout=10) as response:
    # Prefer the charset announced in Content-Type; fall back to UTF-8.
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset)
print(html)
1.2 requests模块
import requests

# Fetch the movie page with requests and print its decoded text.
# NOTE(review): the site is reported to reject the library's default
# User-Agent - confirm; a browser-like value is sent here.
response = requests.get(
    'https://movie.douban.com/subject/3168101/?from=showing',
    headers={'User-Agent': 'Mozilla/5.0'},
    # requests has no default timeout; without one the call can hang forever.
    timeout=10,
)
# Fail loudly on 4xx/5xx instead of treating an error page as the movie page.
response.raise_for_status()
html = response.text
print(html)
2. 获取需要的信息
2.1 re正则
# `re` is not imported by the earlier snippets, so import it here; without
# this the findall call below raises NameError.
import re

# Collect the inner text of every <span class="short"> element in `html`
# (the page text produced by one of the fetch snippets).
# re.DOTALL lets `.` cross line breaks, so a comment whose text spans several
# lines is captured instead of being silently skipped.
get_re = re.findall(r'<span class="short">(.*?)</span>', html, re.DOTALL)
print(get_re)
2.2 BeautifulSoup
from bs4 import BeautifulSoup

# Build a parse tree from `html` using the lxml backend.
get = BeautifulSoup(html, 'lxml')
# All elements whose CSS class is "short".
a = get.find_all(class_="short")
# Only the first element whose CSS class is "short".
b = get.find(class_="short")
# Show the single match first, then the full list.
print(b)
print(a)
2.3 XPath
from lxml import etree

# Text nodes of the <span> reached through the fifth <div> child of the
# element whose id is "hot-comments".
comment_xpath = '//*[@id="hot-comments"]/div[5]/div/p/span/text()'

# Parse `html` into an lxml element tree. This rebinds `html` from whatever it
# held before to the parsed root element.
html = etree.HTML(html)
get = html.xpath(comment_xpath)
print(get)