1. Scrape the Douban Books page at https://book.douban.com/. The code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the Douban Books page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    "Referer": "https://www.douban.com/"
}
url = "https://book.douban.com/"
response = requests.get(url, headers=headers)
text = response.text
# with open("book.html", "wb") as fp:
#     fp.write(response.content)

# 2. Extract the target content from the HTML with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='list-col list-col5 list-express slide-item']")[0]
# print(etree.tostring(ul, encoding="utf-8").decode("utf-8"))
# Save the <ul> fragment to an HTML file for inspection
# with open("ul.html", "wb") as fp:
#     fp.write(etree.tostring(ul, encoding="utf-8"))
lis = ul.xpath(".//li")
# print(etree.tostring(lis[0], encoding="utf-8").decode("utf-8"))
# Loop over the <li> elements and pull out the fields we need
books = []
for li in lis:
    meta = li.xpath(".//div[@class='more-meta']")[0]
    # /text() selects the text inside the element;
    # strip() removes leading and trailing whitespace
    title = meta.xpath(".//h4[@class='title']/text()")[0].strip()
    author = meta.xpath(".//span[@class='author']/text()")[0].strip()
    year = meta.xpath(".//span[@class='year']/text()")[0].strip()
    publisher = meta.xpath(".//span[@class='publisher']/text()")[0].strip()
    abstract = meta.xpath(".//p[@class='abstract']/text()")[0].strip()
    book = {
        "title": title,
        "author": author,
        "year": year,
        "publisher": publisher,
        "abstract": abstract
    }
    books.append(book)

# 3. Output the scraped book information
print(books)
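Step 3 above only prints the result. A minimal sketch of actually persisting it would dump the books list to a JSON file; the helper name save_books and the filename douban_books.json are illustrative choices, not part of the original script.

# coding:utf-8
import json

def save_books(books, path="douban_books.json"):
    # Dump the list of book dicts to a UTF-8 JSON file;
    # ensure_ascii=False keeps the Chinese text human-readable.
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(books, fp, ensure_ascii=False, indent=2)

# Usage after the scraping loop: save_books(books)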
2. Scrape the Douban Movies "now playing in Beijing" page at https://movie.douban.com/cinema/nowplaying/beijing/. The code is as follows:
# -*- coding:utf-8 -*-
import requests
from lxml import etree

# 1. Fetch the target page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36",
    "Referer": "https://movie.douban.com/"
}
url = "https://movie.douban.com/cinema/nowplaying/beijing/"
response = requests.get(url, headers=headers)
text = response.text
# Note the difference between response.content (bytes) and response.text (decoded str):
# bytes must be written in binary mode
with open("responses.html", "wb") as fp:
    fp.write(response.content)

# 2. Extract the data from the fetched page with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    # Each field is stored as a data-* attribute on the <li>
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    duration = li.xpath("@data-duration")[0]
    region = li.xpath("@data-region")[0]
    thumbnail = li.xpath(".//img/@src")[0]
    movie = {
        "title": title,
        "score": score,
        "duration": duration,
        "region": region,
        "thumbnail": thumbnail
    }
    movies.append(movie)
print(movies)
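The note about response.content versus response.text is worth spelling out: content is the raw bytes received from the server, while text is a str decoded with the encoding requests guesses. A small sketch of the difference, assuming the same request as above (the output file names are illustrative):

# coding:utf-8
import requests

response = requests.get("https://movie.douban.com/cinema/nowplaying/beijing/",
                        headers={"User-Agent": "Mozilla/5.0"})

print(type(response.content))  # <class 'bytes'>: raw bytes from the wire
print(type(response.text))     # <class 'str'>: decoded using response.encoding
print(response.encoding)       # the encoding requests inferred from the response headers

# bytes go to a binary-mode file, str to a text-mode file
with open("nowplaying_raw.html", "wb") as fp:
    fp.write(response.content)
with open("nowplaying_decoded.html", "w", encoding="utf-8") as fp:
    fp.write(response.text)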
3. Crawl the 电影天堂 (Dytt) movie site, starting from http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html. The code is as follows:
# coding:utf-8
import requests
from lxml import etree

BASED_URL = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
}


def get_detailed_urls(url):
    # 1. Fetch the listing page
    response = requests.get(url, headers=HEADERS)
    # Note: decoding response.content by hand is error-prone here, so use response.text
    text = response.text
    # 2. Collect the links to the detail pages and prepend the site root
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a//@href")
    detailed_urls = map(lambda href: BASED_URL + href, hrefs)
    return detailed_urls


def parse_detailed_page(url):
    movie = {}
    # url = "http://www.ygdy8.net/html/gndy/dyzz/20180603/56925.html"
    # 1. Fetch the detail page; the site is GBK-encoded, so decode explicitly
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode("gbk")
    # 2. Pull out the fields we need
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    infor = html.xpath("//div[@id='Zoom']//span//text()")
    other_name = infor[2].replace("◎片 名", "").strip()
    movie["other_name"] = other_name
    year = infor[3].replace("◎年 代", "").strip()
    movie["year"] = year
    country = infor[4].replace("◎产 地", "").strip()
    movie["country"] = country
    typing = infor[5].replace("◎类 别", "").strip()
    movie["typing"] = typing
    return movie


def spider():
    movies = []
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, 8):
        url = base_url.format(i)
        detailed_urls = get_detailed_urls(url)
        for detailed_url in detailed_urls:
            movie = parse_detailed_page(detailed_url)
            movies.append(movie)
            for x in movie:
                print(movie[x])
            break  # debugging: stop after the first detail page
        break  # debugging: stop after the first listing page


if __name__ == '__main__':
    spider()
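The two break statements in spider() are only there for debugging, so the run stops after the first detail page. A sketch of what a full run might look like once they are removed, reusing get_detailed_urls and parse_detailed_page from above; the function name spider_all, the one-second delay, and the file name dytt_movies.json are assumptions for illustration:

# coding:utf-8
import json
import time

def spider_all(pages=7, out_path="dytt_movies.json"):
    # Crawl the first `pages` listing pages, parse every detail page,
    # and dump the collected movies to a JSON file.
    movies = []
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, pages + 1):
        for detailed_url in get_detailed_urls(base_url.format(i)):
            movies.append(parse_detailed_page(detailed_url))
            time.sleep(1)  # be polite: pause between requests
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(movies, fp, ensure_ascii=False, indent=2)
    return movies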