# Fetch web page data through a proxy IP: scrapes the Maoyan TOP100 board.
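# Assumed setup (not stated in the original): a local proxy pool service in the
# style of jhao104/proxy_pool listening on 127.0.0.1:5010, plus the packages
# requests, fake-useragent and lxml installed.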
import requests
# fake_useragent keeps a pool of User-Agent strings up to date;
# ua.random returns a freshly randomized value on every access.
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser
from requests.exceptions import ConnectionError
def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").text


def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
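# Note: the two helpers above assume /get/ returns a bare "ip:port" string.
# Newer releases of the proxy_pool project return JSON instead, in which case
# something like requests.get(...).json().get("proxy") would be needed.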
def get_list(url):
    # Record the proxy globally so parse_list can delete it on failure.
    global proxy
    headers = {
        'User-Agent': ua.random,
    }
    proxy = get_proxy()
    print('Requesting page {} via proxy IP {}'.format(url, proxy))
    proxies = {'http': 'http://' + proxy}
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code == 200:
            print('{} fetched successfully'.format(url))
            return response.text
        else:
            print('{} returned an abnormal status'.format(url))
            return None
    except ConnectionError:
        print('Connection to host failed for {}'.format(url))
        return None
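# Note: get_list maps a proxy for the 'http' scheme only, matching the original
# setup. If maoyan.com redirects to https, an 'https' entry pointing at the
# same proxy would also be required in the proxies dict.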
def parse_list(html, url):
    if html is not None:
        # etree.HTML accepts a custom parser (there is a default one);
        # forcing UTF-8 here avoids mis-decoded Chinese text.
        html_obj = etree.HTML(html, parser=HTMLParser(encoding='utf-8'))
        # Check <title> to tell a real data page from Maoyan's
        # access-control (anti-bot) page.
        title = html_obj.xpath('//head/title/text()')[0]
        if "TOP100榜" in title:
            # The page actually contains board data.
            dds = html_obj.xpath('//dd')
            for dd in dds:
                # ranking
                rank = dd.xpath('i/text()')[0]
                div = dd.xpath('div/div/div')
                # movie title
                name = div[0].xpath('p/a/text()')[0]
                # starring cast
                zhuyan = div[0].xpath('p[@class="star"]/text()')[0]
                # release date
                date = div[0].xpath('p[@class="releasetime"]/text()')[0]
                # The score is split across two <i> nodes (integer and
                # fraction parts); search relative to the current dd
                # and join the pieces into one string.
                parts = dd.xpath(
                    './/div[contains(@class, "movie-item-number score-num")]'
                    '/p[@class="score"]/i/text()')
                score = ''.join(parts)
                res_dict = {
                    "rank": rank,
                    "name": name,
                    "zhuyan": zhuyan,
                    "date": date,
                    "score": score,
                }
                print(res_dict)
        else:
            # The page carries no data: Maoyan served its access-control
            # page. Drop the current proxy IP from the pool's Redis store
            # and re-request the same url.
            print('{} returned a page without data'.format(url))
            delete_proxy(proxy)
            html = get_list(url)
            parse_list(html, url)
    else:
        # The request failed; re-request the same url.
        html = get_list(url)
        parse_list(html, url)
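# Note: parse_list retries a blocked url by calling itself recursively. If the
# pool keeps handing out bad proxies this can exceed Python's recursion limit;
# a bounded retry loop would be safer in production.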
def main():
    # The TOP100 board is paginated 10 movies per page via ?offset=.
    for x in range(0, 100, 10):
        url = 'http://maoyan.com/board/4?offset={}'.format(x)
        html = get_list(url)
        if html:
            parse_list(html, url)
if __name__ == '__main__':
    proxy = ""
    ua = UserAgent()
    main()