import requests, json
# fake_useragent maintains a pool of real-world User-Agent strings; ua.random returns a random one each time it is accessed.
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser
from requests.exceptions import ConnectionError
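# Crawls every page of a Baidu Tieba thread (tid 689121746): each page of floor posts is fetched
# through a random proxy and User-Agent, the sub-comments of every floor are pulled from the
# totalComment JSON endpoint, and the combined result is printed per floor.
# Assumption: a local proxy pool service is running at 127.0.0.1:5010 and exposes /get/ and
# /delete/ endpoints (the URLs match the open-source proxy_pool project).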
# Fetch a proxy IP from the local proxy pool.
def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").text
# Remove a dead proxy from the pool.
def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
def get_list(url):
    headers = {
        'User-Agent': ua.random,
    }
    proxy = get_proxy()
    print('Using proxy IP {} to request page {}'.format(proxy, url))
    # Note: only plain http:// URLs go through this proxy; an 'https' entry would be needed to proxy HTTPS requests as well.
    proxies = {'http': 'http://' + proxy}
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code == 200:
            print('{} fetched successfully'.format(url))
            return response.text
        else:
            print('{} returned an unexpected status'.format(url))
            return None
    except ConnectionError:
        print('{} connection to host failed'.format(url))
        return None
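# totalComment is Tieba's JSON endpoint for the sub-comments of every floor on a page.
# Judging from the keys accessed below, the response is roughly of the form
#   {"data": {"comment_list": {"<post_id>": {"comment_info": [{"username": ..., "content": ...}, ...]}}}}
# (fields other than the ones actually read here are omitted).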
def get_comment(page_num):
    headers = {
        'User-Agent': ua.random,
    }
    url = 'https://tieba.baidu.com/p/totalComment?t=1528876530434&tid=689121746&fid=1141317&pn={}&see_lz=0'.format(page_num)
    response = requests.get(url, headers=headers)
    json_obj = json.loads(response.text)
    comment_list = json_obj['data']['comment_list']
    print(comment_list)
    return comment_list
def parse_list(html, url, x):
    comment_json = get_comment(x)
    if html is not None:
        html_obj = etree.HTML(html, parser=HTMLParser(encoding='utf-8'))
        divs = html_obj.xpath('//div[contains(@class, "l_post")]')
        for div in divs:
            # Advertisement floors carry a span with class "label_text"; skip them.
            guanggao = div.xpath('div[contains(@class, "d_post_content_main")]//span[@class="label_text"]')
            if guanggao:
                print('Advertisement, skipping...')
                continue
            # Author nickname
            nick_name = div.xpath('div[@class="d_author"]//a[contains(@class,"p_author_name")]/text()')[0]
            # Floor content
            content_div = div.xpath('div[contains(@class, "d_post_content_main")]//div[contains(@class, "d_post_content")]')[0]
            # The content div id looks like "post_content_<post_id>", so the post id is the third "_"-separated part.
            article_id = content_div.xpath('@id')[0].split('_')[2]
            content = content_div.xpath('text()')[0]
            info = div.xpath('div[contains(@class, "d_post_content_main")]//span[@class="tail-info"]/text()')
            floor = info[0]
            datetime = info[1]
            comment_result = {}
            comment_result['nick_name'] = nick_name
            comment_result['content'] = content
            comment_result['floor'] = floor
            comment_result['datetime'] = datetime
            # comment_json is keyed by post id; check whether this floor has any sub-comments.
            if article_id in comment_json:
                # The floor has sub-comments
                comment_list = []
                comments = comment_json[article_id]['comment_info']
                for comment in comments:
                    comment_dic = {}
                    comment_dic['username'] = comment['username']
                    comment_dic['content'] = comment['content']
                    comment_list.append(comment_dic)
                comment_result['comment'] = comment_list
            else:
                # No sub-comments
                comment_result['comment'] = 'No comments'
            print(comment_result)
    else:
        # Re-request the current url and parse it again (this retries until a request succeeds).
        html = get_list(url)
        parse_list(html, url, x)
def main():
    # Crawl pages 1-13 of the thread.
    for x in range(1, 14):
        url = 'https://tieba.baidu.com/p/689121746?pn={}'.format(x)
        html = get_list(url)
        if html:
            parse_list(html, url, x)
if __name__ == '__main__':
    ua = UserAgent()
    main()