# xpath获取网页数据(二) — fetching web data with XPath (part 2)

import requests, re, json

# fake_useragent:实现了User-Agent的动态维护,利用它每次随机获取一个User-Agent的值。
from fake_useragent import UserAgent
from lxml.html import etree, HTMLParser
from requests.exceptions import ConnectionError

# Fetch a proxy address from the local proxy pool.
def get_proxy():
    """Return one proxy address (ip:port) from the local proxy-pool service.

    The body is stripped of surrounding whitespace/newlines so it can be
    embedded directly into a ``proxies`` dict as ``http://<proxy>``.

    NOTE(review): assumes a proxy-pool service is running on 127.0.0.1:5010
    (same service ``delete_proxy`` talks to) — confirm deployment.
    """
    # timeout guards against the pool service hanging the whole crawler.
    return requests.get("http://127.0.0.1:5010/get/", timeout=10).text.strip()


def delete_proxy(proxy):
    """Ask the local proxy-pool service to drop a dead proxy."""
    endpoint = "http://127.0.0.1:5010/delete/?proxy={}".format(proxy)
    requests.get(endpoint)


def get_list(url):
    """Download one topic page through a proxy from the local pool.

    Args:
        url: Page URL to fetch.

    Returns:
        The response body (str) on HTTP 200, otherwise ``None``.
    """
    headers = {
        'User-Agent': ua.random,  # fresh random UA per request (module-level pool)
    }
    proxy = get_proxy()
    print('正在使用代理IP{}请求页面{}'.format(proxy, url))
    proxies = {'http': 'http://' + proxy}
    try:
        # timeout: free pool proxies frequently hang; never block forever.
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    except requests.exceptions.RequestException:
        # RequestException covers ConnectionError, ProxyError, Timeout, etc.
        # The original only caught ConnectionError, so a timing-out proxy
        # crashed the crawler. Also evict the bad proxy from the pool —
        # delete_proxy existed but was never called.
        print('{}连接主机异常'.format(url))
        delete_proxy(proxy)
        return None
    if response.status_code == 200:
        print('{}请求成功'.format(url))
        return response.text
    print('{}请求异常'.format(url))
    return None


def get_comment(page_num):
    """Fetch the floor-comment (楼中楼) data for one page of the thread.

    Args:
        page_num: 1-based page number of the topic.

    Returns:
        The ``data.comment_list`` payload from the totalComment API —
        presumably keyed by post id; verify against ``parse_list`` usage.
    """
    headers = {
        'User-Agent': ua.random,  # fresh random UA per request
    }
    url = 'https://tieba.baidu.com/p/totalComment?t=1528876530434&tid=689121746&fid=1141317&pn={}&see_lz=0'.format(page_num)
    # timeout guards against a hung connection stalling the crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # response.json() is the idiomatic equivalent of json.loads(response.text).
    json_obj = response.json()
    comment_list = json_obj['data']['comment_list']
    print(comment_list)
    return comment_list


def parse_list(html, url, x):
    """Parse one topic page and print a result dict per post.

    Each dict carries the author nickname, post content, floor, datetime,
    and either the list of floor comments or the literal '没有评论'.

    Args:
        html: Page HTML, or ``None`` when the earlier fetch failed
            (triggers one re-fetch + re-parse of the same page).
        url: The page URL, needed for the retry path.
        x: 1-based page number, forwarded to ``get_comment``.
    """
    comment_json = get_comment(x)

    if html is not None:
        html_obj = etree.HTML(html, parser=HTMLParser(encoding='utf-8'))
        divs = html_obj.xpath('//div[contains(@class, "l_post")]')
        for div in divs:

            # Promoted posts carry a "label_text" badge — skip them.
            guanggao = div.xpath('div[contains(@class, "d_post_content_main")]//span[@class="label_text"]')
            if guanggao:
                print('广告,跳过...')
                continue

            # Author nickname (first matching anchor text).
            nick_name = div.xpath('div[@class="d_author"]//a[contains(@class,"p_author_name")]/text()')[0]
            # Post-content node.
            content_div = div.xpath('div[contains(@class, "d_post_content_main")]//div[contains(@class, "d_post_content")]')[0]

            # The node id has the numeric post id as its third '_' field.
            article_id = content_div.xpath('@id')[0].split('_')[2]
            content = content_div.xpath('text()')[0]
            info = div.xpath('div[contains(@class, "d_post_content_main")]//span[@class="tail-info"]/text()')
            floor = info[0]
            datetime = info[1]

            comment_result = {
                'nick_name': nick_name,
                'content': content,
                'floor': floor,
                'datetime': datetime,
            }

            # Look up this post's floor comments in the totalComment payload.
            if article_id in comment_json:
                comment_list = []
                for comment in comment_json[article_id]['comment_info']:
                    comment_list.append({
                        'username': comment['username'],
                        'content': comment['content'],
                    })
                comment_result['comment'] = comment_list
            else:
                comment_result['comment'] = '没有评论'

            print(comment_result)

    else:
        # Fetch failed earlier: retry with a fresh proxy.
        # BUG FIX: the original recursive call was parse_list(html, url),
        # dropping the page-number argument and raising TypeError on any
        # retry. NOTE(review): repeated failures still recurse without a
        # depth bound — consider a retry limit.
        html = get_list(url)
        parse_list(html, url, x)


def main():
    """Crawl pages 1-13 of the topic and parse every page that downloads."""
    for page_num in range(1, 14):
        page_url = 'https://tieba.baidu.com/p/689121746?pn={}'.format(page_num)
        page_html = get_list(page_url)
        if page_html:
            parse_list(page_html, page_url, page_num)


if __name__ == '__main__':
    # Module-level globals read by the helpers above.
    proxy = ""  # NOTE(review): never read — get_list fetches its own proxy each call
    ua = UserAgent()  # shared fake_useragent pool; .random yields a UA string per request
    main()

# 猜你喜欢 (blog "you may also like" residue)
#
# 转载自 (reposted from) blog.csdn.net/qq_42336542/article/details/80697831