Fetching Web Page Data with XPath (Part 1)

#  Fetch web page data through a proxy IP
import requests

# fake_useragent maintains a pool of User-Agent values; each access to ua.random returns a randomly chosen one.
from fake_useragent import UserAgent
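# (For example, one read of ua.random may yield a Chrome UA string and the
# next a Firefox one, so successive requests look less like a single client.)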
from lxml import etree
from requests.exceptions import ConnectionError


def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").text


def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


def get_list(url):
    # parse_list() deletes the proxy in use when a request gets blocked, so
    # record it in the module-level variable rather than a local one.
    global proxy
    headers = {
        'User-Agent': ua.random,
    }
    proxy = get_proxy()
    print('Using proxy IP {} to request page {}'.format(proxy, url))
    proxies = {'http': 'http://' + proxy}
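    # Note: requests keys this proxies dict by URL scheme; https:// pages would
    # also need an 'https' entry. The board URL used here is plain http://.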
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code == 200:
            print('{} fetched successfully'.format(url))
            return response.text
        else:
            print('{} returned an unexpected status'.format(url))
            return None
    except ConnectionError:
        print('Connection error while requesting {}'.format(url))
        return None


def parse_list(html, url):
    if html is not None:
        # A custom parser with the encoding pinned to utf-8; etree.HTML() uses
        # a default parser otherwise.
        html_obj = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))

        # Check the <title> to tell whether we got a page with real data or
        # Maoyan's access-control (anti-scraping) page.
        title = html_obj.xpath('//head/title/text()')[0]
        if "TOP100" in title:
            # The fetched page contains the board data.
            dds = html_obj.xpath('//dd')
            for dd in dds:
                # Ranking
                rank = dd.xpath('i/text()')[0]
                # Title, starring cast, and release date
                div = dd.xpath('div/div/div')
                name = div[0].xpath('p/a/text()')[0]
                zhuyan = div[0].xpath('p[@class="star"]/text()')[0].strip()
                date = div[0].xpath('p[@class="releasetime"]/text()')[0]

                # Score: Maoyan splits each score into an integer part and a
                # fraction part, e.g. <i>9.</i><i>5</i>. Query relative to this
                # <dd> (.// on dd, not //), otherwise the expression would
                # collect the scores of every movie on the page.
                score_parts = dd.xpath(
                    './/div[contains(@class, "movie-item-number score-num")]/p[@class="score"]/i/text()')
                score = ''.join(score_parts)

                res_dict = {
                    "rank": rank,
                    "name": name,
                    "zhuyan": zhuyan,
                    "date": date,
                    "score": score,
                }
                print(res_dict)

        else:
            # We got a page with no data (Maoyan's access-control page).
            print('{} returned a page with no data'.format(url))

            # Remove the current proxy IP from the pool's Redis store.
            delete_proxy(proxy)

            # Re-request the current url with a fresh proxy.
            html = get_list(url)
            parse_list(html, url)
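            # Note: this retry recurses with no depth limit; a long run of bad
            # proxies would eventually raise RecursionError.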
    else:
        # Re-request the current url.
        html = get_list(url)
        parse_list(html, url)


def main():
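    # The TOP100 board shows 10 movies per page; page through it with the
    # offset query parameter (0, 10, ..., 90).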
    for x in range(0, 100, 10):
        url = 'http://maoyan.com/board/4?offset={}'.format(x)
        html = get_list(url)
        if html:
            parse_list(html, url)


if __name__ == '__main__':
    proxy = ""
    ua = UserAgent()
    main()
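
To see the score handling in isolation, here is a minimal, self-contained sketch. The HTML fragment is made up for illustration, but it mirrors the structure of Maoyan's board page: each score is split across two <i> tags, and a relative XPath query on the <dd> element collects exactly that movie's two pieces.

from lxml import etree

snippet = '''
<dl>
  <dd>
    <i class="board-index">1</i>
    <div class="movie-item-number score-num">
      <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
    </div>
  </dd>
</dl>
'''

dd = etree.HTML(snippet).xpath('//dd')[0]
parts = dd.xpath('.//p[@class="score"]/i/text()')  # ['9.', '5']
print(''.join(parts))  # prints: 9.5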

Reposted from blog.csdn.net/qq_42336542/article/details/80697798