# Python scraping examples: requests + BeautifulSoup (bs4), lxml XPath, PyQuery

import requests
import requests.adapters

from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery as pq


def get_url_txt(url, headers, encoding, data=None):
    """Fetch *url* with a one-shot session and return the body as text.

    Parameters
    ----------
    url : str
        Target URL.
    headers : dict
        HTTP request headers (e.g. a User-Agent).
    encoding : str
        Charset used to decode the response body (assigned to
        ``response.encoding`` before reading ``.text``).
    data : optional
        Payload forwarded to ``session.get``; ``None`` means no body,
        which is exactly what requests does by default, so the original
        two-branch call is collapsed into one.

    Returns
    -------
    str
        Decoded body on HTTP 200, otherwise '' — errors are printed and
        swallowed, preserving the original best-effort contract.
    """
    ret = ''
    try:
        requests.adapters.DEFAULT_RETRIES = 5
        # `with` guarantees the session (and its pooled connections) is
        # released even if the request raises; the original leaked both
        # the session and the response on any exception.
        with requests.session() as session:
            session.keep_alive = False
            response = session.get(url, headers=headers, data=data)
            if response.status_code == 200:
                response.encoding = encoding
                ret = response.text
            response.close()
    except Exception as e:
        # Best-effort: report and return '' rather than propagate.
        print(e)
    return ret


def get_url_byte(url, headers, data_dict=None):
    """Fetch *url* with a one-shot session and return the raw body bytes.

    Parameters
    ----------
    url : str
        Target URL.
    headers : dict
        HTTP request headers (e.g. a User-Agent).
    data_dict : optional
        Payload forwarded to ``session.get``; ``None`` means no body,
        which matches the requests default, so the original two-branch
        call is collapsed into one.

    Returns
    -------
    bytes
        ``response.content`` on HTTP 200, otherwise b'' — errors are
        printed and swallowed, preserving the original best-effort
        contract.
    """
    ret = b''
    try:
        requests.adapters.DEFAULT_RETRIES = 5
        # `with` guarantees the session is released even if the request
        # raises; the original skipped both close() calls on exceptions.
        with requests.session() as session:
            session.keep_alive = False
            response = session.get(url, headers=headers, data=data_dict)
            if response.status_code == 200:
                ret = response.content
            response.close()
    except Exception as e:
        # Best-effort: report and return b'' rather than propagate.
        print(e)
    return ret


if __name__ == '__main__':
    # Demo: fetch the Baidu front page once, then locate the same anchor
    # element with three different parsers (bs4, lxml XPath, PyQuery).
    url1 = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    encoding = 'utf8'
    page = get_url_txt(url=url1, headers=headers, encoding=encoding)
    # print(page)
    """
    <a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
    selector    #u1 > a:nth-child(2)
    xpath       //*[@id="u1"]/a[2]
    full xpath  /html/body/div[1]/div[1]/div/div[3]/a[2]    
    """
    # --- BeautifulSoup: .get_text() / .get('<attribute>') ---
    soup = BeautifulSoup(page, 'lxml')
    soup.prettify()
    anchors = soup.select('#u1 > a:nth-child(2)')
    # for anchor in anchors:
    #     print(anchor.get_text(), anchor.get('href'))
    # --- lxml XPath: //text() for text nodes, //@<attribute> for attrs ---
    tree = etree.HTML(page)
    node_texts = tree.xpath('//*[@id="u1"]/a[2]//text()')
    node_hrefs = tree.xpath('//*[@id="u1"]/a[2]//@href')
    # print(node_texts, node_hrefs)
    # --- PyQuery: select by tag ('a'), by id ('#u1'), by class ('.mnav') ---
    doc = pq(page)
    tag_hits = doc('a')
    # print(len(tag_hits), tag_hits)
    # for item in tag_hits:
    #     print(pq(item).text(), pq(item).attr('href'))
    id_hits = doc('#u1')
    # print(len(id_hits), id_hits)
    class_hits = doc('.mnav')
    # print(len(class_hits), class_hits)
    # for item in class_hits:
    #     print(pq(item).text(), pq(item).attr('href'))

# Origin: www.cnblogs.com/dailycode/p/12466910.html