BeautifulSoup 解析工具与 CSS 选择器使用简介

# coding:utf-8

import requests
from lxml import etree
from bs4 import BeautifulSoup
import chardet
# Root address of the site being scraped.
BASE_DOMAIN = "http://www.ygdy8.net"

# Request headers sent with every download; the User-Agent string
# identifies the client as a desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.79 Safari/537.36"
    ),
}


def get_detailed_urls(url):
    """Download a listing page and print the text found in each table row.

    The page is fetched with the module-level ``HEADERS``, decoded as GBK
    (bytes that cannot be decoded are dropped), and parsed with
    BeautifulSoup using the lxml parser. For every ``<tr>`` element the
    non-empty, whitespace-stripped text fragments are collected into a list
    and printed, one list per row.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. The extracted row texts are only printed.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the timeout.
    """
    # 1. Download the page. requests applies no timeout by default, so an
    #    explicit one keeps a stalled server from blocking the run forever.
    response = requests.get(url, headers=HEADERS, timeout=10)

    # Per the original author's inspection the page declares charset
    # "gb2312". GBK is a superset of GB2312, and "ignore" drops any bytes
    # that still cannot be decoded instead of raising.
    text = response.content.decode("gbk", "ignore")

    # 2. Parse the decoded markup into a navigable tree.
    soup = BeautifulSoup(text, "lxml")

    # Other CSS-selector queries that can be run against the same soup:
    #   soup.select("tr")             -> every <tr> element
    #   soup.select("tr")[1]          -> the second <tr> element
    #   soup.select("table.tbspan")   -> <table> elements with class "tbspan"
    #   tag["href"] for each tag in soup.select("a") -> href of each <a>

    # 3. Print the text content of every table row. stripped_strings
    #    yields each text fragment inside the row with surrounding
    #    whitespace removed and blank fragments skipped.
    for row in soup.select("tr"):
        row_texts = list(row.stripped_strings)
        # Function-call form works on both Python 2 and Python 3.
        print(row_texts)


def spider():
    """Run the scraper over the numbered listing pages.

    Listing page addresses are built by substituting a page number into a
    URL template and each one is handed to ``get_detailed_urls``. The loop
    currently exits after its first iteration, so only page 1 is fetched.
    """
    # Un-numbered index page, kept for reference:
    #   http://www.ygdy8.net/html/gndy/dyzz/index.html
    page_template = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for page_number in range(1, 8):
        get_detailed_urls(page_template.format(page_number))
        # Stop after the first page; remove this break to walk pages 1-7.
        break


# Start the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    spider()

猜你喜欢

转载自blog.csdn.net/qq_42281053/article/details/80692351
今日推荐