# coding:utf-8
"""Scrape movie listing pages from www.ygdy8.net and print each table row's text.

A small demo of BeautifulSoup parsing with CSS selectors: fetch a listing
page, decode it, select every <tr>, and print the stripped text strings.
"""
import requests
from lxml import etree  # unused in this script; kept — BeautifulSoup names the "lxml" parser, and lxml must be installed
from bs4 import BeautifulSoup
import chardet  # unused after removing the dead encoding-detection call; kept in case other code relies on it

BASE_DOMAIN = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/67.0.3396.79 Safari/537.36")
}


def get_detailed_urls(url):
    """Fetch one listing page and print the stripped text of every <tr> row.

    :param url: absolute URL of a listing page on ygdy8.net
    """
    # 1. Fetch the raw page bytes.
    response = requests.get(url, headers=HEADERS)
    # The site declares charset gb2312; decode with the superset "gbk" and
    # drop undecodable bytes instead of raising UnicodeDecodeError.
    # (An earlier chardet.detect() call was dead code — its result was
    # computed but never used — so it has been removed.)
    text = response.content.decode("gbk", "ignore")
    # 2. Parse the document and print the text of every table row.
    soup = BeautifulSoup(text, "lxml")
    for tr in soup.select("tr"):
        infos = list(tr.stripped_strings)
        print(infos)  # print() form works under both Python 2 and 3


def spider():
    """Iterate listing pages list_23_1.html .. list_23_7.html and scrape each."""
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for page in range(1, 8):
        get_detailed_urls(base_url.format(page))
        # NOTE(review): this 'break' limits the crawl to page 1 only,
        # matching the original script (likely a debugging leftover).
        # Remove it to crawl all seven pages.
        break


if __name__ == '__main__':
    spider()
BeautifulSoup解析工具与css选择器使用简介
猜你喜欢
转载自blog.csdn.net/qq_42281053/article/details/80692351
今日推荐
周排行