1.HTML解析器有很多种,比如:
| 解析工具 | 解析速度 | 难度 |
| --- | --- | --- |
| BeautifulSoup | 最慢 | 最简单 |
| lxml | 快 | 简单 |
| 正则 | 最快 | 最难 |
2.Beautiful Soup解析工具的官方文档链接:https://www.crummy.com/software/BeautifulSoup/bs4/doc/
2.1获取所有"a"标签、2.2获取第2个"a"标签、2.3获取class='ulink'的"a"标签、2.4获取满足多个条件的"a"标签、2.5获取所有"a"标签的href属性、2.6获取纯文本text信息。示例代码如下:
# coding: utf-8
"""Scrape movie listing pages from ygdy8.net and demonstrate common
BeautifulSoup lookup techniques: find_all by tag / class / attrs,
attribute access, and plain-text extraction."""
import requests
from lxml import etree  # noqa: F401 -- kept from the original tutorial imports
from bs4 import BeautifulSoup
import chardet  # noqa: F401 -- kept from the original tutorial imports

BASE_DOMAIN = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    ),
}


def get_detailed_urls(url):
    """Download *url*, parse it with BeautifulSoup (lxml backend) and
    print the whitespace-stripped text of every ``<td colspan="2">``
    cell after the first.

    The comments below preserve (in English) the alternative lookup
    techniques the original tutorial demonstrated on this page.
    """
    # 1. Fetch the page. The site declares charset gb2312; "gbk" is a
    # superset of gb2312, so decode with it and ignore stray bytes.
    # (chardet.detect(response.content)["encoding"] gives the same answer,
    # but decoding as gbk directly is more forgiving.)
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode("gbk", "ignore")

    # 2. Parse the decoded HTML into a navigable element tree.
    soup = BeautifulSoup(text, "lxml")

    # 2.1 All <a> tags:            soup.find_all("a")          (each item is a bs4.element.Tag)
    # 2.2 Second <a> tag:          soup.find_all("a", limit=2)[1]
    # 2.3 <a> tags with class="ulink":
    #         soup.find_all("a", class_="ulink")
    #     or  soup.find_all("a", attrs={"class": "ulink"})
    # 2.4 Multiple conditions:
    #         soup.find_all("a", class_="ulink", href="/html/gndy/dyzz/20180605/56940.html")
    #     or  soup.find_all("a", attrs={"class": "ulink",
    #                                   "href": "/html/gndy/dyzz/20180605/56940.html"})
    # 2.5 href attribute of a tag: a["href"]  or  a.attrs["href"]
    # 2.6 Plain text of a tag:     a.string / a.strings / list(a.strings)
    #                              / a.stripped_strings / a.get_text()

    # 2.6 in action: skip the first <td colspan="2"> (a header cell) and
    # print the stripped text fragments of each remaining cell.
    for cell in soup.find_all("td", attrs={"colspan": "2"})[1:]:
        for info in cell.stripped_strings:
            print(info)
        print("=" * 30)


def spider():
    """Walk the paginated listing (pages 1-7) and scrape each page.

    NOTE(review): the trailing ``break`` limits the crawl to page 1,
    matching the original tutorial's test behaviour; remove it to crawl
    all seven pages.
    """
    # Listing index: http://www.ygdy8.net/html/gndy/dyzz/index.html
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for page in range(1, 8):
        get_detailed_urls(base_url.format(page))
        break  # deliberate: only the first page for now


if __name__ == '__main__':
    spider()