# coding=utf-8
# BeautifulSoup's main job is to parse and extract data from HTML.
# Common parsing tools: re, lxml, bs4

# pip install Beautifulsoup4

# from bs4 import BeautifulSoup

# Sample HTML document used by the (commented-out) BeautifulSoup examples below.
html = '''
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

'''
###############################################################################
# BeautifulSoup part
###############################################################################

# soup = BeautifulSoup(html, 'lxml')

# Four object types: Tag, NavigableString, BeautifulSoup, Comment

# print(soup.a)              # get the first a tag
# print(soup.a.get('href'))  # get an attribute of the a tag (the hyperlink)
# print(soup.a.text)         # all text under the a tag, including sub-tags' text
# print(soup.a.string)       # text directly under the a tag; None if it has multiple children


# Searching the document: find / find_all match by filter conditions

# String filter
# print(soup.find_all('a'))                       # match every a tag in the document
# print(soup.find_all(attrs={'class': 'title'}))  # match tags whose class is "title"

# Regular-expression filter
# import re
# print(soup.find_all(re.compile('^p')))  # tags whose name starts with p
# print(soup.find_all(re.compile('y$')))  # tags whose name ends with y
# print(soup.find_all(re.compile('t')))   # tags whose name contains t

# List filter
# for tag in soup.find_all(['a', 'b']):  # match a tags and b tags
#     print(tag)

# for tag in soup.find_all('p', class_='story'):  # match p tags with class="story"
#     print(tag)

# Passing a function to find_all as the filter condition
# def has_class_but_no_id(tag):
#     """Filter: the tag has a 'class' attribute but no 'id' attribute."""
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# for tag in soup.find_all(has_class_but_no_id):
#     print(tag)


# CSS selectors
# print(soup.select('title'))    # find by tag name
# print(soup.select('.sister'))  # find by class name
# print(soup.select('#link1'))   # find by id
# print(soup.select('p #link2')) # combined: element with id="link2" inside a p tag

# '>' only matches direct children, one level at a time
# print(soup.select('body > p .sister'))


# Search Baidu for "python", then query the returned page by attributes
# import requests
# url = 'http://www.baidu.com/s?wd=python'
# response = requests.get(url)  # raw page source, not rendered by JS
#
# soup = BeautifulSoup(response.text, 'lxml')

# Extract the search results from the returned page
# items = soup.find_all('div', class_='result c-container ')

# Print the search results
# for item in items:
#     print(item.select('h3 > a')[0].get('href'))  # the result link from the a tag
#     print(item.select('h3 > a')[0].get_text())

# #############################################################################
# xpath part
# wildcards:  //  @  *
# /  match from the current node      //  match anywhere in the document
# @  select an attribute              *   wildcard
###############################################################################

# Sample HTML document used by the (commented-out) xpath examples below.
html = '''
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# from lxml import etree
# e = etree.HTML(html)
# for i in e.xpath('//p'):  # find every p tag in the document
#     # print(i.xpath('string(.)'))  # all text under the tag, including sub-tags
#     print(i.text)  # text directly under the tag, excluding sub-tags

"""
# for i in e.xpath('//p/@class'):          # select the class attribute of p tags
# for i in e.xpath('//p[@class="title"]'): # find p tags with class="title"
# //title[@*]  matches all title tags that have at least one attribute
"""

# Search Baidu for "python" and extract the results with xpath
import requests
from lxml import etree  # fixed: was "from lxmlimport etree" (SyntaxError, missing space)

url = 'http://www.baidu.com/s?wd=python'  # fixed: stray spaces inside the URL literal
response = requests.get(url)  # raw page source, not rendered by JS
tree = etree.HTML(response.text)

# Each organic Baidu result is a <div class="result c-container "> —
# the trailing space is part of the attribute value and must be kept.
items = tree.xpath('//div[@class="result c-container "]')
for item in items:
    # print(item.xpath('h3/a/@href'))  # the result's link
    print(item.xpath('h3/a')[0].xpath('string(.)'))  # the result's title text