# coding=utf-8
# BeautifulSoup's main job is to parse and extract data from HTML.
# Common parsing tools: re, lxml, bs4

# pip install Beautifulsoup4

# from bs4 import BeautifulSoup

# Sample HTML document used by the (commented-out) BeautifulSoup examples below.
html = '''
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

'''
###############################################################################
# BeautifulSoup part
###############################################################################

# soup = BeautifulSoup(html, 'lxml')

# Four object types: Tag, NavigableString, BeautifulSoup, Comment

# print(soup.a)              # get the first a tag
# print(soup.a.get('href'))  # get an attribute of the a tag (the hyperlink)
# print(soup.a.text)         # all text under the a tag, including sub-tags' text
# print(soup.a.string)       # text directly under the a tag; None if it has multiple children


# Searching the document: find / find_all match by filter conditions

# String filter
# print(soup.find_all('a'))                       # match every a tag in the document
# print(soup.find_all(attrs={'class': 'title'}))  # match tags whose class is "title"

# Regular-expression filter
# import re
# print(soup.find_all(re.compile('^p')))  # tags whose name starts with p
# print(soup.find_all(re.compile('y$')))  # tags whose name ends with y
# print(soup.find_all(re.compile('t')))   # tags whose name contains t

# List filter
# for tag in soup.find_all(['a', 'b']):  # match a tags and b tags
#     print(tag)

# for tag in soup.find_all('p', class_='story'):  # match p tags with class="story"
#     print(tag)

# Passing a function to find_all as the filter condition
# def has_class_but_no_id(tag):
#     """Filter: the tag has a 'class' attribute but no 'id' attribute."""
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# for tag in soup.find_all(has_class_but_no_id):
#     print(tag)


# CSS selectors
# print(soup.select('title'))    # find by tag name
# print(soup.select('.sister'))  # find by class name
# print(soup.select('#link1'))   # find by id
# print(soup.select('p #link2')) # combined: element with id="link2" inside a p tag

# '>' only matches direct children, one level at a time
# print(soup.select('body > p .sister'))


# Search Baidu for "python", then query the returned page by attributes
# import requests
# url = 'http://www.baidu.com/s?wd=python'
# response = requests.get(url)  # raw page source, not rendered by JS
#
# soup = BeautifulSoup(response.text, 'lxml')

# Extract the search results from the returned page
# items = soup.find_all('div', class_='result c-container ')

# Print the search results
# for item in items:
#     print(item.select('h3 > a')[0].get('href'))  # the result link from the a tag
#     print(item.select('h3 > a')[0].get_text())

# #############################################################################
# xpath part
# wildcards:  //  @  *
# /  match from the current node      //  match anywhere in the document
# @  select an attribute              *   wildcard
###############################################################################

# Sample HTML document used by the (commented-out) xpath examples below.
html = '''
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# from lxml import etree
# e = etree.HTML(html)
# for i in e.xpath('//p'):  # find every p tag in the document
#     # print(i.xpath('string(.)'))  # all text under the tag, including sub-tags
#     print(i.text)  # text directly under the tag, excluding sub-tags

"""
# for i in e.xpath('//p/@class'):          # select the class attribute of p tags
# for i in e.xpath('//p[@class="title"]'): # find p tags with class="title"
# //title[@*]  matches all title tags that have at least one attribute
"""

# Search Baidu for "python" and extract the results with xpath
import requests
from lxml import etree  # fixed: was "from lxmlimport etree" (SyntaxError, missing space)

url = 'http://www.baidu.com/s?wd=python'  # fixed: stray spaces inside the URL literal
response = requests.get(url)  # raw page source, not rendered by JS
tree = etree.HTML(response.text)

# Each organic Baidu result is a <div class="result c-container "> —
# the trailing space is part of the attribute value and must be kept.
items = tree.xpath('//div[@class="result c-container "]')
for item in items:
    # print(item.xpath('h3/a/@href'))  # the result's link
    print(item.xpath('h3/a')[0].xpath('string(.)'))  # the result's title text