Basic knowledge of python crawler - the use of requests and bs4

Basic usage of requests

1. get usage

without parameters

 #!\user\bin\python
 #-*-coding:utf-8-*
 import requests
 url="http://www.baidu.com"
 r=requests.get(url)
 print r.text

get request with parameters

#!\user\bin\python
#-*-coding:utf-8-*-
import requests
url="http://www.baidu.com"
payload={'key1':'value1','key2':'value2'}
r=requests.get(url,params=payload)
print r.url
print r.text

Adding headers to a get request

import requests
 
payload = {'key1': 'value1', 'key2': 'value2'}
headers = {'content-type': 'application/json'}
r = requests.get("http://httpbin.org/get", params=payload, headers=headers)
print r.url

>>> import l8
http://www.baidu.com/?key2=value2&key1=value1

2. post request

An http request consists of three parts, which are the request line, the request header, and the message body, similar to the following:

  • request line 
  • request header 
  • message body

The HTTP protocol stipulates that the data submitted by the post must be placed in the message body, but the protocol does not stipulate what encoding method must be used. The server learns how the message body in the request is encoded according to the Content-Type field in the request header, and then parses the message body . Specific encoding methods include:

  • Application/x-www-form-urlencoded 
    is the most common way to submit data by post, which is to submit data in the form of a form.
  • application/json 
    submits data as a json string.
  • multipart/form-data 
    is generally used to upload files.

Send post request in form

Requests supports sending post requests in the form of a form. You only need to construct the parameters of the request into a dictionary, and then pass it to the data parameter of requests.post().

url = 'http://httpbin.org/post'
d = {'key1': 'value1', 'key2': 'value2'}
r = requests.post(url, data=d)
print r.text

output result

{ 
“args”: {}, 
“data”: “”, 
“files”: {}, 
“form”: { 
“key1”: “value1”, 
“key2”: “value2” 
}, 
“headers”: { 
…… 
“Content-Type”: “application/x-www-form-urlencoded”, 
…… 
}, 
“json”: null, 
…… 
}

 

Send post request in json form

You can pass a string of json to the data parameter of requests.post(),

url = 'http://httpbin.org/post'
s = json.dumps({'key1': 'value1', 'key2': 'value2'})
r = requests.post(url, data=s)
print r.text

output result

{ 
“args”: {}, 
“data”: “{\”key2\”: \”value2\”, \”key1\”: \”value1\”}”, 
“files”: {}, 
“form”: {}, 
“headers”: { 
…… 
“Content-Type”: “application/json”, 
…… 
}, 
“json”: { 
“key1”: “value1”, 
“key2”: “value2” 
}, 
…… 
}

Through the above method, we can POST data in JSON format

If you want to upload a file, you can use the files parameter directly

Create a new a.txt file and write Hello World!

 

Send post request as multipart

Requests also supports sending post requests in the form of multipart, just pass a file to the files parameter of requests.post().

url = 'http://httpbin.org/post'
files = {'file': open('report.txt', 'rb')}
r = requests.post(url, files=files)
print r.text

Output result


{ 
“args”: {}, 
“data”: “”, 
“files”: { 
“file”: “Hello world!” 
}, 
“form”: {}, 
“headers”: {…… 
“Content-Type”: “multipart/form-data; boundary=467e443f4c3d403c8559e2ebd009bf4a”, 
…… 
}, 
“json”: null, 

 

 

Basic usage of beautifulsoup

Custom test html, get soup from html text

html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup
# 这里指定解析器为html.parser(python默认的解析器),指定html文档编码为utf-8
soup = BeautifulSoup(html,'html.parser',from_encoding='utf-8')
print type(soup)
print soup
#print soup的结果
<html>
<body>
<h1 id="title">Hello World</h1>
<a class="link" href="#link1">This is link1</a>
<a class="link" href="#link2">This is link2</a>
</body>
</html>

# 输出:<class 'bs4.BeautifulSoup'>

1.soup.select() function usage

Get the content of the specified tag

# Grab the <h1> tag via a CSS selector; `html` is defined in the snippet above.
from bs4 import BeautifulSoup as bs
soup=bs(html,'html.parser')
header = soup.select('h1')# select() returns a list of matching tags
print type(header)# <type 'list'>
print header# the whole list: [<h1 id="title">Hello World</h1>]
print header[0]# the first match, a Tag object
print type(header[0])# <class 'bs4.element.Tag'>
print header[0].text# the tag's text content: Hello World

# 输出
'''
<type 'list'>
[<h1 id="title">Hello World</h1>]
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
'''

html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup as bs
soup=bs(html,'html.parser',from_encoding='utf-8')
a_links=soup.select('a')
l=[x.text for x in a_links]
print l
print a_links
print type(a_links)
print a_links[0]
print type(a_links[0])
print a_links[0].text
print a_links[0].text

>>> import l9
[u'This is link1', u'This is link2']
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
<type 'list'>
<a class="link" href="#link1">This is link1</a>
<class 'bs4.element.Tag'>
This is link1
This is link1
>>> 

 

2. Get the content of the tag with the specified id (with '#')



html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup as bs
soup=bs(html,'html.parser',from_encoding='utf-8')
title=soup.select('#title')
print title
print type(title)
print title[0]
print type(title[0])
print title[0].text


>>> import l9
[<h1 id="title">Hello World</h1>]
<type 'list'>
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
>>> 

 

3. Get the content of the tag of the specified class (use '.')

from bs4 import BeautifulSoup as bs
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
h=soup.select('a.link')
print h
print [x.text for x in h]
for i in [x.text for x in h]:
    print i

>>> import l9
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
[u'This is link1', u'This is link2']
This is link1
This is link2


一.回顾
1.在前面的笔记中,学习了三种抓取办法。

使用select()函数获取标签,但是获取标签的方法有三种;第一种是直接获取的标签('tag'),第二种方法是获取id的属性('#id属性'),第三种方法是获取class属性('.class属性')

2.前面的笔记根据html页面的特性进行的:

(1)select('tag')可以获取所有的tag
(2)“#”用于获取制定id的内容
(3)“.”用于获取指定class的标签内容
二.下面介绍以下剩余的标签
1.获取a标签的链接(href属性值)


2.获取一个标签下所有的子标签的text

代码示例:

from bs4 import BeautifulSoup as bs
import requests

# Extract every <a> tag, print their text contents (as a list, then one per
# line), and finally print the href attribute of the first link.
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup = bs(html, 'html.parser')
anchor_tags = soup.select('a')
link_texts = [tag.text for tag in anchor_tags]
print (link_texts)
for text in link_texts:
    print (text)
print (anchor_tags[0]['href'])

输出结果:

['This is link1', 'This is link2']
This is link1
This is link2
#link1

from bs4 import BeautifulSoup as bs
import requests
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
# Select the <h1> tag and collect its text into a list.
a=soup.select('h1')
b=[x.text for x in a]
print(b)
# NOTE(review): the triple-quoted block below is a no-op string literal kept
# as commented-out scratch code; it duplicates the earlier '#title' and
# '.link' selector examples and is never executed.
'''soup=bs(html,'html.parser')
a=soup.select('#title')
b=[x.text for x in a]
print (b)
soup=bs(html,'html.parser')
alinks=soup.select('a')
soup=bs(html,'html.parser')
h_id=soup.select('.link')
a=[x.text for x in h_id]

print (h_id[0]['href'])
print(a)
a=[x.text for x in alinks]
print (a)
for i in a:
    print (i)
print (alinks[0]['href'])'''


4. Get the text of all subtags under a tag

from bs4 import BeautifulSoup as bs
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
h=soup.select('body')[0]
print type(h)
print h
print h.text


#输出结果
<class 'bs4.element.Tag'>

<body>
<h1 id="title">Hello World</h1>
<a class="link" href="#link1">This is link1</a>
<a class="link" href="#link2">This is link2</a>
</body>

Hello World
This is link1
This is link2

5.Soup.find() and soup.find_all() function usage

find() and find_all() function prototypes

  • Both the find and find_all functions can find tag objects from html text according to multiple conditions, but the return object type of find is bs4.element.Tag, which is the first found Tag that satisfies the conditions.
  • The return object of find_all is bs4.element.ResultSet (actually the Tag list).
find(name=None, attrs={}, recursive=True, text=None, **kwargs) 
#其中name、attrs、text的值都支持正则匹配。
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) 
#其中name、attrs、text的值都支持正则匹配

Code example:


    find_all( name , attrs , recursive , text , **kwargs )

    find( name , attrs , recursive , text , **kwargs )




name 参数
name 参数可以查找所有名字为 name 的tag,字符串对象会被自动忽略掉.

简单的用法如下:

    soup.find_all("title")
    # [<title>The Dormouse's story</title>]

复制代码




keyword 参数
如果一个指定名字的参数不是搜索内置的参数名,搜索时会把该参数当作指定名字tag的属性来搜索,如果包含一个名字为 id 的参数,Beautiful Soup会搜索每个tag的”id”属性.


    soup.find_all(id='link2')
    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
    如果传入 href 参数,Beautiful Soup会搜索每个tag的”href”属性:

    搜索指定名字的属性时可以使用的参数值包括 字符串 , 正则表达式 , 列表, True .

    来段代码:
    from bs4 import BeautifulSoup as bs
    html = '''<table border=16 width='66%' align='center'>
                    <thead align='center'>
                            <caption>鱼C信息</caption>
                            <tr>
                                    <td colspan="3">鱼C信息表</td>
                            </tr>
                            <tr>
                                    <th id='th1'>姓名</th>
                                    <th id='th2'>年龄</th>
                                    <th id='th3'>颜值</th>
                            </tr>
                    </thead>
                    <tbody align='center'>
                            <tr>
                                    <td>不二如是:</td>
                                    <td>18</td>
                                    <td>下一位更帅~</td>
                            </tr>
                            <tr>
                                    <td>小甲鱼老湿:</td>
                                    <td>28</td>
                                    <td>下一位更帅~</td>
                            </tr>
                            <tr>
                                    <td>MSK:</td>
                                    <td>16</td>
                                    <td>第一位最帅~</td>
                            </tr>
                            <tr>
                                    <td colspan='3'>村里有个姑娘叫小花~</td>
                            </tr>
                    </tbody>       
            </table>'''
    soup = bs(html,'html.parser')

复制代码


ps:在这段代码中,只有<th>标签拥有id

当name传入字符串('a')时,将会查找所有标签名为a的Tag

    temp = soup.find_all('tr')
    temp
    #[<tr>
    <td colspan="3">鱼C信息表</td>
    </tr>, <tr>
    <th id="th1">姓名</th>
    <th id="th2">年龄</th>
    <th id="th3">颜值</th>
    </tr>, <tr>
    <td>不二如是:</td>
    <td>18</td>
    <td>下一位更帅~</td>
    </tr>, <tr>
    <td>小甲鱼老湿:</td>
    <td>28</td>
    <td>下一位更帅~</td>
    </tr>, <tr>
    <td>MSK:</td>
    <td>16</td>
    <td>第一位最帅~</td>
    </tr>, <tr>
    <td colspan="3">村里有个姑娘叫小花~</td>
    </tr>]

复制代码


传入正则表达式时re.compile('a'),将查找所有包含'a'的Tag

    soup.find_all(href=re.compile("elsie"))
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

复制代码


传入列表时,将查找所有包含列表中元素的Tag

    soup.find_all(['th','td'])
    [<td colspan="3">鱼C信息表</td>, <th id="th1">姓名</th>, <th id="th2">年龄</th>, <th id="th3">颜值</th>, <td>不二如是:</td>, <td>18</td>, <td>下一位更帅~</td>, <td>小甲鱼老湿:</td>, <td>28</td>, <td>下一位更帅~</td>, <td>MSK:</td>, <td>16</td>, <td>第一位最帅~</td>, <td colspan="3">村里有个姑娘叫小花~</td>]

复制代码


传入True时,我不会解释,你自己看:

    soup.find_all(id=True)
    [<th id="th1">姓名</th>, <th id="th2">年龄</th>, <th id="th3">颜值</th>]

复制代码

将所有具有id属性的Tag查找了出来


text参数
通过 text 参数可以搜索文档中的字符串内容.与 name 参数的可选值一样, text 参数接受 字符串 , 正则表达式 , 列表, True

    soup.find_all(text='下一位更帅~')
    #['下一位更帅~', '下一位更帅~']
    soup.find_all(text=re.compile('帅'))
    #['下一位更帅~', '下一位更帅~', '第一位最帅~']
    soup.find_all(text=True)
    #['\n', '\n', '鱼C信息', '\n', '\n', '鱼C信息表', '\n', '\n', '\n', '姓名', '\n', '年龄', '\n', '颜值', '\n', '\n', '\n', '\n', '\n', '不二如是:', '\n', '18', '\n', '下一位更帅~', '\n', '\n', '\n', '小甲鱼老湿:', '\n', '28', '\n', '下一位更帅~', '\n', '\n', '\n', 'MSK:', '\n', '16', '\n', '第一位最帅~', '\n', '\n', '\n', '村里有个姑娘叫小花~', '\n', '\n', '\n']

复制代码



limit 参数
限制返回个数

recursive 参数
指定为True时,搜索范围是子孙节点,如果设为False,只搜索子节点




 

bs4 supplement: reprinted from https://www.jianshu.com/p/a2d68ae3d02d

 

soup = BeautifulSoup(open("index.html"), "lxml")
使用 request 向服务器请求网页

wb_data = requests.get("http://www.baidu.com")    # 获得完整的 HTTP response

使用 beautifulsoup 解析网页

soup = BeautifulSoup(wb_data.text,'lxml')   # use `.text` to extract the HTTP body, i.e. the HTML document (fixed typo: Beautifulsoup -> BeautifulSoup)

Search the document tree

Describe where the element to be crawled is, get a list of elements/tags

filter type

  • string

  • re

  • list

If you pass in a list parameter, Beautiful Soup will return content that matches any element in the list. The following code finds all <a> tags and <b> tags in the document:

  • soup.find_all(["a", "b"])
  • # [<b>The Dormouse's story</b>,
  • # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
  • # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
  • # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

CSS selectors

Beautiful SoupSupports most CSS selectors , pass string parameters in the .select() method of the Tag or BeautifulSoup object, and you can use the syntax of CSS selectors to find tags.

xx = Soup.select()Fill in the path describing the location of the element to get a list of tags

Find tabs:

soup.select("title")
# [<title>The Dormouse's story</title>]

Find layer by layer through tag tags, 遍历子标签:

soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

Find under a tag 直接子标签:

soup.select("head > title")
# [<title>The Dormouse's story</title>]

soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

By CSS的类名finding:

soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

By tag的idfinding:

soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

是否存在某个属性Find by :

soup.select('a[href]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

属性的值Find by :

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]



Author: hypernet
link: https://www.jianshu.com/p/a2d68ae3d02d
Source: Jianshu
The copyright belongs to the author. For commercial reprints, please contact the author for authorization, and for non-commercial reprints, please indicate the source.

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324376886&siteId=291194637