python爬虫基础知识——requests、bs4的使用

requests基本用法

1.get用法

不带参数的

 #!\user\bin\python
 #-*-coding:utf-8-*
 import requests
 url="http://www.baidu.com"
 r=requests.get(url)
 print r.text

带参数的get请求

#!\user\bin\python
#-*-coding:utf-8-*-
import requests
url="http://www.baidu.com"
payload={'key1':'value1','key2':'value2'}
r=requests.get(url,params=payload)
print r.url
print r.text

加入headers

import requests
 
payload = {'key1': 'value1', 'key2': 'value2'}
headers = {'content-type': 'application/json'}
r = requests.get("http://httpbin.org/get", params=payload, headers=headers)
print r.url

>>> import l8
http://www.baidu.com/?key2=value2&key1=value1

2.post请求

一个http请求包括三个部分，分别为请求行、请求报头、消息主体，类似以下这样：

请求行

请求报头

消息主体

HTTP协议规定post提交的数据必须放在消息主体中，但是协议并没有规定必须使用什么编码方式。服务端通常是根据请求头中的Content-Type字段来获知请求中的消息主体是用何种方式进行编码，再对消息主体进行解析。具体的编码方式包括：

application/x-www-form-urlencoded
最常见post提交数据的方式，以form表单形式提交数据。
application/json
以json串提交数据。
multipart/form-data
一般用来上传文件。

以form形式发送post请求

Requests支持以form表单形式发送post请求，只需要将请求的参数构造成一个字典，然后传给requests.post()的data参数即可

url = 'http://httpbin.org/post'
d = {'key1': 'value1', 'key2': 'value2'}
r = requests.post(url, data=d)
print r.text

输出结果

{ 
“args”: {}, 
“data”: “”, 
“files”: {}, 
“form”: { 
“key1”: “value1”, 
“key2”: “value2” 
}, 
“headers”: { 
…… 
“Content-Type”: “application/x-www-form-urlencoded”, 
…… 
}, 
“json”: null, 
…… 
}

以json形式发送post请求

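可以将一个json串传给requests.post()的data参数。注意：这种方式不会自动设置Content-Type请求头，需要自行通过headers参数指定application/json；也可以直接把字典传给json参数（requests.post(url, json={...})），此时requests会自动完成编码并设置该请求头。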

url = 'http://httpbin.org/post'
s = json.dumps({'key1': 'value1', 'key2': 'value2'})
r = requests.post(url, data=s)
print r.text

输出结果

{ 
“args”: {}, 
“data”: “{\”key2\”: \”value2\”, \”key1\”: \”value1\”}”, 
“files”: {}, 
“form”: {}, 
“headers”: { 
…… 
“Content-Type”: “application/json”, 
…… 
}, 
“json”: { 
“key1”: “value1”, 
“key2”: “value2” 
}, 
…… 
}

通过上述方法，我们可以POST JSON格式的数据

如果想要上传文件，那么直接用 files 参数即可

新建一个 report.txt 的文件，内容写上 Hello world!

以multipart形式发送post请求

Requests也支持以multipart形式发送post请求，只需将一文件传给requests.post()的files参数即可。

url = 'http://httpbin.org/post'
files = {'file': open('report.txt', 'rb')}
r = requests.post(url, files=files)
print r.text

输出结果


{ 
“args”: {}, 
“data”: “”, 
“files”: { 
“file”: “Hello world!” 
}, 
“form”: {}, 
“headers”: {…… 
“Content-Type”: “multipart/form-data; boundary=467e443f4c3d403c8559e2ebd009bf4a”, 
…… 
}, 
“json”: null,

beautifulsoup基本用法

自定义测试html,从html文本中获取soup

html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
from bs4 import BeautifulSoup
# 这里指定解析器为html.parser（python默认的解析器），指定html文档编码为utf-8
soup = BeautifulSoup(html,'html.parser',from_encoding='utf-8')
print type(soup)
print soup
#print soup的结果
<html>
<body>
<h1 id="title">Hello World</h1>
<a class="link" href="#link1">This is link1</a>
<a class="link" href="#link2">This is link2</a>
</body>
</html>

# 输出：<class 'bs4.BeautifulSoup'>

1.soup.select()函数用法

获取指定标签的内容

from bs4 import BeautifulSoup as bs
soup=bs(html,'html.parser')
header = soup.select('h1')#是一个列表
print type(header)#是一个列表
print header#打印出一个列表，内容是一个html标签
print header[0]#打印出一个列表，内容是一个html标签
print type(header[0])#打出一个类，内容是一个tag标签
print header[0].text#打印出列表中的内容

# 输出
'''
<type 'list'>
[<h1 id="title">Hello World</h1>]
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
'''

      1 html = '''
      2 <html>
      3     <body>
      4         <h1 id="title">Hello World</h1>
      5         <a href="#link1" class="link">This is link1</a>
      6         <a href="#link2" class="link">This is link2</a>
      7     </body>
      8 </html>
      9                                 '''
     10 from bs4 import BeautifulSoup as bs
     11 soup=bs(html,'html.parser',from_encoding='utf-8')
     12 a_links=soup.select('a')
     13 l=[x.text for x in a_links]
     14 print l
     15 print a_links
     16 print type(a_links)
     17 print a_links[0]
     18 print type(a_links[0])
     19 print a_links[0].text
     20 print a_links[0].text

>>> import l9
[u'This is link1', u'This is link2']
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
<type 'list'>
<a class="link" href="#link1">This is link1</a>
<class 'bs4.element.Tag'>
This is link1
This is link1
>>>

2.获取指定id的标签的内容（用’#’）



html = '''
      2 <html>
      3     <body>
      4         <h1 id="title">Hello World</h1>
      5         <a href="#link1" class="link">This is link1</a>
      6         <a href="#link2" class="link">This is link2</a>
      7     </body>
      8 </html>
      9                                 '''
     10 from bs4 import BeautifulSoup as bs
     11 soup=bs(html,'html.parser',from_encoding='utf-8')
     12 title=soup.select('#title')
     13 print title
     14 print type(title)
     15 print title[0]
     16 print type(title[0])
     17 print title[0].text
     18 


>>> import l9
[<h1 id="title">Hello World</h1>]
<type 'list'>
<h1 id="title">Hello World</h1>
<class 'bs4.element.Tag'>
Hello World
>>>

3.获取指定class的标签的内容（用’.’）

from bs4 import BeautifulSoup as bs
      2 html = '''
      3 <html>
      4     <body>
      5         <h1 id="title">Hello World</h1>
      6         <a href="#link1" class="link">This is link1</a>
      7         <a href="#link2" class="link">This is link2</a>
      8     </body>
      9 </html>
     10         '''
     11 soup=bs(html,'html.parser')
     12 h=soup.select('a.link')
     13 print h
        print [x.text for x in h]
     14 for i in [x.text for x in h]:
     15     print i

>>> import l9
[u'This is link1', u'This is link2']
[<a class="link" href="#link1">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
This is link1
This is link2


一.回顾
1.在前面的笔记中，学习了三种抓取办法。

使用select()函数获取标签，但是获取标签的方法有三种；第一种是直接获取的标签('tag'),第二种方法是获取id的属性（'#id属性'）,第三种方法是获取class属性('.class属性')

2.前面的笔记根据html页面的特性进行的：

（1）select('tag')可以获取所有的tag
（2）“#”用于获取指定id的内容
（3）“.”用于获取指定class的标签内容
二.下面介绍一下剩余的用法
1.获取a标签的链接（href属性值）


2.获取一个标签下所有的子标签的text

代码示例：

from bs4 import BeautifulSoup as bs
import requests
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
alinks=soup.select('a')
a=[x.text for x in alinks]
print (a)
for i in a:
    print (i)
print (alinks[0]['href'])

输出结果：

['This is link1', 'This is link2']
This is link1
This is link2
#link1

from bs4 import BeautifulSoup as bs
import requests
html = '''
<html>
    <body>
        <h1 id="title">Hello World</h1>
        <a href="#link1" class="link">This is link1</a>
        <a href="#link2" class="link">This is link2</a>
    </body>
</html>
'''
soup=bs(html,'html.parser')
a=soup.select('h1')
b=[x.text for x in a]
print(b)
'''soup=bs(html,'html.parser')
a=soup.select('#title')
b=[x.text for x in a]
print (b)
soup=bs(html,'html.parser')
alinks=soup.select('a')
soup=bs(html,'html.parser')
h_id=soup.select('.link')
a=[x.text for x in h_id]

print (h_id[0]['href'])
print(a)
a=[x.text for x in alinks]
print (a)
for i in a:
    print (i)
print (alinks[0]['href'])'''

4.获取一个标签下的所有子标签的text

      1 from bs4 import BeautifulSoup as bs
      2 html = '''
      3 <html>
      4     <body>
      5         <h1 id="title">Hello World</h1>
      6         <a href="#link1" class="link">This is link1</a>
      7         <a href="#link2" class="link">This is link2</a>
      8     </body>
      9 </html>
     10                                 '''
     11 soup=bs(html,'html.parser')
     13 h=soup.select('body')[0]
     14 print type(h)
     15 print h
     17 print h.text


#输出结果
<class 'bs4.element.Tag'>

<body>
<h1 id="title">Hello World</h1>
<a class="link" href="#link1">This is link1</a>
<a class="link" href="#link2">This is link2</a>
</body>

Hello World
This is link1
This is link2

5.soup.find()和soup.find_all()函数用法

find()和find_all()函数原型

find和find_all函数都可根据多个条件从html文本中查找标签对象，只不过find的返回对象类型为bs4.element.Tag，为查找到的第一个满足条件的Tag。
而find_all的返回对象为bs4.element.ResultSet（实际上就是Tag列表）。

find(name=None, attrs={}, recursive=True, text=None, **kwargs) 
#其中name、attrs、text的值都支持正则匹配。
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) 
#其中name、attrs、text的值都支持正则匹配

代码示例：


    find_all( name , attrs , recursive , text , **kwargs )

    find( name , attrs , recursive , text , **kwargs )




name 参数
name 参数可以查找所有名字为 name 的tag,字符串对象会被自动忽略掉.

简单的用法如下:

    soup.find_all("title")
    # [<title>The Dormouse's story</title>]

复制代码




keyword 参数
如果一个指定名字的参数不是搜索内置的参数名,搜索时会把该参数当作指定名字tag的属性来搜索,如果包含一个名字为 id 的参数,Beautiful Soup会搜索每个tag的”id”属性.


    soup.find_all(id='link2')
    # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
    如果传入 href 参数,Beautiful Soup会搜索每个tag的”href”属性:

    搜索指定名字的属性时可以使用的参数值包括 字符串 , 正则表达式 , 列表, True .

    来段代码：
    from bs4 import BeautifulSoup as bs
    html = '''<table border=16 width='66%' align='center'>
                    <thead align='center'>
                            <caption>鱼C信息</caption>
                            <tr>
                                    <td colspan="3">鱼C信息表</td>
                            </tr>
                            <tr>
                                    <th id='th1'>姓名</th>
                                    <th id='th2'>年龄</th>
                                    <th id='th3'>颜值</th>
                            </tr>
                    </thead>
                    <tbody align='center'>
                            <tr>
                                    <td>不二如是：</td>
                                    <td>18</td>
                                    <td>下一位更帅~</td>
                            </tr>
                            <tr>
                                    <td>小甲鱼老湿：</td>
                                    <td>28</td>
                                    <td>下一位更帅~</td>
                            </tr>
                            <tr>
                                    <td>MSK：</td>
                                    <td>16</td>
                                    <td>第一位最帅~</td>
                            </tr>
                            <tr>
                                    <td colspan='3'>村里有个姑娘叫小花~</td>
                            </tr>
                    </tbody>       
            </table>'''
    soup = bs(html,'html.parser')

复制代码


ps:在这段代码中，只有<th>标签拥有id

当name传入字符串(例如'a')时，将会查找所有标签名为a的Tag

    temp = soup.find_all('tr')
    temp
    #[<tr>
    <td colspan="3">鱼C信息表</td>
    </tr>, <tr>
    <th id="th1">姓名</th>
    <th id="th2">年龄</th>
    <th id="th3">颜值</th>
    </tr>, <tr>
    <td>不二如是：</td>
    <td>18</td>
    <td>下一位更帅~</td>
    </tr>, <tr>
    <td>小甲鱼老湿：</td>
    <td>28</td>
    <td>下一位更帅~</td>
    </tr>, <tr>
    <td>MSK：</td>
    <td>16</td>
    <td>第一位最帅~</td>
    </tr>, <tr>
    <td colspan="3">村里有个姑娘叫小花~</td>
    </tr>]

复制代码


传入正则表达式（例如re.compile('a')）时，将查找所有标签名（或对应属性值）能匹配该正则表达式的Tag

    soup.find_all(href=re.compile("elsie"))
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

复制代码


传入列表时，将查找与列表中任一元素匹配的Tag

    soup.find_all(['th','td'])
    [<td colspan="3">鱼C信息表</td>, <th id="th1">姓名</th>, <th id="th2">年龄</th>, <th id="th3">颜值</th>, <td>不二如是：</td>, <td>18</td>, <td>下一位更帅~</td>, <td>小甲鱼老湿：</td>, <td>28</td>, <td>下一位更帅~</td>, <td>MSK：</td>, <td>16</td>, <td>第一位最帅~</td>, <td colspan="3">村里有个姑娘叫小花~</td>]

复制代码


传入True时，将匹配所有拥有该属性的Tag（无论属性值是什么），例如：

    soup.find_all(id=True)
    [<th id="th1">姓名</th>, <th id="th2">年龄</th>, <th id="th3">颜值</th>]

复制代码

将所有具有id属性的Tag查找了出来


text参数
通过 text 参数可以搜索文档中的字符串内容.与 name 参数的可选值一样, text 参数接受 字符串 , 正则表达式 , 列表, True

    soup.find_all(text='下一位更帅~')
    #['下一位更帅~', '下一位更帅~']
    soup.find_all(text=re.compile('帅'))
    #['下一位更帅~', '下一位更帅~', '第一位最帅~']
    soup.find_all(text=True)
    #['\n', '\n', '鱼C信息', '\n', '\n', '鱼C信息表', '\n', '\n', '\n', '姓名', '\n', '年龄', '\n', '颜值', '\n', '\n', '\n', '\n', '\n', '不二如是：', '\n', '18', '\n', '下一位更帅~', '\n', '\n', '\n', '小甲鱼老湿：', '\n', '28', '\n', '下一位更帅~', '\n', '\n', '\n', 'MSK：', '\n', '16', '\n', '第一位最帅~', '\n', '\n', '\n', '村里有个姑娘叫小花~', '\n', '\n', '\n']

复制代码



limit 参数
限制返回个数

recursive 参数
默认为True，此时搜索范围是所有子孙节点；如果设为False，只搜索直接子节点

bs4补充：转自https://www.jianshu.com/p/a2d68ae3d02d

soup = BeautifulSoup(open("index.html"), "lxml")
使用 requests 向服务器请求网页

wb_data = requests.get("http://www.baidu.com")    # 获得完整的 HTTP response

使用 beautifulsoup 解析网页

soup = Beautifulsoup(wb_data.text,'lxml')   # 用`.text`提取 HTTP 体，即 HTML 文档

搜索文档树

描述要爬取的元素在哪儿，获取元素/标签列表

过滤器类型

字符串
re
列表

如果传入列表参数,Beautiful Soup会将与列表中任一元素匹配的内容返回.下面代码找到文档中所有<a>标签和<b>标签:

soup.find_all(["a", "b"])
# [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

CSS选择器

Beautiful Soup支持大部分CSS选择器 ,在 Tag 或 BeautifulSoup 对象的 .select() 方法中传入字符串参数,即可使用CSS选择器的语法找到tag。

xx = soup.select() 中填入描述元素所在位置的路径（CSS选择器），获取标签列表

查找tag标签：

soup.select("title")
# [<title>The Dormouse's story</title>]

通过tag标签逐层查找，遍历子标签:

soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

找到某个tag标签下的直接子标签:

soup.select("head > title")
# [<title>The Dormouse's story</title>]

soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie"  id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

通过CSS的类名查找:

soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

通过tag的id查找:

soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

通过是否存在某个属性来查找:

soup.select('a[href]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

通过属性的值来查找:

soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

作者：超net
链接：https://www.jianshu.com/p/a2d68ae3d02d
来源：简书
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。