Some basic coding snippets for Python web scraping

#coding=utf-8
import requests
import re
from bs4 import BeautifulSoup

# Searching the document tree with BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>The Dormouse's <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;story11111111111111111</b></p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, features='lxml')
# soup = BeautifulSoup(open('index.php'), features='lxml')  # parse a local file instead

print(soup.prettify())  # pretty-print the parsed document

all_href = soup.find_all('a')  # collect every <a> tag in the page
print(all_href)
for link in all_href:
    print(link['href'])
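
# A minimal extra sketch (not in the original post): find a single tag by
# its id and read an attribute safely with .get().
link2 = soup.find('a', id='link2')
print(link2.get_text())  # Lacie
print(link2.get('href'))  # http://example.com/lacie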

story = soup.find('p', {'class': 'story'})  # search by tag name and attributes (this is not a regex search)
d_story = story.find_all('b')
for t in d_story:
    print(t.get_text())
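
# For comparison (my addition), the same search can be written as a CSS
# selector with soup.select(); it returns a list like find_all() does.
for b in soup.select('p.story b'):
    print(b.get_text())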

'''     
# GET request with query parameters
param = {"wd": "莫烦python"}
r = requests.get('https://www.baidu.com/s', params=param)
r.encoding = "utf-8"
print(r.text)
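
# A small check worth adding (my addition): raise_for_status() raises an
# exception on HTTP 4xx/5xx instead of silently returning an error page.
print(r.status_code)
r.raise_for_status()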

# POST request with form data
data = {'username': 'zhangsan', 'password': '1234560'}
url = "http://www.baidu.com"
r = requests.post(url, data=data)
print(r.text)
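
# A sketch (my addition): requests can also send a JSON body with the
# json= parameter and decode a JSON response with r.json().
r = requests.post(url, json={'username': 'zhangsan'})
# print(r.json())  # if the server answers with JSON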

# File upload
file = {'uploadFile': open('./imag.png', 'rb')}
r = requests.post(url, files=file)
print(r.text)
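
# A sketch (my addition): opening the upload inside a with-block makes
# sure the file handle is closed once the request finishes.
with open('./imag.png', 'rb') as fp:
    r = requests.post(url, files={'uploadFile': fp})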

# Cookie-based login with a Session
session = requests.Session()
payload = {'username': '12131321', 'password': '11111111'}
r = session.post('https://www.baidu.com/s', data=payload)  # post through the session so the cookies it returns are kept
print(r.cookies.get_dict())
r = session.get('URL of a page that requires the login')
print(r.text)
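
# Note (my addition): because the login POST went through the Session,
# its cookies live on the session and are sent automatically with every
# later session.get()/session.post() call.
print(session.cookies.get_dict())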

# Downloading a file (video, image, and so on)
root = 'directory to save the file into'
r = requests.get(IMAG_URL, stream=True)  # IMAG_URL: the address of the file to fetch
path = root + IMAG_URL.split('/')[-1]  # name the local copy after the last URL segment
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):  # a larger chunk_size (e.g. 8192) is usual for big files
        f.write(chunk)
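
# A sketch (my addition): shutil.copyfileobj over the raw stream is an
# equivalent way to save a large file without holding it in memory;
# note that r.raw does not decode gzip/deflate content encodings.
import shutil
with requests.get(IMAG_URL, stream=True) as resp, open(path, 'wb') as f:
    shutil.copyfileobj(resp.raw, f)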

# Printing \uXXXX-escaped titles as readable Chinese
# (get_div stands for the result of an earlier BeautifulSoup search)
import json
for l in get_div:
    ul = l('a')  # calling a tag is shorthand for find_all('a')
    title = ul[0]['title']
    # in Python 3 str is already Unicode, so the title prints directly;
    # json.dumps(title, ensure_ascii=False) also keeps the characters unescaped
    print(title)  # print the chapter title
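
# A sketch (my addition): if a str really contains literal \uXXXX escape
# sequences (say, read from a file), this round-trip decodes them:
s = '\\u7b2c\\u4e00\\u7ae0'
print(s.encode('ascii').decode('unicode-escape'))  # 第一章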
        
# Regular expressions
import re
reg = '<a title=(.*?) href="/lishi/268522/(.*?)">(.*?)</a>'
name_url = re.finditer(reg, html)
for m in name_url:
    print(m.group(2), m.group(1))
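
# A sketch (my addition): re.findall() returns all captured groups as
# tuples in a single call; re.compile() helps when the pattern is reused.
for title, _, text in re.compile(reg).findall(html):
    print(title, text)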
'''

Reposted from blog.csdn.net/qq_42133828/article/details/83548428