#coding=utf-8
import requests
import re
from bs4 import BeautifulSoup
#BeautifulSoup正则表达式搜索
# Sample document used to demonstrate basic BeautifulSoup queries.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>The Dormouse's <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;story11111111111111111</b></p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, features='lxml')
# soup = BeautifulSoup(open('index.php'))  # parse a local file instead

# Pretty-print the parsed tree.
# NOTE: original used Python 2 `print` statements and had the loop bodies
# unindented; both were syntax errors. Fixed to Python 3 print() calls.
print(soup.prettify())

# Collect and print every <a> tag and its href attribute.
all_href = soup.find_all('a')
print(all_href)
for link in all_href:
    print(link['href'])

# Search within a sub-tree: take the first <p class="story">, then the
# text of every <b> nested inside it.
story = soup.find('p', {'class': 'story'})
d_story = story.find_all('b')
for t in d_story:
    print(t.get_text())
# The triple-quoted string below is deliberately disabled reference code
# (requests GET/POST, file upload, cookies/session, file download,
# unicode-escape decoding, regex scraping). It is never executed.
# NOTE(review): the snippets assume names not defined here (IMAG_URL,
# imgss, get_div) and would need real URLs/credentials to run.
# Fixed inside the dead code: `request.post` -> `requests.post` (typo).
'''
#get
param = {"wd":"莫烦python"}
r=requests.get('https://www.baidu.com/s',params = param)
r.encoding="utf-8"
print r.text
#Post
data = {'username':'zhangsan','password':'1234560'}
url="http://www.baidu.com"
r=requests.post(url,data=data)
print r.text
#文件上传
file = {'uploadFile':open('./imag.png','rb')}
r = requests.post(url,files=file)
print r.text
#cookie登陆
session = requests.Session()
payload = {'username':'12131321','password':'11111111'}
r = requests.post('https://www.baidu.com/s',data=payload)
print r.cookies.get_dict()
r = session.get('登陆框的地址')
print r.text
#电影,图片,文件下载
root="文件存放地址"
r = requests.get(IMAG_URL, stream=True)
path =root + imgss.split('/')[-1]
with open(path, 'wb') as f:
for chunk in r.iter_content(chunk_size=32):
f.write(chunk)
#unicode字符转换为中文
import json
for l in get_div:
ul = l('a')
title = ul[0]['title']
print json.dumps(title).decode('unicode-escape') #输出章节
#正则表达式
import re
reg = '<a title=(.*?) href="/lishi/268522/(.*?)">(.*?)</a>'
name_url = re.finditer(reg,html)
for l in name_url:
print l.group(2), l.group(1)
'''
# python爬虫一些基本编码语句 (basic coding snippets for a Python web crawler)
# 转载自 (reprinted from) blog.csdn.net/qq_42133828/article/details/83548428
# (The lines above were pasted blog-page text — "猜你喜欢 / 今日推荐 / 周排行"
# boilerplate included — and caused a SyntaxError; kept here as comments.)