# day07 study notes
# 1. Web crawler basics: collect every link on a page
import requests
import re
# Fetch the duanziwang front page the way a browser would.
# The original first requested http://www.taobao.com and then overwrote that
# response without ever reading it, so the unused request has been dropped.
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get('http://duanziwang.com/', timeout=10)
data = response.text  # fixed: was the misspelled name `reponse` (NameError)

# `.` matches any character except a newline; `*?` repeats it as few times as
# possible, so the group captures each href value up to its closing quote.
res = re.findall(r'href="(.*?)"', data)
print(res)
# 2. Crawling duanziwang.com (joke site): titles and contents
import requests
import re
# Fetch the duanziwang front page and show the basic response metadata.
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get('http://duanziwang.com/', timeout=10)
print(response.status_code)
print(response.encoding)
data = response.text
print(data)

# `.` matches any character except a newline; `*?` repeats it as few times as
# possible, so each group captures the text between the surrounding tags.
content_res = re.findall(r'<div class="content">(.*?)</div>', data)
# fixed: the page text was never passed to findall (TypeError).
title_res = re.findall(r'<a href="/subject/">(.*?)</a>', data)

# Print where two known titles sit in the list; the original notes record the
# results as 9 and 60. list.index raises ValueError if a title is not present.
print(title_res.index('活得糊涂的人,容易幸福'))  # fixed: closing quote was missing
print(title_res.index('购买银行理财产品亏损后如何起诉'))
# NOTE(review): the slice bounds are hard-coded, presumably from the positions
# printed above - confirm they still match the live page.
title_res = title_res[10:60]
print(type(title_res))

# Pair each title with the content at the same position. zip stops at the
# shorter list, so a length mismatch no longer raises IndexError.
# fixed: the original shadowed the builtin `dict`, called the lists instead of
# indexing them (`titile(i)`, `content_res(i)`), and then iterated an
# undefined name (`title_content_dic`).
title_content_dic = dict(zip(title_res, content_res))

# Each item is a (title, content) tuple; left-align both columns.
for title, content in title_content_dic.items():
    print(f'{title:<40} {content:<1000}')
# 3. Crawling nipic.com: download images
import requests
import re
# Fetch the nipic listing page the way a browser would.
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(
    'http://www.nipic.com/design/acg/renwu/index.html?page=1', timeout=10)
data = response.text

# `.` matches any character except a newline; `*?` repeats it as few times as
# possible, so the group captures each data-src attribute value.
res = re.findall(r'data-src="(.*?)"', data)

# Download every captured URL into the current working directory.
# fixed: the loop body had lost its indentation (IndentationError).
for i in res:
    print(i)
    # The last path segment of the URL becomes the local file name.
    res_name = i.split('/')[-1]
    if not res_name:
        # A URL ending in '/' yields an empty name, which open() cannot create.
        continue
    res_response = requests.get(i, timeout=10)
    res_data = res_response.content
    # `with` closes the file even if the write fails; the original never
    # closed the handle it opened.
    with open(res_name, 'wb') as f:
        f.write(res_data)
# 4. Video crawler: download mp4 files
import requests
import re
# Fetch the video index page.
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
data = response.text

# Capture every plain <a href="..."> target on the index page. A narrower
# alternative tried in the original notes was '<a href="(.*?)" class="img">'.
mp4_res2 = re.findall(r'<a href="(.*?)">', data)

# fixed: the loop body had lost its indentation (IndentationError).
for i in mp4_res2:  # each i is an href string
    # Keep only the part of the href up to and including "htm".
    page_matches = re.findall(r'(.*?htm)', i)
    if not page_matches:
        # Not a link to an .htm page; the original raised IndexError here.
        continue
    res = 'http://www.mod.gov.cn/v/' + page_matches[0]
    page_response = requests.get(res, timeout=10)
    page_data = page_response.text

    # The detail page carries the video address after a "//Video" marker, e.g.
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    url_matches = re.findall(r'//Video(.*?.mp4)', page_data)
    if not url_matches:
        # Page has no video address; the original raised IndexError here.
        continue
    url_res = url_matches[0]

    mp4_response = requests.get(url_res, timeout=10)
    mp4_data = mp4_response.content
    # NOTE(review): every video is written to the same 'test.mp4', so each
    # download overwrites the previous one - confirm whether only one file is
    # wanted (the original carried a commented-out `break` at this point).
    # `with` closes the file even if the write fails; the original never
    # closed the handle it opened.
    with open('test.mp4', 'wb') as f:
        f.write(mp4_data)