2019年7月22日（day07）の学習ノート（Webクローラー）

day07 StudyNote

1.Webクローラー

import requests
import re

# Fetch the Taobao home page the same way a browser would request it.
response = requests.get('http://www.taobao.com')
# Fetch the duanziwang.com home page. This rebinds `response`, so only this
# second page is parsed below.
response = requests.get('http://duanziwang.com/')
data = response.text
# `.` matches any character and `*?` repeats it as few times as possible, so
# the group captures each href value up to its closing double quote.
res = re.findall(r'href="(.*?)"', data)
print(res)

2.段子網（duanziwang.com）のクローラー

import requests
import re

# Download the duanziwang.com home page and show basic response metadata.
response = requests.get('http://duanziwang.com/')
print(response.status_code)
print(response.encoding)
data = response.text
print(data)

# `.` matches any character and `*?` repeats it as few times as possible, so
# each group captures the text up to the nearest closing tag.
content_res = re.findall('<div class="content">(.*?)</div>', data)
title_res = re.findall('<a href="/subject/">(.*?)</a>', data)

# Locate the first and last real titles in the list of matches.
# NOTE: list.index raises ValueError if the title is no longer on the page;
# the positions in the trailing comments are the ones recorded in the notes.
print(title_res.index('活得糊涂的人，容易幸福'))  # recorded position: 9
print(title_res.index('购买银行理财产品亏损后如何起诉'))  # recorded position: 60
title_res = title_res[10:60]
print(type(title_res))

# Pair each kept title with the content at the same position. zip stops at
# the shorter list, so a length mismatch cannot raise IndexError.
title_content_dic = dict(zip(title_res, content_res))

# Each item is a (title, content) tuple; print both left-aligned in columns.
for title, content in title_content_dic.items():
    print(f'{title:<40} {content:<1000}')

3.昵図網（nipic.com）のクローラー

import requests
import re

# Fetch the nipic.com listing page the same way a browser would request it.
response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1')
data = response.text
# print(data)

# `.` matches any character and `*?` repeats it as few times as possible, so
# the group captures each data-src attribute value.
res = re.findall('data-src="(.*?)"', data)

# print(res)
for i in res:  # `res` is a list of the captured URL strings
    print(i)
    res_response = requests.get(i)
    res_data = res_response.content  # raw bytes of the response body
    res_name = i.split('/')[-1]  # last path segment is used as the file name

    # The context manager flushes and closes the file after every download,
    # so handles do not accumulate across iterations.
    with open(res_name, 'wb') as f:
        f.write(res_data)
    # f.flush()

4.動画のクローラー

import requests
import re

# Fetch the video index page of www.mod.gov.cn.
response = requests.get('http://www.mod.gov.cn/v/index.htm')
# response.encoding = 'utf8'
data = response.text
# print(data)

# Alternative pattern that only matches anchors carrying class="img":
# mp4_res1 = re.findall('<a href="(.*?)"  class="img">',data)
# for i in mp4_res1:
#     print(i)


# Collect the href value of every anchor whose tag ends right after the href.
mp4_res2 = re.findall('<a href="(.*?)">', data)

for i in mp4_res2:  # type:str
    # Keep the shortest prefix of the link that ends in "htm"; skip links
    # without one instead of failing with IndexError on an empty match list.
    page_matches = re.findall('(.*?htm)', i)
    if not page_matches:
        continue
    res = 'http://www.mod.gov.cn/v/' + page_matches[0]

    response = requests.get(res)
    data = response.text
    # Example of a video URL noted while exploring the site:
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    # Skip detail pages that do not contain a "//Video....mp4" fragment.
    url_matches = re.findall('//Video(.*?.mp4)', data)
    if not url_matches:
        continue
    url_res = url_matches[0]

    mp4_response = requests.get(url_res)
    mp4_data = mp4_response.content
    # NOTE(review): every video is written to the same 'test.mp4', so only
    # the last download survives - confirm whether that is intended.
    with open('test.mp4', 'wb') as f:
        f.write(mp4_data)
    # break
    # break

2019年7月22日（day07）の学習ノート（Webクローラー）

day07 StudyNote

1.Webクローラー

2.段子網（duanziwang.com）のクローラー

おすすめ