Table of Contents
Python web crawler
Install python package
pip install requests
Not every site can be crawled: many important sites have anti-scraping measures in place, so they cannot be crawled easily.
We therefore choose some easier-to-crawl sites to practice on.
requests the library built-in method
get (url) method
Use requests.get to send a request; it returns a response object containing the response data.
text method
Built-in requests method that returns the response body decoded as text (a str).
content method
Built-in requests method that returns the response body as raw bytes, suitable for binary data.
re library built-in method
findall method
Returns all non-overlapping matches of a regular expression in a string.
Useful when downloading images or video files; save such data in binary mode.
Example:
import requests
import re

# Fetch the page source of the quotes site.
page = requests.get("https://ishuo.cn/")
html = page.text

# "." matches any character; "*?" repeats it non-greedily (zero or more times).
entries = re.findall("<li class=\"list_li\">(.*?)</li>", html)  # every list item on the page

# Map each entry's title to its body text.
quotes = {}
for entry in entries:
    heading = re.findall("<a href=\"/subject/.*?\">(.*?)</a>", entry)[0]  # title text
    body = re.findall("<div class=\"content\">(.*?)</div>", entry)[0]  # body text
    quotes[heading] = body

for heading, body in quotes.items():
    print(f"标题:{heading} \n内容:{body}\n")
Next, pick one website to crawl text from and run word-frequency and word-cloud analysis on it; then pick another website to crawl image content from.
# 爬取文字并做词频和词云分析
# Crawl joke/story pages, then run word-frequency and word-cloud analysis.
import re
from collections import Counter

import requests
import jieba
import wordcloud

listing = requests.get("http://www.haha56.net/xiaohua/gushi/")
listing.encoding = "gb2312"  # the site serves GB2312-encoded pages

# Bug fix: escape the dot so ".html" matches literally (was the unescaped ".*?.html").
url_list = re.findall(r"<dt><a href=\"(.*?\.html)", listing.text)  # article URLs

for url in url_list:
    resp = requests.get(url)
    resp.encoding = "gb2312"
    html = resp.text
    title = re.findall("<div class=\"title\">(.*?)</div>", html)[0]  # article title
    fragments = re.findall("【1】 (.*?)” </div>", html)  # article body fragments
    content = "".join(fragments)  # O(n) join instead of quadratic "+=" concatenation

    # --- word-frequency analysis ---
    words = jieba.lcut(content)
    # Drop single characters and HTML-entity residue ("ldquo"/"rdquo" curly quotes).
    word_counts = Counter(
        w for w in words if len(w) > 1 and w not in ("ldquo", "rdquo")
    )
    # Print the ten most frequent (word, count) pairs, highest first.
    for pair in word_counts.most_common(10):
        print(pair)

    # --- word cloud ---
    # Feed "word:count" tokens; a CJK-capable font is required to render Chinese glyphs.
    fdata = ""
    for word, count in word_counts.most_common():
        fdata += word + ":" + str(count) + " "
    cloud = wordcloud.WordCloud(r"C:\Windows\Fonts\simkai.ttf")
    cloud.generate(fdata)
    cloud.to_file(title + ".png")
    break  # demo: process only the first article
# 爬取图片内容
# Crawl meme images from doutula.com and save each one to a local file.
import re
import requests

page = requests.get("https://www.doutula.com/")
html = page.text
# print(html)

# Each <img> tag carries its caption in alt="..." and its URL in data-original="...".
img_tags = re.findall("<img referrerpolicy=.*?alt=\".*?\">", html)
for tag in img_tags:
    title = re.findall("alt=\"(.*?)\"", tag)[0]  # image caption; may be empty
    img_url = re.findall("data-original=\"(.*?)\"", tag)[0]  # actual image URL
    if title == "":
        # Fall back to the file name portion of the URL when there is no caption.
        title = img_url.split("/")[-1]
    img = requests.get(img_url)
    ext = img_url.split(".")[-1]  # keep the original file extension
    # NOTE(review): title may contain characters invalid in file names — consider sanitizing.
    # Bug fix: use a context manager so the file handle is always closed and flushed
    # (the original opened the file and never closed it).
    with open(title + "." + ext, "wb") as f:
        f.write(img.content)
    print("拿到一张图片")