# Day 06 homework
# 1. Web crawler: downloading pictures
import re
import requests
def extract_image_urls(html):
    """Return the ``data-original`` URL of every matching ``<img>`` tag.

    Only tags written as ``<img referrerpolicy="..." data-original="...">``
    (in that attribute order) are matched.

    Args:
        html: Page source as text.

    Returns:
        A list of the captured ``data-original`` values, in page order.
    """
    return re.findall(
        r'<img referrerpolicy=".*?" data-original="(.*?)".*?>', html
    )


def download_images(page_url='https://www.doutula.com/'):
    """Download every image referenced on ``page_url``.

    Prints the HTTP status code and the detected encoding of the page, then
    prints each image URL and saves the image into the current working
    directory under the last path segment of its URL.

    Args:
        page_url: Address of the page to scan for images.
    """
    response = requests.get(page_url)
    print(response.status_code)
    print(response.encoding)

    for image_url in extract_image_urls(response.text):
        print(image_url)
        image_bytes = requests.get(image_url).content
        file_name = image_url.split('/')[-1]
        # The context manager closes the file even if the write fails;
        # previously one handle was leaked per downloaded image.
        with open(file_name, 'wb') as image_file:
            image_file.write(image_bytes)


if __name__ == '__main__':
    download_images()
# 2. Web crawler: word frequency statistics
import re
import requests
import jieba
def func(i):
    """Sort key: return the count from a ``(word, count)`` pair."""
    return i[1]


def count_words(words):
    """Count multi-character tokens and rank them by frequency.

    Args:
        words: Iterable of string tokens (for example the list returned by
            ``jieba.lcut``).

    Returns:
        A list of ``(word, count)`` tuples, highest count first. Tokens
        whose length is exactly 1 are skipped.
    """
    # Local import: the file's import lines do not provide Counter.
    from collections import Counter

    counts = Counter(word for word in words if len(word) != 1)
    data_list = list(counts.items())
    # Stable ascending sort followed by reverse(): words with equal counts
    # come out in reverse first-seen order, exactly as the original script
    # ranked them.
    data_list.sort(key=func)
    data_list.reverse()
    return data_list


def print_word_frequencies(url='http://www.haha56.net/a/2016/12/14140.html'):
    """Fetch ``url``, segment its ``<meta>`` contents and print word counts.

    Args:
        url: Page to download. The response body is decoded as GBK.
    """
    response = requests.get(url)
    response.encoding = 'gbk'
    res_content = re.findall(
        '<meta name=".*?" content="(.*?)">', response.text
    )
    # Join the captured strings instead of calling str() on the list: the
    # list repr adds brackets and quotes and escapes characters such as
    # U+3000, which then show up as bogus tokens like "u3000".
    text = ' '.join(res_content)
    print(type(text))  # output kept from the original script
    # jieba.lcut is the public API; jieba._lcut is a private alias of it.
    print(count_words(jieba.lcut(text)))


if __name__ == '__main__':
    print_word_frequencies()
# 3. Word cloud
import wordcloud
import re
import requests
def cloud_text(contents):
    """Join scraped strings into one space-separated text.

    Args:
        contents: Iterable of strings.

    Returns:
        The strings joined by single spaces; ``''`` for an empty input.
    """
    return ' '.join(contents)


def build_word_cloud(
    url='http://www.haha56.net/a/2016/12/14140.html',
    output_path='ciyun.png',
    font_path=r'C:\Windows\Fonts\simkai.ttf',
):
    """Render the ``<meta>`` contents of ``url`` as a word-cloud image.

    Prints the list of captured ``content`` attribute values, then writes
    the generated image to ``output_path``.

    Args:
        url: Page to download. The response body is decoded as GBK.
        output_path: File the rendered image is written to.
        font_path: Font file handed to ``wordcloud.WordCloud``.
    """
    response = requests.get(url)
    response.encoding = 'gbk'
    res_content = re.findall(
        '<meta name=".*?" content="(.*?)">', response.text
    )
    print(res_content)

    cloud = wordcloud.WordCloud(font_path=font_path)
    # Feed the joined strings rather than str(list): the list repr adds
    # brackets and quotes and escapes characters such as U+3000, which
    # would pollute the words extracted from the text.
    cloud.generate(cloud_text(res_content))
    cloud.to_file(output_path)


if __name__ == '__main__':
    build_word_cloud()