While crawling a site and doing some simple analysis, I ran into a few problems. I'm sharing them here to help others avoid the same mistakes:
First, the site's news pages live at paths like /info/1013/13930.htm, where 13930 is the ID of an individual article. The IDs increase over time but follow no regular pattern, so many values in any given range point to no page at all.
Solution:
Crawl the IDs as a plain numeric range; for an ID that does not exist, the site responds with an error page. So before extracting anything, first check whether the returned page contains the string 'Sorry, Page Not Found'; if it does, skip the URL and crawl only the pages that lack that marker.
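A minimal sketch of that check, assuming the site answers a nonexistent ID with an error page containing that English marker (the host is the placeholder used throughout this post, and is_missing is a hypothetical helper, not part of the final code):

import requests

def is_missing(url):
    'Return True when the site serves its "not found" page for this URL.'
    response = requests.get(url)
    return 'Sorry, Page Not Found' in response.text

url = 'http://www.xxxxxx.cn/info/1013/13930.htm'
if is_missing(url):
    print(url, 'does not exist, skipping')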
Second, during the crawl I found other pages whose content had been withdrawn (they show '该内容已经被撤销', "this content has been withdrawn"). I wanted to treat these as non-normal pages and skip them too, but my check never matched and I couldn't skip them.
Solution:
Check the page encoding: it is UTF-8.
When testing whether a page contains a given string, first set the response's encoding to UTF-8 so the text decodes correctly; this solves the problem:
response.encoding = 'utf-8'
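Without that line, requests guesses the charset from the HTTP headers and may decode the body as ISO-8859-1, so a Chinese marker string never matches response.text. A minimal sketch in context:

import requests

url = 'http://www.xxxxxx.cn/info/1013/13930.htm'  # any news page on the site
response = requests.get(url)
response.encoding = 'utf-8'  # force UTF-8 before any string test
if '该内容已经被撤销' in response.text:  # 'this content has been withdrawn'
    print(url, 'content withdrawn, skipping')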
Third, some news pages on the site contain no body text at all, only pictures.
Workaround: compare the HTML of a picture-only page with that of a normal news page.
The key tags differ: the body sits in a div whose ID is different on the two page types, id='vsb_content' on picture pages versus id='vsb_content_4' on news pages, so we can skip the id='vsb_content' pages. When making that test, check for 'vsb_content_4' first, because the string 'vsb_content' is contained in 'vsb_content_4' and would otherwise match every page; see the sketch below.
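A sketch of that ordering, assuming url points at a news page as above (the final code below also handles a third ID, vsb_content_501):

import requests
from bs4 import BeautifulSoup

response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.content, 'lxml')

# Test the longer ID first: the bare string 'vsb_content' occurs inside
# 'vsb_content_4', so testing it first would match news pages as well.
if 'vsb_content_4' in response.text:
    content_div = soup.find('div', attrs={'id': 'vsb_content_4'})
else:
    content_div = soup.find('div', attrs={'id': 'vsb_content'})  # picture page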
Fourth, after the crawl there was a bug: content was duplicated when the news was written to the CSV file.
Solution:
After writing the per-page list into the output list, clear it. The loop skips pages that don't exist, so on those iterations data_list still holds the rows from the last successful page and they would be appended again; see the sketch below.
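A sketch of the fix inside the crawl loop, assuming data_list is the per-page buffer that download fills on success:

for page in range(13795, 14000):
    url = f'http://www.xxxxxx.cn/info/1013/{page}.htm'
    self.download(url)          # appends this page's rows to self.data_list
    all_list += self.data_list
    self.data_list = []         # reset the buffer; otherwise a skipped page
                                # would re-append the previous page's rows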
The final code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: Mr.riy


import re
import requests
import csv
import time
import jieba
import jieba.analyse
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


class Downloader:
    def __init__(self):
        self.data_list = []

    def download(self, url, num_retries=3):
        'Check the page and skip known error pages'
        print('Downloading:', url)
        global response
        try:
            response = requests.get(url)
            response.encoding = 'utf-8'  # decode as UTF-8 so Chinese markers match
            if 'Sorry, Page Not Found' in response.text:
                print(url, 'page does not exist')
            elif '该内容已经被撤销' in response.text:  # content withdrawn
                print(url, 'page does not exist')
            elif response.status_code == 200:
                print('Download succeeded, extracting......')
                page = response.content
                self.find_all(page)
                time.sleep(1)
            else:
                # retry on server errors
                if num_retries > 0 and 500 <= response.status_code <= 600:
                    self.download(url, num_retries - 1)
        except RequestException as e:
            print(e)

    def find_all(self, page):
        'Extract title, time, author, and body text'
        soup_title = BeautifulSoup(page, 'lxml')
        sp_title_items = soup_title.find('h2', attrs={'align': 'center'})
        title = sp_title_items.text
        print(title)

        sp_time_items = soup_title.find('div', attrs={'style': 'line-height:400%;color:#444444;font-size:14px'})
        times = sp_time_items.text
        pub_time = re.findall(r'\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}', times)
        author = re.findall(r'作者:(.*)', times)

        global response
        # test the longer IDs first: 'vsb_content' is a substring of the others
        if 'vsb_content_4' in response.text:
            sp_words_items = soup_title.find('div', attrs={'id': 'vsb_content_4'})
        elif 'vsb_content_501' in response.text:
            sp_words_items = soup_title.find('div', attrs={'id': 'vsb_content_501'})
        else:
            sp_words_items = soup_title.find('div', attrs={'id': 'vsb_content'})

        words = sp_words_items.text
        row = [pub_time, author, words]
        self.data_list.append(row)

    def write_csv(self, filename, all_list):
        'Write all rows to a CSV file'
        with open(filename, 'w', encoding="utf-8-sig", newline='') as f:
            writer = csv.writer(f)
            fields = ('时间', '作者', '内容')  # time, author, content
            writer.writerow(fields)
            for row in all_list:
                writer.writerow(row)

    def fetch_data(self):
        'Crawl the configured range of page IDs'
        all_list = []
        for page in range(13795, 14000, 1):  # range of article IDs to crawl
            url = f'http://www.xxxxxx.cn/info/1013/{page}.htm'
            self.download(url)
            all_list += self.data_list
            self.data_list = []  # clear the buffer so skipped pages don't duplicate rows

        self.write_csv('data.csv', all_list)


class analyze:
    def get_all_text(self, filename):
        'Collect the body text of every article'
        comment_list = []
        with open(filename, encoding="utf-8-sig") as f:
            rows = csv.reader(f)
            for row in rows:
                one_comment = row[-1]
                comment_list.append(one_comment)

        return ''.join(comment_list[1:])

    def cut_text(self, all_text):
        'Extract the most important keywords from the text'
        jieba.analyse.set_stop_words('stop_words.txt')
        text_tags = jieba.analyse.extract_tags(all_text, topK=30)
        return text_tags


def main():
    temp = Downloader()
    temp.fetch_data()
    b = analyze()
    all_text = b.get_all_text('data.csv')
    text_tags = b.cut_text(all_text)
    print(text_tags)


if __name__ == "__main__":
    main()
Output of a run: the keywords appearing most often in recent news are 防疫 (epidemic prevention), 疫情 (the epidemic), and 工作 (work).