1. Choose a topic that interests you.
2. Write a crawler program in Python to collect data related to your topic from the Internet.
3. Perform text analysis on the crawled data to generate a word cloud.
4. Explain the text analysis results.
5. Write a complete blog post describing the implementation process above, the problems encountered and their solutions, and the data-analysis approach and conclusions.
6. Finally, submit all of the crawled data together with the crawler and data-analysis source code.
"""Crawler for NetEase (163.com) tech news.

Fetches article listings, then for each article prints its publication
time, source, top keywords (via jieba segmentation), and body text, and
saves the body to a local file.
"""
import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup


def getNewsDetail(newsUrl):
    """Fetch one article page, print its metadata, keywords and body,
    and save the body text to disk.

    Args:
        newsUrl: URL of a single 163.com article page.
    """
    resd = requests.get(newsUrl)
    # Site serves legacy Chinese encoding; requests guesses wrong without this.
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text
    # '.' deliberately matches any separator between date parts.
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    sources = re.search(r'source:\s*(.*)', info).group(1)
    topWords = getTopWords(content)
    print('Published time: {0}\nSource: {1}'.format(dateTime, sources))
    # Bug fix: print the words themselves, not the (word, count) tuples,
    # and don't raise IndexError when fewer than five keywords exist.
    keywords = [word for word, _ in topWords[:5]]
    print('Keywords: {}'.format(', '.join(keywords)))
    print(content)
    # Context manager guarantees the handle is closed even on write errors.
    with open("D:\python/test.txt", 'w', encoding='utf8') as f:
        f.write(content)


def getTopWords(content):
    """Segment *content* with jieba and count word frequencies.

    Args:
        content: article body text.

    Returns:
        List of (word, count) tuples sorted by count, descending.
    """
    # NOTE(review): this replaces each CHARACTER below with a space —
    # including the letters 'o', 'n', 'e' (likely a mistranslation of a
    # CJK character). Kept byte-identical to preserve behavior; confirm intent.
    separators = '''one! "",. ? ;'"',.,:\n'''
    for ch in separators:
        content = content.replace(ch, ' ')
    words = list(jieba.cut(content))
    # Stop words / noise tokens to drop (duplicates from the original removed;
    # sets deduplicate anyway).
    exclude = {'this', '\u3000', '\r', '\xa0', 'when', 'right', 'on', 'and',
               'etc', 'no', '', ' no', 'many', 'of', 'big', 'out', '_', 'to',
               'will', 'in', 'is', 'one', 'Also', '《', '》', '(', ')', 'I',
               'we', 'their', 'can', 'with', 'a ', 'Short', 'Medium', 'Yes',
               'Not'}
    # Counter counts in one O(n) pass instead of wordlist.count() per word (O(n^2)).
    counts = Counter(w for w in words if w not in exclude)
    return counts.most_common()


def getListPage(listUrl):
    """Fetch a listing page and process its first article.

    Args:
        listUrl: URL of a 163.com tech news listing page.
    """
    res = requests.get(listUrl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('#news-flow-content')[0].select('li'):
        url = new.select('a')[0]['href']
        title = new.select('a')[0].text
        print('Title: {0}\nLink: {1}'.format(title, url))
        getNewsDetail(url)
        # Only the first article per listing is crawled (sampling);
        # remove this break to crawl every article on the page.
        break


if __name__ == '__main__':
    # Entry-point guard keeps imports side-effect free while preserving
    # identical behavior when run as a script.
    listUrl = 'http://tech.163.com/internet/'
    getListPage(listUrl)
    for i in range(2, 10):
        listUrl = 'http://tech.163.com/special/it_2016_%02d/' % i
        getListPage(listUrl)