Web Crawler Assignment (modified)

1. Choose a topic that interests you.

2. Write a crawler in Python to collect data on that topic from the Internet.

3. Perform text analysis on the crawled data and generate a word cloud (a minimal word-cloud sketch follows the crawler code below).

4. Explain the text analysis results.

5. Write a complete blog post describing the implementation process, the problems encountered and their solutions, and the data analysis ideas and conclusions.

6. Finally, submit all the crawled data together with the crawler and data analysis source code (a sketch for persisting each article also follows the code below).

 

import requests
import re
import jieba
from bs4 import BeautifulSoup
from datetime import datetime


def getNewsDetail(newsUrl):
    # Fetch one article page; NetEase news pages use a GBK-family encoding
    resd = requests.get(newsUrl)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    # Article body and the time/source line
    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text
    date = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    # note: the live page labels the source in Chinese; adjust the pattern to the actual page text
    sources = re.search(r'source:\s*(.*)', info).group(1)
    topWords = getTopWords(content)
    print('Published time: {0}\nSource: {1}'.format(dateTime, sources))
    # topWords is a list of (word, count) pairs sorted by frequency
    print('Keywords: ' + ', '.join(w for w, c in topWords[:5]))
    print(content)
    # Save the article text for later analysis
    with open(r'D:\python\test.txt', 'w', encoding='utf8') as f:
        f.write(content)


def getTopWords(content):
    # Replace punctuation (ASCII and full-width) with spaces before segmenting
    punctuation = '！“”，。？、；’"\',.:：\n'
    for ch in punctuation:
        content = content.replace(ch, ' ')
    # jieba performs Chinese word segmentation
    wordlist = list(jieba.cut(content))
    # Stop words (translated from the original Chinese list) and whitespace artifacts
    exclude = {'this', '\u3000', '\r', '\xa0', 'when', 'right', 'on', 'and', 'etc',
               'no', '', ' ', 'many', 'of', 'big', 'out', '_', 'to', 'will', 'in',
               'is', 'one', 'also', '《', '》', '(', ')', 'I', 'we', 'their',
               'can', 'with', 'a', 'short', 'medium', 'yes', 'not'}
    # Count how often each remaining word appears
    counts = {}
    for word in set(wordlist) - exclude:
        counts[word] = wordlist.count(word)
    # Sort by frequency, most frequent first
    wordFreq = list(counts.items())
    wordFreq.sort(key=lambda x: x[1], reverse=True)
    return wordFreq

def getListPage(listUrl):
    # Fetch a list page and walk its article links
    res = requests.get(listUrl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('#news-flow-content')[0].select('li'):
        url = new.select('a')[0]['href']
        title = new.select('a')[0].text
        print('Title: {0}\nLink: {1}'.format(title, url))
        getNewsDetail(url)
        break  # crawl only the first article on each list page

# Crawl the main list page, then archive pages 2-9
listUrl = 'http://tech.163.com/internet/'
getListPage(listUrl)
for i in range(2, 10):
    listUrl = 'http://tech.163.com/special/it_2016_%02d/' % i
    getListPage(listUrl)
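
Task 3 calls for a word cloud, which the script above stops short of: it only prints the top keywords. Below is a minimal sketch of how the output of getTopWords could feed one. It assumes the third-party wordcloud package is installed (pip install wordcloud) and that a font covering Chinese glyphs is available; the font path and the makeWordCloud name are illustrative choices of mine, not part of the original script.

from wordcloud import WordCloud

def makeWordCloud(content, outPath='wordcloud.png'):
    # getTopWords returns (word, count) pairs; WordCloud wants a dict
    freqs = dict(getTopWords(content))
    wc = WordCloud(font_path='msyh.ttc',  # assumed Chinese-capable font
                   width=800, height=600, background_color='white')
    wc.generate_from_frequencies(freqs)
    wc.to_file(outPath)  # writes the image next to the script

Calling makeWordCloud(content) at the end of getNewsDetail would give one image per article; for the blog write-up, a single cloud over the concatenated text of all crawled articles is usually more readable.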
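
Task 6 also asks for the crawled data itself to be submitted, but getNewsDetail overwrites the same test.txt on every call. One way to keep every article is to append rows to a CSV file; the sketch below uses only the standard-library csv module, and the news.csv file name and appendNews helper are hypothetical names of mine.

import csv

def appendNews(title, url, dateTime, sources, csvPath='news.csv'):
    # Append one article per row so earlier articles are not overwritten
    with open(csvPath, 'a', newline='', encoding='utf8') as f:
        csv.writer(f).writerow([title, url, dateTime, sources])

Wiring this in would mean having getNewsDetail return dateTime and sources, so that getListPage, which already holds title and url, can pass all four fields.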