Collecting Hot Current-Affairs News Articles with a Python Crawler

1. Objectives
  1. Become proficient in the full workflow of collecting Internet data with a crawler;
  2. Learn about some common anti-crawler mechanisms used by websites.
2. Tools
  1. Programming language: Python;
  2. Development environment: PyCharm (or Sublime Text, Eclipse + PyDev, Visual Studio, etc.);
  3. Common modules: scrapy, urllib, requests, etc.; the scripts below use requests, bs4 (BeautifulSoup), re, and datetime (a short sketch of how they fit together follows this list).
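
A minimal sketch of how these modules fit together (illustrative only, not part of the assignment code): requests fetches a page over HTTP, bs4 parses the HTML into a navigable tree, re extracts patterns such as article URLs, and datetime records the download time.

import re
import datetime
import requests
import bs4

# Fetch a seed page; a browser-like User-Agent passes the most basic UA checks.
resp = requests.get('https://news.sina.com.cn/',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
resp.encoding = 'utf-8'

# Parse the HTML and collect every link on the page.
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
links = [a.get('href') for a in soup.find_all('a') if a.get('href')]

# Keep only absolute article-style URLs (.html/.shtml).
articles = [u for u in links if re.match(r'https?://\S+\.s?html$', u)]
print(datetime.datetime.now(), len(articles), 'candidate article links')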
3. Assignment

Collecting hot current-affairs news articles:

  1. Download only the newest, trending current-affairs news of the day;
  2. Save news from different websites in different folders, and record each article's source, title, publish time, download time, URL, and so on (a shared helper sketch follows this list);
  3. Crawler seed sites: Sina (news.sina.com.cn), Sohu (news.sohu.com), Phoenix (news.ifeng.com), NetEase (news.163.com), Baidu (news.baidu.com).
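
All five crawlers below end with the same bookkeeping, so requirement 2 can be factored into one shared helper. A possible sketch (the name save_article and its parameters are my own, not taken from the original code):

import os
import datetime

def save_article(site, title, source, publish_time, url, paragraphs):
    """Write one article into a per-site folder, then append its metadata."""
    os.makedirs(site, exist_ok=True)  # one folder per news site
    # Drop characters that are illegal in file names.
    safe_title = ''.join(c for c in title if c not in '\\/:*?"<>|').strip()
    download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(os.path.join(site, safe_title + '.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(paragraphs) + '\n')
        f.write('Download time: ' + download_time + '\n')
        f.write('Publish time: %s  Source: %s  Title: %s  URL: %s\n'
                % (publish_time, source, title, url))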
4. Procedure

1. Install Python and the PyCharm development environment, and install the modules the crawler needs (for example, pip install requests beautifulsoup4);

2. Set up the crawler projects and download hot news articles from the different news sites. (Attach detailed code, download screenshots, run results, and so on.)

1. Sina

import requests
import bs4
import re
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.sina.com.cn/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'ct_t_01'})  # container that holds the headline links
i = 0
href = []
for each in p1:
    href = each.select('a')  # <a> tags under the container
print(href)
href = str(href)
# Match absolute URLs ending in "ml" (.html/.shtml).
pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')
ls = pattern.findall(href)
print(ls)
title = [[] for _ in range(25)]   # preallocate slots for up to 25 articles
data = [[] for _ in range(25)]    # publish times
source = [[] for _ in range(25)]
while i < len(ls):
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    title[i] = soup.find('h1', class_='main-title').get_text()
    data[i] = soup.find('span', class_='date').text
    source[i] = soup.find('a', class_='source').text
    s1 = soup.findAll('div', {'class': 'article'})  # article body container
    hr = []
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    # Body paragraphs start with two full-width spaces (\u3000\u3000).
    findjs = re.compile(r'<p.*?>\u3000\u3000(.*?)</.*?>')
    js = findjs.findall(hr)
    # The 新浪 folder must already exist (or be created with os.makedirs).
    file3 = open(r'新浪/%s.txt' % title[i], 'w', encoding='UTF-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("爬取时间:" + time_str + "\n")
    file3.write("发布时间:     " + data[i] + "     来源:    " + source[i] + "      标题:   " + title[i] + "     网址:   " + ls[i] + "\n")
    file3.close()
    i = i + 1

2. Sohu

import requests
import bs4
import re
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'http://news.sohu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'id': 'block4'})  # block that holds the headline links
i = 0
href = []
for each in p1:
    href = each.select('a')  # <a> tags under the block
print(href)
href = str(href)
# The front page uses relative links, so extract the href values and
# prepend the site prefix to build absolute URLs.
pattern = re.compile(r'href="(.*?)" ')
l = pattern.findall(href)
prefix = 'http://news.sohu.com'
ls = [prefix + u for u in l]
print(ls)
title = [[] for _ in range(50)]   # preallocate slots for up to 50 articles
data = [[] for _ in range(50)]    # publish times
source = [[] for _ in range(50)]
while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    title[i] = soup.find('h1').text
    # Keep only CJK characters so the title is safe to use as a file name.
    title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
    print(title[i])
    data[i] = soup.find('span', class_='time').text
    source[i] = soup.find('span', {'data-role': 'original-link'}).text.strip()
    s1 = soup.findAll('article', {'class': 'article'})  # article body container
    hr = []
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')  # paragraph contents
    js = findjs.findall(hr)
    print(js)
    file3 = open(r'搜狐/%s.txt' % title[i], 'w', encoding='UTF-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("爬取时间:" + time_str + "\n")
    file3.write("发布时间:     " + data[i] + "     来源:    " + source[i] + "      标题:   " + title[i] + "     网址:   " + ls[i] + "\n")
    file3.close()
    i = i + 1

3. Phoenix (ifeng)

import requests
import bs4
import re
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.ifeng.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
# Container that holds the headline links. ifeng's class names appear to carry
# build hashes (e.g. index_content_RQk8t), so these selectors may need updating
# when the site is redeployed.
p1 = soup.findAll('div', {'class': 'index_content_RQk8t'})
i = 0
print(p1)
href = []
for each in p1:
    href = each.select('a')  # <a> tags under the container
print(href)
href = str(href)
# Links are protocol-relative ("//..."), so capture them and prepend http://.
pattern = re.compile(r'href="//(.*?)" ')
l = pattern.findall(href)
prefix = 'http://'
ls = [prefix + u for u in l]
print(ls)
title = [[] for _ in range(100)]   # preallocate slots for up to 100 articles
data = [[] for _ in range(100)]    # publish times
source = [[] for _ in range(100)]
while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    title[i] = soup.find('h1', class_='index_topic_5hyUE').get_text()
    print(title[i])
    data[i] = soup.find('div', class_='index_timeBref_20hzr').text
    print(data[i])
    source[i] = soup.find('div', class_='index_sourceTitleText_wlTy-').text
    print(source[i])
    s1 = soup.findAll('div', {'class': 'index_main_content_j-HoG'})  # article body container
    hr = []
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')  # paragraph contents
    js = findjs.findall(hr)
    print(js)
    file3 = open(r'凤凰/%s.txt' % title[i], 'w', encoding='UTF-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("爬取时间:" + time_str + "\n")
    file3.write("发布时间:     " + data[i] + "     来源:    " + source[i] + "      标题:   " + title[i] + "     网址:   " + ls[i] + "\n")
    file3.close()
    i = i + 1

4. NetEase

import requests
import bs4
import re
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.163.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'news_default_news'})  # container that holds the headline links
i = 0
href = []
for each in p1:
    href = each.select('a')  # <a> tags under the container
print(href)
href = str(href)
# Match absolute URLs ending in "ml" (.html/.shtml).
pattern = re.compile(r'[a-zA-Z]+://[^\s]*ml')
ls = pattern.findall(href)
title = [[] for _ in range(25)]   # preallocate slots for up to 25 articles
data = [[] for _ in range(25)]    # publish times
source = [[] for _ in range(25)]
while i < len(ls):
    try:
        print(ls[i])
        response = requests.get(ls[i], headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        title[i] = soup.find('h1', class_='post_title').get_text()
        # Keep only CJK characters so the title is safe to use as a file name.
        title[i] = ''.join(filter(lambda x: '\u4e00' <= x <= '\u9fa5', title[i].strip()))
        print(title[i])
        div_tag = soup.find('div', class_='post_info')
        data[i] = div_tag.get_text(strip=True).split(' ')[0]  # leading date/time portion
        print(data[i])
        source[i] = div_tag.find('a').text
        print(source[i])
        s1 = soup.findAll('div', {'class': 'post_body'})  # article body container
        hr = []
        for each in s1:
            hr = each.select('p')
        hr = str(hr)
        findjs = re.compile(r'<p.*?>(.*?)</.*?>')  # paragraph contents
        js = findjs.findall(hr)
        print(js)
        file3 = open(r'网易/%s.txt' % title[i], 'w', encoding='UTF-8')
        for j in range(len(js)):
            s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
            file3.write(s + '\n')
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
        file3.write("爬取时间:" + time_str + "\n")
        file3.write("发布时间:     " + data[i] + "     来源:    " + source[i] + "      标题:   " + title[i] + "     网址:   " + ls[i] + "\n")
        file3.close()
    except Exception:
        # Skip pages whose layout does not match (e.g. video or photo pages).
        i = i + 1
        continue
    i = i + 1

5. Baidu

import requests
import bs4
import re
import datetime

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}

url = 'https://news.baidu.com/'
response = requests.get(url, headers=headers)
print(response.status_code)
response.encoding = 'utf-8'
text = response.text
soup = bs4.BeautifulSoup(text, 'html.parser')
p1 = soup.findAll('div', {'class': 'mod-tab-pane active'})  # active tab with the hot-news links
i = 3  # start from the fourth link; the first three are presumably not article pages
href = []
for each in p1:
    href = each.select('a')  # <a> tags under the pane
print(href)
href = str(href)
pattern = re.compile(r'href="(.*?)" ')
ls = pattern.findall(href)
print(ls)
title = [[] for _ in range(100)]   # preallocate slots for up to 100 articles
data = [[] for _ in range(100)]    # publish times
source = [[] for _ in range(100)]
while i < len(ls):
    print(ls[i])
    response = requests.get(ls[i], headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    title[i] = soup.find('h1', class_='title').get_text()
    # Keep only the Chinese characters so the title is safe to use as a file name.
    zh = re.compile(r'[\u4e00-\u9fa5]+')
    title[i] = ''.join(zh.findall(title[i]))
    print(title[i])
    data[i] = soup.find('div', class_='media-publish-time').get_text()
    # (Alternatively, the date could be assembled from the year/day/time spans.)
    print(data[i])
    source[i] = soup.find('div', class_='source').text
    print(source[i])
    s1 = soup.findAll('div', {'id': 'detail'})  # article body container
    hr = []
    for each in s1:
        hr = each.select('p')
    hr = str(hr)
    findjs = re.compile(r'<p.*?>(.*?)</.*?>')  # paragraph contents
    js = findjs.findall(hr)
    print(js)
    file3 = open(r'百度/%s.txt' % title[i], 'w', encoding='UTF-8')
    for j in range(len(js)):
        s = re.sub(r'<.*?>', '', str(js[j]))  # strip any remaining inline tags
        file3.write(s + '\n')
    curr_time = datetime.datetime.now()
    time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d %H:%M:%S')
    file3.write("爬取时间:" + time_str + "\n")
    file3.write("发布时间:     " + data[i] + "     来源:    " + source[i] + "      标题:   " + title[i] + "     网址:   " + ls[i] + "\n")
    file3.close()
    i = i + 1

5. Reflections

Briefly describe: the main functions of each crawler module used in the experiment; the main steps of downloading pages with a crawler; and the common anti-crawler mechanisms you encountered.
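
As one concrete illustration for the last point (a minimal sketch of my own, not part of the original write-up): the two mechanisms these crawlers most directly run into are User-Agent filtering, countered by sending a browser-like header as all five scripts above do, and rate limiting, countered by pacing and retrying requests. The helper name polite_get below is illustrative.

import time
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # a browser-like UA passes basic checks

def polite_get(url, retries=3, delay=1.0):
    """GET a page with a browser-like User-Agent, retrying with a growing
    pause so the crawler does not trip simple rate limiters."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # treat 4xx/5xx responses as failures
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise                          # give up after the last attempt
            time.sleep(delay * (attempt + 1))  # back off before retrying

Removing the custom User-Agent from the scripts above is an easy way to observe the simplest of these mechanisms: some of the seed sites may then return error pages or empty content.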


Reprinted from blog.csdn.net/qq_63042830/article/details/135117644