Webdriver 爬取新浪滚动新闻
初始想法
本人现在是国际关系学院2016级的本科生,学的是信息管理与信息系统。讲道理不知道这个专业到底是干啥的,现在选择的后续方向是数据科学与工程,并且在老师的自然语言处理小组。爬虫是做自然语言处理的基础嘛,学习机器学习之前先学学怎么爬取内容还是挺有意义的。本来开始想着爬一下新浪微博的内容,但是又涉及到滚动爬取,账号登陆之类的繁琐问题,还是先玩玩滚动新闻吧。其实讲道理中国新闻网的滚动新闻做的比新浪的好多了,界面也好看,不过这都是爬完之后才发现的哈哈哈哈哈哈哈
背景介绍
本项目为基于新浪滚动新闻(https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1)进行网页爬取,以新闻标题为文件名称,以新闻内容为文件正文存储。
内容介绍
环境要求
- 环境要求:python3.7+
- 安装包要求:requests、BeautifulSoup(bs4)、selenium 等(time、random 为标准库,无需安装)
文件介绍
- main:调取其他所有文件相关函数,输入初始url,并计算爬取全部网页耗时
- date_helper:对网页日期进行调整实现自动翻页
- data_helper:对数据的所有路径进行调整
- spider:爬取网页的主文件,调用Webdriver获取主索引页的子页面并获取网页内容
- article_spider:爬取新闻正文内容
代码
main
from date_helper import date_processing
from data_helper import pickle_writer
from spider import *
import time
# time.clock() was deprecated since 3.3 and removed in Python 3.8;
# perf_counter() is the documented replacement for elapsed-time measurement.
start = time.perf_counter()

if __name__ == '__main__':
    # Listing URL template: {} is replaced with a YYYY-MM-DD date; the page
    # number is appended later by the spider.
    url_org = 'http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&date={}&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page='
    while True:
        date = date_processing()    # step the stored date back one day
        output_list = []            # records collected for this day
        url = url_org.format(date)  # listing URL for this day
        sina(url, output_list, date)  # crawl every page of that day
        print(output_list)
        print(len(output_list))
        # Fix: the original template was '' (no placeholder), so every
        # iteration produced an empty file name and the dump failed.
        # Use the date as the dump file name; prepend a directory if needed.
        file_name = '{}'.format(date)
        pickle_writer(output_list, file_name)  # persist this day's records

# NOTE(review): unreachable while the loop above runs forever; kept for
# parity with the original script.
end = time.perf_counter()
print('Running:%s seconds.' % (end - start))
date_helper
from selenium import webdriver
import re
import time
import calendar
import re
import codecs
from data_helper import *
def count_days(year, month):
    '''
    Return the number of days in the given month, as a string.

    :param year: int year
    :param month: int month (1-12)
    :return: day count as a string, e.g. '29' for 2020-02

    Fix: the original ran a regex over str(calendar.monthrange(...)) to pull
    out the second tuple element; index the tuple directly instead. The str
    return type is preserved for callers (date_sub concatenates it).
    '''
    # monthrange returns (weekday_of_first_day, number_of_days)
    return str(calendar.monthrange(year, month)[1])
def month_sub(year, month):
    '''
    Step back one month, wrapping January to December of the previous year.

    :param year: int year
    :param month: int month (1-12)
    :return: (year, month) with month as a zero-padded 2-char string

    Fix: the original returned month as the int 12 in the January branch but
    as a string in every other branch; callers str() the result, so making
    the type consistently str is backward-compatible.
    '''
    if month > 1:
        month -= 1
        # months 10-11 are already two digits; 1-9 need a leading zero
        month = str(month) if month >= 10 else '0' + str(month)
    else:
        year -= 1
        month = '12'
    return year, month
def date_sub(year, month, day):
    '''
    Return the date one day before (year, month, day), as 'YYYY-MM-DD'.

    :param year: int year
    :param month: month (str or int); passed through into the result
    :param day: int day of month
    :return: previous day's date string, day zero-padded below 10
    '''
    if day > 1:
        prev = day - 1
        # two-digit days need no padding; single digits get a leading zero
        day = str(prev) if prev >= 10 else '0' + str(prev)
    else:
        # first of the month: roll back to the last day of the prior month
        year, month = month_sub(int(year), int(month))
        day = count_days(year, int(month))
    return str(year) + '-' + str(month) + '-' + str(day)  # 新浪滚动新闻 date format
def date_processing():
    '''
    Read the last crawled date from the tracking file, step it back one day,
    persist the new date, and return it.

    :return: new date string 'YYYY-MM-DD'

    Fix: the original opened the tracking file for writing and never closed
    it; use a context manager so the handle is released even on error.
    '''
    date_txt = ""  # NOTE(review): path of the date-tracking file — fill in before use
    last_date = txt_load(date_txt)
    date = str(last_date[0])  # first line holds the last crawled date
    year = int(date.split("-")[0])
    month = date.split("-")[1]  # kept as string to preserve zero padding
    day = int(date.split("-")[2])
    date = date_sub(year, month, day)
    with codecs.open(date_txt, 'w', 'UTF-8') as writer:
        writer.write(date)
    return date
data_helper
import re
import pickle
import codecs
import jieba
'''
读取原始数据
'''
def txt_load(path):
    '''
    Read a UTF-8 text file and return its lines (trailing newlines kept).

    :param path: file path
    :return: list of lines as returned by readlines()

    Fix: the original never closed the reader; use a context manager.
    '''
    with codecs.open(path, 'r', 'UTF-8') as reader:
        return reader.readlines()
def join_list(ss):
    '''
    Concatenate an iterable of strings into one string.

    :param ss: iterable of strings
    :return: concatenated string

    Fix: str.join is a single O(n) pass, replacing the original quadratic
    += accumulation loop.
    '''
    return "".join(ss)
def pickle_writer(input_, name):
    '''
    Pickle an object to a file.

    :param input_: 待保存的数据 (object to serialize)
    :param name: 存放路径 (destination path)

    Fix: the original closed the file explicitly, leaking the handle if
    pickle.dump raised; a with-block guarantees closure.
    '''
    with open(name, "wb") as writer:
        pickle.dump(input_, writer)
    print("finish to write data")
# Read back a pickle file written by pickle_writer.
def pickle_load(input_):
    '''
    Unpickle an object from a file.

    :param input_: 路径 (source path)
    :return: 原始数据 (the deserialized object)

    Fix: renamed the misspelled 'raeder' variable and switched to a
    with-block so the handle is closed even if pickle.load raises.
    '''
    with open(input_, "rb") as reader:
        content = pickle.load(reader)
    print("finish to read data")
    return content
def jieba_cut(content):
    '''
    :param content: str 句子 待分词 (sentence to segment)
    :return: list of tokens with single-space tokens dropped
    '''
    # jieba.cut yields tokens lazily; filter out bare spaces while collecting
    return [token for token in jieba.cut(content) if token != " "]
def is_chinese(uchar):
    """判断一个unicode是否是汉字 — classify a single character.

    Returns the character unchanged if it is a CJK ideograph, its lowercase
    form if it is an ASCII letter, and '' otherwise.
    """
    # CJK Unified Ideographs range
    if u'\u4e00' <= uchar <= u'\u9fa5':
        return uchar
    # a pure-letter character survives the strip unchanged
    if re.sub('[^a-zA-Z]', '', uchar) == uchar:
        return str(uchar).lower()
    return ''
spider
# -*- coding: utf-8 -*-
from selenium import webdriver
from article_spider import *
import re
def get_pages(driver, url):
    '''
    Discover how many listing pages exist for one day.

    :param driver: Webdriver instance
    :param url: listing URL for a specific date (page number appended)
    :return: the page count, as a string extracted from the pager widget
    '''
    # load page 1 and refresh so the JS-rendered pager is present
    driver.get(url + '1')
    time.sleep(2)
    driver.refresh()
    time.sleep(2)
    html = driver.page_source
    # each pager button carries an onclick="newsList.page.goTo(N);return false"
    buttons = re.findall('onclick="newsList.page.goTo(.*?);return false', html, re.S)
    digits = re.compile('\d+')
    # the last button holds the highest page number
    return digits.findall(buttons[-1])[0]
def Get_content(driver, page_num, url, output_list, date):
    '''
    Walk every listing page of one day, scrape each headline's metadata and
    article body, dump each article to a .txt file, and append the record to
    output_list.

    :param driver: Webdriver instance
    :param page_num: number of listing pages for this date (str or int)
    :param url: listing URL for this date (page number appended)
    :param output_list: output accumulator; one list per article
    :param date: the date being crawled ('YYYY-MM-DD')

    Fixes: titles containing '/' (or other characters illegal in file names)
    crashed the open() call — they are now replaced with '_'; the output file
    is opened with an explicit UTF-8 encoding; files are opened via a
    with-block so handles are always closed.
    '''
    for k in range(1, int(page_num) + 1):
        driver.get(url + str(k))
        time.sleep(2.5)
        driver.refresh()
        # the listing is a 10x5 grid of <ul>/<li> entries
        for i in range(1, 11):
            for j in range(1, 6):
                base = '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']'
                classfy_cn = driver.find_element_by_xpath(base + '/span[1]').text
                title = driver.find_element_by_xpath(base + '/span[2]/a').text
                href = driver.find_element_by_xpath(base + '/span[2]/a').get_attribute('href')
                times = driver.find_element_by_xpath(base + '/span[3]').text
                pubtime = times.split(" ")[1]  # 'date time' -> keep the time part
                # NOTE(review): get_article returns None on failure in the
                # original article_spider; this unpack assumes a 2-tuple.
                content, classfy_en = get_article(href)
                content_list = [classfy_cn, classfy_en, date, pubtime, title, href, content]
                # strip characters that are illegal in file names (e.g. '/')
                safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
                file_name = '' + safe_title + '.txt'
                with open(file_name, 'w', encoding='utf-8') as f:
                    for element in content_list:
                        f.write(element)
                output_list.append(content_list)
                print(len(output_list))
def sina(url, output_list, date):
    '''
    Crawl one day of Sina rolling news: open a browser, count the listing
    pages, scrape them all, then close the browser.

    :param url: 待爬取的url (listing URL for the date)
    :param output_list: 输出list (accumulator filled by Get_content)
    :param date: 日期 (date being crawled)
    :return: None

    Fix: the original leaked the Chrome window if get_pages/Get_content
    raised; try/finally guarantees driver.close() runs.
    '''
    driver = webdriver.Chrome()
    try:
        page_num = get_pages(driver, url)
        Get_content(driver, page_num, url, output_list, date)
    finally:
        driver.close()
article_spider
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from user_agents import agents
import requests
import time
import random
def get_article(url):
    '''
    Download one article page and extract its body text and channel tag.

    :param url: article URL
    :return content: concatenated body paragraphs ('' on failure)
    :return classfy: channel tag taken from the URL's subdomain ('' on failure)

    Fix: the original's except branch returned None, which crashed the
    caller's `content, classfy_en = get_article(href)` unpack; it now
    returns ('', '') so one bad article no longer aborts the crawl.
    '''
    try:
        # e.g. 'https://finance.sina.com.cn/...' -> 'finance'
        classfy = url.split('.')[0].split('//')[1]
        header = {'User-Agent': random.choice(agents)}  # rotate UA per request
        res = requests.get(url.rsplit('\r\n')[0], headers=header)
        time.sleep(1)  # throttle to be polite to the server
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        paragraphs = getnewsArticle(soup.select('.art_p'))
        content = ''.join(paragraphs)
        return content, classfy
    except Exception as e:
        print(e)
        return '', ''
def getnewsArticle(news):
    '''
    :param news: 新闻主题内容链接 (parsed body-paragraph nodes)
    :return: 新闻主题内容 (each node's text, whitespace-stripped)
    '''
    return [node.text.strip() for node in news]
后记
爬取下来的内容还算ok,虽然这样看起来又繁琐又蛋疼,而且好像如果新闻标题中含有" / "这个字符的话,就会报错,显示没有这个文件夹,或许第二版代码我再考虑怎么加一下中文匹配(?)。代码里面的agent就随便网上找点就行,不用太在意。
第一次写,想来也有很多很多毛病,如果有人看到,还请指出,感恩的心,感谢有你。