I doubt I'll have much use for tools like 火车头 (LocoySpider) anymore; writing the scraper yourself in Python is fast and transparent.
The rough steps of data scraping:
1. Scrape the links from the list page
2. Follow each link, scrape the content, and save it to the database
The example below scrapes cnblogs news, using SQLite as the database (it ships with Python, so it is the natural first choice).
1. getnewsList.py
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
# SQLite driver (bundled with Python):
import sqlite3

def saveNews(newsLink, newsTitle=None):
    if newsTitle is None:
        newsTitle = ""
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    # Create the news table if it does not exist yet:
    cursor.execute('create table IF NOT EXISTS news (id INTEGER PRIMARY KEY, title varchar(100), link varchar(100), content text, status Integer)')
    # Parameterized query, so quotes in the link cannot break the SQL
    cursor.execute('select * from news where link=?', (newsLink,))
    values = cursor.fetchall()
    if len(values) > 0:  # the link was saved before
        print('Link already exists: ' + newsLink)
    else:
        cursor.execute('insert into news (title, link, status) values (?, ?, 0)', (newsTitle, newsLink))
        print("save success." + newsTitle + ":" + newsLink)
    # Close the cursor:
    cursor.close()
    # Commit the transaction:
    conn.commit()
    # Close the connection:
    conn.close()

def readNews():
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    cursor.execute('select * from news')
    values = cursor.fetchall()
    #print(values)
    cursor.close()
    conn.close()

# 1. Fetch the page HTML
with request.urlopen('http://news.cnblogs.com/') as f:
    html_doc = f.read()

# 2. Parse the page and pull out each title and link; the markup looks like:
# <h2 class="news_entry">
#   <a href="/n/535728/" target="_blank">传Windows 10 Mobile Build 11088下月初发布</a>
# </h2>
soup = BeautifulSoup(html_doc, "html.parser")
news_array = soup.find_all('h2', {'class': 'news_entry'})
for news in news_array:
    #print(news.a.get("href"))  # the link
    #print(news.a.string)       # the title
    saveNews("http://news.cnblogs.com" + news.a.get("href"), news.a.string)
#readNews()
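getnewsList.py only walks the front page. If you want older items too, looping over the paging URLs works the same way. Below is a minimal sketch reusing saveNews from above; it assumes cnblogs paginates its news at /n/page/N/, which you should verify against the live site before relying on it.

# Minimal pagination sketch (assumption: cnblogs news pages live at
# http://news.cnblogs.com/n/page/N/ -- verify against the real site).
from bs4 import BeautifulSoup
from urllib import request
from getnewsList import saveNews

for page in range(1, 4):  # first 3 pages, just as an example
    url = 'http://news.cnblogs.com/n/page/%d/' % page
    with request.urlopen(url) as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc, "html.parser")
    for news in soup.find_all('h2', {'class': 'news_entry'}):
        saveNews("http://news.cnblogs.com" + news.a.get("href"), news.a.string)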
2. getnewsContent.py
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
# SQLite driver (bundled with Python):
import sqlite3

def updateNewsContent():
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    # status=0 marks items whose content has not been fetched yet
    cursor.execute('select * from news where status=0')
    values = cursor.fetchall()
    for line in values:
        news_id = line[0]
        link = line[2]
        content = getNewsContent(link)
        cursor.execute('update news set content=?,status=1 where id=?', (content, news_id))
    cursor.close()
    conn.commit()
    conn.close()

# Fetch the article body for a given link
def getNewsContent(newsLink):
    # 1. Fetch the page HTML
    with request.urlopen(newsLink) as f:
        html_doc = f.read()
    # 2. Parse the page and extract the body
    soup = BeautifulSoup(html_doc, "html.parser")
    news_content = soup.find('div', {'id': 'news_body'})
    s = news_content.contents
    text = ''
    for x in s:
        text = text + str(x)
    return text

# Fetch and fill in the content for every news item that has none yet
updateNewsContent()
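Once both scripts have run, everything sits in news.db and you can consume it however you like. A minimal sketch of reading it back with plain sqlite3 (column order follows the create table statement above):

# Minimal sketch: read the scraped news back out of news.db
import sqlite3

conn = sqlite3.connect('news.db')
cursor = conn.cursor()
# status=1 marks items whose content has already been fetched
cursor.execute('select id, title, link from news where status=1')
for news_id, title, link in cursor.fetchall():
    print(news_id, title, link)
cursor.close()
conn.close()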