I doubt I'll have much use for tools like 火车头 (LocoySpider) anymore; writing the scraper yourself in Python is fast and transparent.
The rough steps of data scraping:
1. Scrape the links from the list page
2. Follow each link, scrape the content, and save it to the database
The example below scrapes cnblogs news, using SQLite as the database (it ships with Python, so it is the natural first choice).
1. getnewsList.py
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
# SQLite driver (bundled with Python):
import sqlite3

def saveNews(newsLink, newsTitle=None):
    if newsTitle is None:
        newsTitle = ""
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    # Create the news table if it does not exist yet:
    cursor.execute('create table IF NOT EXISTS news (id INTEGER PRIMARY KEY, title varchar(100), link varchar(100), content text, status Integer)')
    # Parameterized query, so quotes in the link cannot break the SQL
    cursor.execute('select * from news where link=?', (newsLink,))
    values = cursor.fetchall()
    if len(values) > 0:  # the link was saved before
        print('Link already exists: ' + newsLink)
    else:
        cursor.execute('insert into news (title, link, status) values (?, ?, 0)', (newsTitle, newsLink))
        print("save success." + newsTitle + ":" + newsLink)
    # Close the cursor:
    cursor.close()
    # Commit the transaction:
    conn.commit()
    # Close the connection:
    conn.close()

def readNews():
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    cursor.execute('select * from news')
    values = cursor.fetchall()
    #print(values)
    cursor.close()
    conn.close()

# 1. Fetch the page HTML
with request.urlopen('http://news.cnblogs.com/') as f:
    html_doc = f.read()

# 2. Parse the page and pull out each title and link; the markup looks like:
# <h2 class="news_entry">
#   <a href="/n/535728/" target="_blank">传Windows 10 Mobile Build 11088下月初发布</a>
# </h2>
soup = BeautifulSoup(html_doc, "html.parser")
news_array = soup.find_all('h2', {'class': 'news_entry'})
for news in news_array:
    #print(news.a.get("href"))  # the link
    #print(news.a.string)       # the title
    saveNews("http://news.cnblogs.com" + news.a.get("href"), news.a.string)
#readNews()
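getnewsList.py only walks the front page. If you want older items too, looping over the paging URLs works the same way. Below is a minimal sketch reusing saveNews from above; it assumes cnblogs paginates its news at /n/page/N/, which you should verify against the live site before relying on it.

# Minimal pagination sketch (assumption: cnblogs news pages live at
# http://news.cnblogs.com/n/page/N/ -- verify against the real site).
from bs4 import BeautifulSoup
from urllib import request
from getnewsList import saveNews

for page in range(1, 4):  # first 3 pages, just as an example
    url = 'http://news.cnblogs.com/n/page/%d/' % page
    with request.urlopen(url) as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc, "html.parser")
    for news in soup.find_all('h2', {'class': 'news_entry'}):
        saveNews("http://news.cnblogs.com" + news.a.get("href"), news.a.string)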
2. getnewsContent.py
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
# SQLite driver (bundled with Python):
import sqlite3

def updateNewsContent():
    conn = sqlite3.connect('news.db')
    cursor = conn.cursor()
    # status=0 marks items whose content has not been fetched yet
    cursor.execute('select * from news where status=0')
    values = cursor.fetchall()
    for line in values:
        news_id = line[0]
        link = line[2]
        content = getNewsContent(link)
        cursor.execute('update news set content=?,status=1 where id=?', (content, news_id))
    cursor.close()
    conn.commit()
    conn.close()

# Fetch the article body for a given link
def getNewsContent(newsLink):
    # 1. Fetch the page HTML
    with request.urlopen(newsLink) as f:
        html_doc = f.read()
    # 2. Parse the page and extract the body
    soup = BeautifulSoup(html_doc, "html.parser")
    news_content = soup.find('div', {'id': 'news_body'})
    s = news_content.contents
    text = ''
    for x in s:
        text = text + str(x)
    return text

# Fetch and fill in the content for every news item that has none yet
updateNewsContent()
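Once both scripts have run, everything sits in news.db and you can consume it however you like. A minimal sketch of reading it back with plain sqlite3 (column order follows the create table statement above):

# Minimal sketch: read the scraped news back out of news.db
import sqlite3

conn = sqlite3.connect('news.db')
cursor = conn.cursor()
# status=1 marks items whose content has already been fetched
cursor.execute('select id, title, link from news where status=1')
for news_id, title, link in cursor.fetchall():
    print(news_id, title, link)
cursor.close()
conn.close()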