# -*- coding: utf-8 -*- import urllib2 import re import cookielib import sys class CSDN: def __init__(self): self.url = 'http://blog.csdn.net/aricover/article/details/78684894/' self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' self.headers = { 'User-Agent' : self.user_agent } self.filename = 'csdncookie.txt' self.patterns = { '<p>':'', '</p>':'', '<pre name="code".*?>':'', '</pre>':'\n', '<br>':'', '</br>':'', '<div.*?>':'', '</div>':'', r'<':'<', r'>':'>', '+':'+', '"':'\"' } def getPage(self): try: req = urllib2.Request(self.url,headers = self.headers) cookie = cookielib.MozillaCookieJar(self.filename) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) response = opener.open(req) cookie.save(ignore_discard=True,ignore_expires=True) content = response.read() return content except urllib2.URLError,e: if hasattr(e,'reason'): print '错误发生的原因:%s' %(e.reason) def replaceSB(self,content): for k,v in self.patterns.items(): pattern = re.compile(k,re.S) content = re.sub(pattern,v,content) return content def write2File(self,content): file = open('blog.txt','w') file.write(content) file.close(); def getContent(self): content = self.getPage() pattern = re.compile(r'<div id="article_content".*?</div>',re.S) items = re.findall(pattern,content) for item in items: zz = self.replaceSB(item) self.write2File(zz) print zz csdn = CSDN() csdn.getContent()
练练手,熟悉下python和爬虫,这里以抓取自己的博文为例,只是爬取特定链接的文章,不是大范围爬取
注意:部分特殊字符(如 HTML 实体)被 CSDN 的编辑器替换了,以下载的源码为准,源码链接 http://download.csdn.net/download/aricover/10148984#