# -*- coding: utf-8 -*- from urllib2 import urlopen,Request import urllib from lxml import * import lxml.html as HTML import time def error(txt): with open("../it/error.txt","a") as f: f.write(txt + '\n') def con(url,count=4): try: req = Request(url) req.add_header('Referer','http://www.baidu.com') req.add_header('User-Agent','Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)') res = urlopen(req,timeout = 20) page = res.read() res.close() #dom = HTML.document_fromstring(page) return page except Exception,e: if count >= 10: print e error(url) else: count += 1 time.sleep(1) return con(url,count) def menu(url): page = con(url) dom = HTML.document_fromstring(page) path = "//h5/a" node = dom.xpath(path) for n in node: dic = {} dic['title'] = n.text_content() dic['url'] = "http:" + n.get("href") if dic['title'] and dic['url']: yield dic def save(title,content): with open('../it/'+unicode(title)+'.html','w') as f: f.write(content) def blog(): prev = menu("http://www.schooltop.net") for dic in prev: title = dic.get("title",'') url = dic.get("url",'') page = con(url) save(title,page) print "saved ",unicode(title) if __name__ == "__main__": ## try: blog() ## except Exception,e: ## print e
# 方法二 (Method two):
import urllib2 import re arr = ['289','300'] for i in arr: content = urllib2.urlopen('http://www.schooltop.net/blogs/'+i).read() pattern = re.compile('<div class="article">(.*?)<div class="row t_margin_20">', re.S) match = re.search(pattern, content) if match: print match.group(1) else: print 111