方法一：用 Python（urllib2 + lxml）简单爬取一个博客的文章内容

# -*- coding: utf-8 -*-

from urllib2 import urlopen,Request

import urllib

from lxml import *

import lxml.html as HTML

import time

def error(txt):
    """Append *txt* as a single line to the shared error log (../it/error.txt)."""
    with open("../it/error.txt", "a") as log:
        log.write('%s\n' % txt)

def con(url,count=4):

    try:

        req = Request(url)

        req.add_header('Referer','http://www.baidu.com')

        req.add_header('User-Agent','Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')

        res = urlopen(req,timeout = 20)

        page = res.read()

        res.close()

        #dom = HTML.document_fromstring(page)

        return page

    except Exception,e:

        if count >= 10:

            print e

            error(url)

        else:

            count += 1

            time.sleep(1)

            return con(url,count)

def menu(url):
    """Yield {'title': ..., 'url': ...} for each article link on the index page.

    Links are taken from //h5/a nodes of the page at *url*; the href is
    protocol-relative on the site, so "http:" is prefixed.  Entries with
    an empty title or missing href are skipped.  Yields nothing when the
    page could not be fetched (con() returned None) — the original
    crashed with TypeError in both of those cases.
    """
    page = con(url)
    if not page:
        # con() exhausted its retries and already logged the url.
        return
    dom = HTML.document_fromstring(page)
    for node in dom.xpath("//h5/a"):
        title = node.text_content()
        href = node.get("href")  # may be None for anchor-less <a> tags
        if title and href:
            yield {'title': title, 'url': "http:" + href}

def save(title, content):
    """Write *content* to ../it/<title>.html.

    Path separators in *title* are replaced with '_' so a title such as
    "a/b" cannot escape the target directory or raise IOError — the
    original used the title verbatim in the file path.
    """
    safe = unicode(title).replace('/', '_')
    with open('../it/' + safe + '.html', 'w') as f:
        f.write(content)

def blog():

    prev = menu("http://www.schooltop.net")

    for dic in prev:

        title = dic.get("title",'')

        url = dic.get("url",'')

        page = con(url)

        save(title,page)

        print "saved      ",unicode(title)

 

if __name__ == "__main__":
    # Run the crawler when executed as a script.
    blog()



方法二：使用 urllib2 + 正则表达式直接提取文章正文：
import urllib2
import re  
arr = ['289','300']
for i in arr:
  content = urllib2.urlopen('http://www.schooltop.net/blogs/'+i).read()
  pattern = re.compile('<div class="article">(.*?)<div class="row t_margin_20">', re.S)
  match = re.search(pattern, content)
  if match:
    print match.group(1)
  else: 
    print 111

猜你喜欢

转载自schooltop.iteye.com/blog/2399769