Python爬虫学习---正则抓取内涵段子

#coding: utf-8
'''
Python2.7
bo 2018-06-02 22:28:00
'''
import urllib2
import re
from multiprocessing.pool import Pool

class Spider:
    """Crawl joke listing pages from neihan8.com into MyStory.txt.

    Pages are fetched one at a time, starting at page 1 and stopping once
    page ``max`` has been written.
    """

    # Consecutive failed fetches of one page tolerated before giving up.
    MAX_RETRIES = 3

    # Captures (title HTML, body HTML) for each entry on a listing page.
    _ITEM_PATTERN = re.compile('<h4>.*?html">(.*?)</a>.*?</h4>'
                               '.*?<div class="f18 mb20">(.*?)</div>', re.S)

    # Tags, HTML entities and whitespace removed before text is written out.
    _NOISE_PATTERN = re.compile(r'<.*?>|&.*?;|\s+| ')

    # Line written in front of every entry.
    _SEPARATOR = "---------------------------------------------------------------"

    def __init__(self, max):
        """Create a spider that crawls pages 1 through ``max`` inclusive."""
        self.enable = True   # cleared by doWork() once crawling has stopped
        self.page = 1        # number of the next page to fetch
        self.max = max       # last page to fetch

    def loadPage(self, page):
        """Download listing page ``page`` and return its entries.

        Returns a list of ``(title_html, body_html)`` tuples.
        Raises ``urllib2.URLError`` when the request fails.
        """
        url = 'http://www.neihan8.com/article/list_5_' + str(page) + '.html'
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400'
        req = urllib2.Request(url, headers={'User-Agent': user_agent})
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()

        # The page is decoded as GBK and re-encoded as UTF-8. Undecodable
        # bytes are dropped so one bad byte does not abort the whole crawl.
        gbk_html = html.decode('gbk', 'ignore').encode('utf-8')
        return self._parseItems(gbk_html)

    def _parseItems(self, html):
        """Return the ``(title_html, body_html)`` tuples found in ``html``."""
        return self._ITEM_PATTERN.findall(html)

    @staticmethod
    def _cleanText(text):
        """Strip tags, HTML entities and whitespace from ``text``."""
        return Spider._NOISE_PATTERN.sub('', text)

    def printOnePage(self, item_list):
        """Append every entry of ``item_list`` to the output file.

        Each entry is written as a separator line, a ``T:`` title line
        followed by a blank line, and a ``C:`` content line.
        """
        for item in item_list:
            self.writeToFile(self._SEPARATOR)
            self.writeToFile('T:' + self._cleanText(item[0]) + "\n")
            self.writeToFile('C:' + self._cleanText(item[1]))

    def writeToFile(self, content):
        """Append ``content`` plus a trailing newline to MyStory.txt."""
        # The with-statement closes the file; no explicit close() is needed.
        with open('MyStory.txt', 'a') as f:
            f.write(content + "\n")

    def doWork(self):
        """Fetch and store pages until page ``max`` is done.

        A page whose fetch raises ``urllib2.URLError`` is retried; after
        ``MAX_RETRIES`` consecutive failures the crawl stops instead of
        retrying forever. ``enable`` is False when this method returns.
        """
        failures = 0
        while self.enable:
            if self.page > self.max:
                # Nothing to fetch (e.g. max < 1); stop rather than run forever.
                self.enable = False
                break

            try:
                item_list = self.loadPage(self.page)
            except urllib2.URLError as e:
                print(getattr(e, 'reason', e))
                failures += 1
                if failures >= self.MAX_RETRIES:
                    # Persistent failure: stop instead of looping endlessly.
                    self.enable = False
                    break
                continue

            failures = 0
            self.printOnePage(item_list)
            print('抓取第%d页成功' % self.page)
            if self.page >= self.max:
                self.enable = False
                print('爬取完毕...')
                break

            self.page += 1

def main(count):
    """Crawl listing pages 1 through ``count`` with a fresh Spider."""
    Spider(count).doWork()

if __name__ == '__main__':
    # Run the crawl in a worker process; each list element is the page
    # count handed to one main() call.
    pool = Pool()
    try:
        pool.map(main, [1])
    finally:
        # Shut the worker processes down even when the crawl raises.
        pool.close()
        pool.join()

猜你喜欢

转载自blog.csdn.net/u010314160/article/details/80552178