#coding: utf-8
'''
Python2.7
bo 2018-06-02 22:28:00
'''
import urllib2
import re
from multiprocessing.pool import Pool
class Spider:
def __init__(self,max):
self.enable = True
self.page = 1
self.max = max
def loadPage(self,page):
url = 'http://www.neihan8.com/article/list_5_'+ str(page) + '.html'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5221.400 QQBrowser/10.0.1125.400'
headers = {'User-Agent':user_agent}
req = urllib2.Request(url,headers=headers)
response = urllib2.urlopen(req)
html = response.read()
gbk_html = html.decode('gbk').encode('utf-8')
pattern = re.compile('<h4>.*?html">(.*?)</a>.*?</h4>'+
'.*?<div class="f18 mb20">(.*?)</div>',re.S)
item_list = pattern.findall(gbk_html)
return item_list
def printOnePage(self,item_list):
for item in item_list:
self.writeToFile("---------------------------------------------------------------")
self.writeToFile('T:' + re.sub(r'<.*?>|&.*?;|\s+| ','',item[0]) + "\n")
self.writeToFile('C:' + re.sub(r'<.*?>|&.*?;|\s+| ','',item[1]))
def writeToFile(self,content):
with open('MyStory.txt', 'a') as f:
f.write(content + "\n")
f.close()
def doWork(self):
while self.enable:
try:
item_list = self.loadPage(self.page)
except urllib2.URLError,e:
print e.reason
continue
self.printOnePage(item_list)
print('抓取第%d页成功'%self.page)
if self.page == self.max:
self.enable = False
print('爬取完毕...')
break
self.page += 1
def main(count):
    """Build a Spider limited to `count` pages and run the crawl."""
    Spider(count).doWork()
if __name__ == '__main__':
    # Dispatch the crawl through a process pool (one task: crawl 1 page).
    pool = Pool()
    try:
        pool.map(main, [1])
    finally:
        # fix: the pool was previously never closed/joined, leaking the
        # worker processes.
        pool.close()
        pool.join()
# Python爬虫学习---正则抓取内涵段子
# 转载自 blog.csdn.net/u010314160/article/details/80552178