使用python爬取马蜂窝游记

马蜂窝( http://www.mafengwo.cn/ )是著名的游记分享网站。本例从中爬取2018年“南京”地区游记。

观察游记列表，发现以时间顺序url以类似“http://www.mafengwo.cn/yj/10684/2-0-74.html"的格式排列。其中74代表第74页。而南京2018年游记分布于74至456页之间。
在这里插入图片描述
首先爬取列表中目标url信息。

# coding=utf-8
# author=zhangjingyuan
# python3

import urllib.request
import urllib.parse
import re
import time
import io
import gzip
import random

#页数范围
min = 74
max = 456

headerS={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,en-US;q=0.7,en;q=0.3',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie':'mfw_uuid=5c84cb93-758a-762e-0127-ea70f6cb3e64; _r=bing; _rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A12%3A%22cn.bing.com%2F%22%3Bs%3A1%3A%22t%22%3Bi%3A1552206739%3B%7D; oad_n=a%3A5%3A%7Bs%3A5%3A%22refer%22%3Bs%3A20%3A%22https%3A%2F%2Fcn.bing.com%2F%22%3Bs%3A2%3A%22hp%22%3Bs%3A11%3A%22cn.bing.com%22%3Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222019-03-10+16%3A32%3A19%22%3B%7D; __mfwlv=1552358747; __mfwvn=3; __mfwlt=1552363839; uva=s%3A144%3A%22a%3A4%3A%7Bs%3A13%3A%22host_pre_time%22%3Bs%3A10%3A%222019-03-10%22%3Bs%3A2%3A%22lt%22%3Bi%3A1552206739%3Bs%3A10%3A%22last_refer%22%3Bs%3A20%3A%22https%3A%2F%2Fcn.bing.com%2F%22%3Bs%3A5%3A%22rhost%22%3Bs%3A11%3A%22cn.bing.com%22%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1552206739%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A11%3A%22cn.bing.com%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5c84cb93-758a-762e-0127-ea70f6cb3e64; UM_distinctid=16966bb483d45-0f494e03ad41f4-4c312f7f-144000-16966bb483e2ab; CNZZDATA30065558=cnzz_eid%3D1199525799-1552206059-null%26ntime%3D1552353780; PHPSESSID=madinq41vdjrvfvsk97bhp7cf0; all_ad=1',
'Host':'www.mafengwo.cn',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
}
file = open("E://pages.txt", 'a')
for i in range(min,max):
    try:
        url="http://www.mafengwo.cn/yj/10684/2-0-"+str(i)+".html"
        request = urllib.request.Request(url,data=None,headers=headerS)
        response = urllib.request.urlopen(request)
        page = response.read()
        iopage=io.BytesIO(page)
        depage = gzip.GzipFile(fileobj=iopage, mode="rb")
        #gzip解压缩
        html=depage.read().decode('utf-8')
        print(i)
        pattern = re.compile('/i/.*?.html', re.S)
        #查找其中形如/i/…….html的链接
        result = re.findall(pattern, html)
        print(result)
        for item in result:
            file.write(item+'\n')
        time.sleep(random.random())
    except urllib.request.URLError as e:
        if hasattr(e, 'reason'):
            print('出错：' + str(e.reason))
        print('pass')
file.close()

第二步针对保存的url逐个下载内容

# coding=utf-8
# author=zhangjingyuan
# python3
from html.parser import HTMLParser
import lxml
from lxml import etree
import urllib.request
import urllib.parse
import re
import time
import io
import gzip
import random
import codecs


min = 50
max = 500

headerS={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,en-US;q=0.7,en;q=0.3',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie':'mfw_uuid=5c84cb93-758a-762e-0127-ea70f6cb3e64; _r=bing; _rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A12%3A%22cn.bing.com%2F%22%3Bs%3A1%3A%22t%22%3Bi%3A1552206739%3B%7D; oad_n=a%3A5%3A%7Bs%3A5%3A%22refer%22%3Bs%3A20%3A%22https%3A%2F%2Fcn.bing.com%2F%22%3Bs%3A2%3A%22hp%22%3Bs%3A11%3A%22cn.bing.com%22%3Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222019-03-10+16%3A32%3A19%22%3B%7D; __mfwlv=1552358747; __mfwvn=3; __mfwlt=1552363839; uva=s%3A144%3A%22a%3A4%3A%7Bs%3A13%3A%22host_pre_time%22%3Bs%3A10%3A%222019-03-10%22%3Bs%3A2%3A%22lt%22%3Bi%3A1552206739%3Bs%3A10%3A%22last_refer%22%3Bs%3A20%3A%22https%3A%2F%2Fcn.bing.com%2F%22%3Bs%3A5%3A%22rhost%22%3Bs%3A11%3A%22cn.bing.com%22%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1552206739%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A11%3A%22cn.bing.com%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5c84cb93-758a-762e-0127-ea70f6cb3e64; UM_distinctid=16966bb483d45-0f494e03ad41f4-4c312f7f-144000-16966bb483e2ab; CNZZDATA30065558=cnzz_eid%3D1199525799-1552206059-null%26ntime%3D1552353780; PHPSESSID=madinq41vdjrvfvsk97bhp7cf0; all_ad=1',
'Host':'www.mafengwo.cn',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
}
list=open("E://pages.txt",'r')
file=codecs.open("E://rsult.csv",'a','utf-8')
i=0
for line in list:
    i = i + 1
    print(i)
    if(i%2==0):
        continue
    try:
        content=""
        print(line)
        url="http://www.mafengwo.cn"+line
        request = urllib.request.Request(url,data=None,headers=headerS)
        response = urllib.request.urlopen(request)
        page = response.read()
        iopage=io.BytesIO(page)
        depage = gzip.GzipFile(fileobj=iopage, mode="rb")
        html=depage.read().decode('utf-8')
        Htree = etree.HTML(html)
        #print(etree.tostring(Htree))
        body=Htree[1]
        #print(etree.tostring(body))
        main=body[1]
        #print(etree.tostring(main))
        view=main[3]
        #此处使用etree以获取游记正文部分
        content = content + etree.tostring(view).decode('utf-8')
        content=HTMLParser().unescape(content)
        dr = re.compile(r'<[^>]+>', re.S)
        dd = dr.sub('', content)
        dr = re.compile('\n', re.S)
        dd = dr.sub('', dd)
        dr = re.compile(' ', re.S)
        res = dr.sub('', dd)
        #去除富文本标签、换行符、空格等
        print(res)
        file.write(str(res)+'\n')
    except:
        pass
file.close()

注意，连续爬取约40篇游记时会被IP封禁数小时，因采取适当的策略分时爬取或采用多IP代理的方法进行处理。
结果如下：

在这里插入图片描述

使用python爬取马蜂窝游记

猜你喜欢