动态爬取,酒店评论

使用Python爬取动态网页,获取评论

python2.7.15

酒店的评论并不在页面的HTML源码中,而是由js通过Ajax接口(返回JSON数据)从数据库动态加载的。找到这些接口的URL和它们的参数规律,爬取就成功了一半。

我获取了每条评论中的五项(昵称、评论内容、房型、日期、评分)。这五项中有的可能不存在,因为是在字典中取值,键不存在时会报错(KeyError),所以在取值和写入的时候要加一个try-except:如果有就获取并写入,如果没有就赋值为'Null'。

headers字典和data字典是必须的,可以从自己的浏览器里找:按F12或右键"审查元素",查看对应请求的请求头和表单数据。

代码如下

# coding=utf-8
import urllib2
import re
import MySQLdb
import json
import requests

conn=MySQLdb.connect(host="127.0.0.1",user="root",passwd="199855pz",db="pz",charset='utf8')
print '连接成功'
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS yilong")
sql = '''CREATE TABLE yilong(姓名 char(10) ,评价 char(100) ,商品 char(10) ,日期 char(10) ,评分 char(10))'''
cursor.execute(sql)

def hotelname(shoplist,n):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    nameurl = "http://hotel.elong.com/" + shoplist[n] + "/"
    request = urllib2.Request(nameurl, headers=header)
    response = urllib2.urlopen(request)
    cont = response.read()
    pattern = re.compile('<title>【(.*?)】地址:.*?艺龙旅行网</title>')
    name = re.findall(pattern, cont)
    name = name[0]
    print name
    return name


url = "http://hotel.elong.com/ajax/tmapilist/asyncsearch"
data = {'code':9559991,
'listRequest.areaID':'',
'listRequest.bookingChannel':'1',
'listRequest.cardNo':192928,
'listRequest.checkInDate':'2018-11-08 00:00:00',
'listRequest.checkOutDate':'2018-11-09 00:00:00',
'listRequest.cityName':'上海市',
'listRequest.customLevel':11,
'listRequest.distance':20,
'listRequest.orderFromID':50426}
headers = {'Accept':'application/json, text/javascript, */*; q=0.01',
           'Accept-Encoding':'gzip, deflate',
           'Accept-Language':'zh-CN,zh;q=0.9',
           'Connection':'keep-alive',
           'Content-Length':'1686',
           'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
           'Cookie':'CookieGuid=afc5a26f-f88c-4c8b-aee3-284aa693a358; _fid=afc5a26f-f88c-4c8b-aee3-284aa693a358; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%E5%B8%82%23beijing%23; s_eVar44=pz360sem; SHBrowseHotel=cn=92947173%2C%2C%2C%2C%2C%2C%3B91282394%2C%2C%2C%2C%2C%2C%3B92385687%2C%2C%2C%2C%2C%2C%3B&; SessionGuid=befebc42-ecba-4f5b-91c3-d48efe3f9e6a; Esid=5d309ea3-4aea-4f53-9fbe-1edd8113ce9e; semid=pz360sem; outerFrom=pz360sem; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=5&Isusefparam=0&Pkid=50426&Parentid=4300&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_visit=1; newjava2=5db7fb36946d2a8fdb8546870157311e; JSESSIONID=6287F52D048B99E57C52BE155FDD0435; anti_token=32F13A20-A664-4F0B-A84B-6B2FD7DCF052; ShHotel=CityID=0201&CityNameCN=%E4%B8%8A%E6%B5%B7%E5%B8%82&CityName=%E4%B8%8A%E6%B5%B7%E5%B8%82&OutDate=2018-11-09&CityNameEN=shanghai&InDate=2018-11-08; s_sq=%5B%5BB%5D%5D; __tctmc=0.244490012; __tctmd=0.52917361; __tccgd=0.1; __tctmb=0.3937967179605747.1541661372067.1541661372067.1',
           'Host':'hotel.elong.com',
           'Origin':'http://hotel.elong.com',
           'Referer':'http://hotel.elong.com/shanghai/',
           'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
           'X-Requested-With':'XMLHttpRequest'}
request = requests.post(url, headers=headers, data = data).content
shoplist = json.loads(request)
shoplist = shoplist['value']['hotelIds']
shoplist = shoplist.encode('utf-8')
shoplist = shoplist.split(",",19)
print shoplist

for n in range(21):
    hn = hotelname(shoplist, n)
    f = open(hn.decode('utf-8')+'.txt', 'a+')
    for p in range(1,26) :
        url = "http://hotel.elong.com/ajax/comment/getcommentbypage/?hotelId=" + shoplist[n] + "&recommendedType=0&pageIndex=" + str(p) + "&mainTagId=0&subTagId=0&rankType=0&eToken=afc5a26f-f88c-4c8b-aee3-284aa693a358&code=9342551&_=1541592274486"
        header = {'Accept':'application/json, text/javascript, */*; q=0.01',
               'Accept-Encoding':'gzip, deflate',
               'Accept-Language':'zh-CN,zh;q=0.9',
               'Connection':'keep-alive',
               'Host':'hotel.elong.com',
               'Referer':'http://hotel.elong.com/92947173/',
               'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
               'X-Requested-With':'XMLHttpRequest'}
        request = requests.get(url, headers=header).content
        pl = json.loads(request)
        for q in range(20):
            Nickname = pl['value']['Comments'][q]['CommentUser']['NickName']
            f.write(Nickname.encode('utf-8') + '\n')
            print Nickname

            pinglun = pl['value']['Comments'][q]['Content']
            f.write(pinglun.encode('utf-8') + '\n')
            print pinglun

            try :
                room = pl['value']['Comments'][q]['CommentExt']['Order']['RoomTypeName']
                f.write(room.encode('utf-8') + '\n')
                print room
            except :
                room = 'Null'
                print room

            time = pl['value']['Comments'][q]['CreateTime']
            f.write(time.encode('utf-8') + '\n')
            print time

            try :
                score = pl['value']['Comments'][q]['CommentScore']['Score']
                score = str(score)
                f.write(score.encode('utf-8') + '\n')
                print score
            except :
                score = 'Null'
                print score

            insert_yilong = ("INSERT INTO yilong(姓名 , 评价 , 商品 , 日期 , 评分)" "VALUES(%s,%s,%s,%s,%s)")
            data_yilong = (Nickname, pinglun, room, time, score)
            cursor.execute(insert_yilong, data_yilong)
            conn.commit()
    f.close()



猜你喜欢

转载自blog.csdn.net/memoirs_pz/article/details/84500872