Dianping review crawler

## Dianping Review Crawler Scripts Guide

### Preparation before crawling

- Install the MySQL database and start the service
- Modify the database username, password, etc. in mysqls.py, and create the corresponding database and table; you can use the mysqls.creat_table() function
- Log in to the official Dianping website, grab the current cookie with Chrome developer tools or similar, and update the cookie variable in main.py
- Look up the ID of the shop to crawl and the number of its review pages, and modify the corresponding places in main.py
- If xuchuan.txt exists (it saves the current crawling progress), delete it before crawling (it must be deleted once for each shop)

### During crawling

- Roughly every 100 pages or so of crawling, the site requires verification. When you find that 0 comments are returned or an exception occurs, open the review page in a browser, slide the slider to unlock it, and then restart the program; resume-from-breakpoint is supported, so no progress is lost ~ (see the sketch after this list)
- When switching to another shop, replace the shop ID and the number of review pages, and also delete xuchuan.txt
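A minimal sketch of how the resume file can be inspected and reset (this mirrors the xuchuan() helper in main.py further below; xuchuan.txt is the progress file used by this project):

import os

# Each successfully crawled page number is appended to xuchuan.txt,
# so its last line tells the crawler where to resume.
if os.path.exists('xuchuan.txt'):
    with open('xuchuan.txt', 'r') as f:
        last_page = int(f.readlines()[-1])
    print('Resume from page', last_page + 1)

# When switching to a new shop, delete the progress file first:
# os.remove('xuchuan.txt')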

### After crawling

The data stored in the MySQL database can then be read in a variety of ways ~

 

1. Install MySQL, install Navicat, and create a database named dianping.
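If you prefer to create the database from Python rather than in Navicat, a minimal sketch (assuming the same localhost/root/root credentials used in mysqls.py below):

import pymysql

# Connect without selecting a database, then create `dianping`.
conn = pymysql.connect(host='localhost', user='root', password='root')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS dianping DEFAULT CHARACTER SET utf8")
conn.close()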

2. Wrap the database operations in a helper module, mysqls.py:

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 24 15:45:05 2018

@author: bin
"""

import pymysql

# Connect to the MySQL database
db = pymysql.connect(host="localhost", user="root", password="root", database="dianping")
cursor = db.cursor()

# Create the table in the database
def creat_table():
    cursor.execute("DROP TABLE IF EXISTS DZDP")
    sql = '''CREATE TABLE DZDP(
            cus_id varchar(100),
            comment_time varchar(55),
            comment_star varchar(55),
            cus_comment text(5000),
            kouwei varchar(55),
            huanjing varchar(55),
            fuwu varchar(55),
            shopID varchar(55)
            );'''
    cursor.execute(sql)
    return

# Store one crawled record
def save_data(data_dict):
    sql = '''INSERT INTO DZDP(cus_id,comment_time,comment_star,cus_comment,kouwei,huanjing,fuwu,shopID) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'''
    value_tup = (data_dict['cus_id']
                 ,data_dict['comment_time']
                 ,data_dict['comment_star']
                 ,data_dict['cus_comment']
                 ,data_dict['kouwei']
                 ,data_dict['huanjing']
                 ,data_dict['fuwu']
                 ,data_dict['shopID']
                 )
    try:
        cursor.execute(sql,value_tup)
        db.commit()
    except:
        print('数据库写入失败')  # database write failed
    return

# Close the database connection
def close_sql():
    db.close()

3. Create a table

creat_table()
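For reference, a crawled record is later written with mysqls.save_data(); a minimal standalone sketch with made-up field values (the keys must match the DZDP columns):

import mysqls

sample = {'cus_id': 'test_user',
          'comment_time': '2018-07-24',
          'comment_star': 'sml-str40',   # hypothetical star class value
          'cus_comment': 'A placeholder review used only for testing.',
          'kouwei': '好',
          'huanjing': '好',
          'fuwu': '好',
          'shopID': '521698'}
mysqls.save_data(sample)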

4. Create a proxies.txt file in the current directory containing the proxy pool. You can fill it with proxies manually, or build a proxy pool yourself; for the latter you can refer to CRAW_IP.py.
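Each line of proxies.txt is expected to look like scheme://host:port. A minimal sketch of how one line maps to the proxies argument of requests (this mirrors get_random_ip() in main.py below):

import random
import requests

ips = [line.strip() for line in open('proxies.txt') if line.strip()]
proxy = random.choice(ips)              # e.g. 'https://59.37.18.243:3128'
proxies = {proxy.split(':')[0]: proxy}  # -> {'https': 'https://59.37.18.243:3128'}

# Quick sanity check; this may fail if the free proxy is already dead.
print(requests.get('http://www.baidu.com', proxies=proxies, timeout=5).status_code)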

 

CRAW_IP.py:

import random
from multiprocessing import Process, Queue

import requests
import lxml  # BeautifulSoup below uses the lxml parser
from bs4 import BeautifulSoup


class Proxies(object):
    """Scrape free proxies from xicidaili.com and verify them."""

    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def get_proxies(self):
        # Collect proxies from the /nt pages of xicidaili.com
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nt/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                # Column 5 holds the protocol, columns 1-2 hold IP and port
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def get_proxies_nn(self):
        # Collect proxies from the /nn pages of xicidaili.com
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nn/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def verify_proxies(self):
        # Queue of proxies not yet verified
        old_queue = Queue()
        # Queue of verified proxies
        new_queue = Queue()
        print('verify proxy........')
        works = []
        for _ in range(15):
            works.append(Process(target=self.verify_one_proxy, args=(old_queue, new_queue)))
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        for work in works:
            # One sentinel value per worker process tells it to stop
            old_queue.put(0)
        for work in works:
            work.join()
        self.proxies = []
        while 1:
            try:
                self.proxies.append(new_queue.get(timeout=1))
            except:
                break
        print('verify_proxies done!')

    def verify_one_proxy(self, old_queue, new_queue):
        while 1:
            proxy = old_queue.get()
            if proxy == 0:
                break
            protocol = 'https' if 'https' in proxy else 'http'
            proxies = {protocol: proxy}
            try:
                # Keep the proxy only if a test request through it succeeds
                if requests.get('http://www.baidu.com', proxies=proxies, timeout=2).status_code == 200:
                    print('success %s' % proxy)
                    new_queue.put(proxy)
            except:
                print('fail %s' % proxy)


if __name__ == '__main__':
    a = Proxies()
    a.verify_proxies()
    print(a.proxies)
    proxie = a.proxies
    with open('proxies.txt', 'a') as f:
        for proxy in proxie:
            f.write(proxy + '\n')




proxies.txt:

https://59.37.18.243:3128
https://183.129.207.74:14823
https://49.73.6.90:3128
https://115.239.255.190:3128
https://203.86.26.9:3128
https://120.92.74.189:3128
http://183.62.196.10:3128
https://183.129.244.17:10010
https://171.221.239.11:808
https://14.29.32.106:53281
https://218.60.8.83:3129
https://183.129.207.80:21776
https://203.130.46.108:9090
https://183.21.81.58:40539
https://182.18.13.149:53281
https://114.113.126.83:80
https://118.212.95.34:53281
https://114.113.126.82:80
https://183.129.207.78:18118
https://211.101.136.86:8080
https://114.249.112.16:9000
https://163.125.68.149:8888
https://111.202.37.195:8080
https://61.145.203.234:38695
https://119.254.94.92:48494
https://27.46.20.55:888
https://175.6.2.174:8088
https://59.72.126.3:8123
https://59.37.26.226:8080
https://120.27.14.125:80
https://61.140.108.57:54689
https://58.240.220.86:53281
https://183.30.201.8:9797
https://111.170.156.182:53281
https://218.15.25.157:8088
https://180.173.152.33:9000
https://117.35.51.77:53281
https://119.90.126.106:7777
https://121.228.125.27:3128
https://218.89.222.110:9999
https://61.155.112.228:61591
https://171.37.30.82:9797
https://125.123.122.59:9000
https://125.123.143.171:9000
https://60.191.57.79:3128
https://163.125.19.43:9999
https://112.65.19.122:8080
https://163.125.17.241:8888
https://163.125.17.238:8888
https://180.213.181.96:8118
https://114.86.227.164:33657
https://118.187.50.154:8080
https://118.190.217.182:80
https://118.190.217.61:80
http://183.129.244.13:10800
https://125.123.127.24:9000
https://124.237.83.14:53281
https://163.125.74.243:9797
https://61.175.172.216:8123
https://175.152.223.235:8123
https://123.165.115.55:9797
https://223.245.127.165:44765
https://59.78.1.5:1080
https://118.25.177.187:1080
https://59.39.196.122:55637
https://119.4.172.217:8118
https://116.30.123.148:9000
https://112.74.207.50:3128
https://14.149.68.120:1080
https://58.251.233.122:9797
https://182.88.187.149:9797
https://182.150.63.89:46073
https://163.125.70.70:9999
https://58.251.234.137:9797
https://101.132.122.230:3128
https://119.129.98.65:45522
https://112.81.143.172:8118
https://220.184.129.224:3128
https://112.250.109.173:53281
https://116.196.92.155:1080
https://14.20.235.117:808
https://182.88.187.83:9797
https://110.52.8.171:53281
https://159.226.170.42:3128
https://121.9.199.70:32431
https://113.118.201.133:9797
https://58.250.23.210:1080
https://119.250.26.39:9000
https://171.36.179.27:9797
https://175.25.185.57:3128
https://118.190.155.23:80
https://114.119.116.93:61066
https://171.36.210.248:9797
https://112.193.130.123:8118
https://123.183.11.166:53386
https://118.186.2.210:8080
https://112.64.38.161:51099
https://222.186.45.146:63756
https://183.14.76.165:9797
https://163.125.19.88:9999
https://218.6.16.233:8118
https://180.168.210.132:80
https://61.164.39.69:53281
https://61.130.9.249:3128
https://122.143.117.8:8080
https://180.162.34.149:9797
https://115.231.50.10:53281
https://112.95.205.63:8888
https://112.95.205.71:8888
https://115.151.4.6:53128
https://110.73.40.17:8123
https://121.207.0.115:808
https://118.180.85.201:8123
https://61.157.206.182:60460
https://124.200.104.234:47076
https://61.157.206.170:42379
https://221.234.192.10:8010
https://59.32.37.7:3128
https://1.183.163.137:53077
https://59.49.22.231:30151
https://27.22.104.28:39560
https://61.160.233.214:39522
https://59.32.37.246:8010
https://115.46.79.110:8123
https://110.73.10.53:8123
https://110.73.43.173:8123
https://183.63.17.253:54174
https://121.9.199.51:59134
https://123.163.20.37:35249
https://61.158.187.118:56524
https://61.157.206.187:37667
https://203.93.125.238:51108
https://223.203.0.14:8080
https://221.224.62.243:51941
https://114.225.169.161:53128
https://124.77.92.239:31307
https://27.153.128.207:8010
https://110.188.0.64:35137
https://115.238.105.108:808
https://61.133.245.70:35652
https://60.211.192.54:40700
https://171.37.155.232:8123
https://221.232.193.223:8010
https://27.190.26.57:8118
https://221.224.212.11:23500
https://180.118.240.51:61234
https://113.106.97.148:38257
https://119.97.23.87:8123
https://1.183.163.101:52524
https://61.157.206.172:59656
https://121.205.254.201:8010
https://61.157.206.178:34692
https://115.46.74.160:8123
https://120.5.162.224:32290
https://61.154.49.38:59675
https://61.160.233.215:48478
https://119.123.77.41:31425
https://114.225.170.217:53128
https://113.17.36.96:47399
https://114.112.70.150:57871
https://123.207.30.131:80
https://119.254.94.97:41697
https://115.46.73.129:8123
https://115.221.112.122:25903
https://115.211.231.66:8010
https://221.232.192.206:8010
https://182.88.166.78:8123
https://115.46.67.43:8123
https://121.205.254.192:808
https://175.148.73.231:1133
https://183.129.153.122:36839
https://139.196.111.17:42589
https://60.12.214.184:33507
https://117.85.86.73:53128
https://115.46.77.225:8123
https://121.31.177.217:8123
https://110.73.42.191:8123
https://222.85.22.167:8010
https://119.48.97.137:80
https://218.79.113.92:30366
https://101.236.55.145:8866
https://116.235.75.177:61922
https://220.248.125.82:8118
https://121.60.76.28:8010
https://116.17.236.52:8010
https://115.223.114.224:8010
https://122.246.51.176:8010
https://59.45.27.245:50858
https://171.37.153.33:8123
https://121.225.26.218:3128
https://180.118.243.93:61234
https://115.46.78.208:8123
https://175.148.76.72:1133
https://223.244.252.58:45744
https://115.223.117.127:8010
https://59.46.112.34:43858
https://117.114.144.195:35070
https://180.118.243.52:61234
https://180.110.7.46:3128
https://106.42.208.201:8010
https://42.236.151.226:37848
https://221.2.207.205:51030
https://114.80.216.171:54408
https://119.254.94.95:43150
https://121.31.153.170:8123
https://113.121.242.173:808
https://122.138.16.158:80
https://182.88.129.168:8123
https://113.200.27.10:53281

5. Create main.py and start the crawler

Notes:

5.1. Log in to the official Dianping website, grab the current cookie with Chrome developer tools, and update the cookie variable in main.py.

5.2. Look up the ID of the shop to crawl and the number of its review pages, and modify the corresponding places in main.py (see the URL sketch after these notes).

5.3. If xuchuan.txt exists (it saves the current crawling progress), delete it before crawling (delete it once for each shop).

5.4. Roughly every 100 pages or so of crawling, verification is required: when 0 comments come back or an exception occurs, open the review page in a browser, slide the slider to unlock it, and then restart the program; resume-from-breakpoint is supported, it really works ~

5.5. When switching to another shop, replace the shop ID and the number of review pages, and also delete xuchuan.txt.
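The shop ID and page count from notes 5.2 and 5.5 go straight into the review-page URL that main.py builds; a minimal sketch (521698 is just the example shop used in this post):

shopID = '521698'   # example shop ID
page = 6            # review page number
url = 'http://www.dianping.com/shop/' + shopID + '/review_all/' + 'p' + str(page)
print(url)          # -> http://www.dianping.com/shop/521698/review_all/p6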

# -*- coding: utf-8 -*-
"""
Created on Mon Jul  9 16:42:52 2018

@author: bin
"""

# Goal: crawl the reviews of the target shop

import requests
from bs4 import BeautifulSoup
import time, random
import mysqls
import re
from fake_useragent import UserAgent
import os

ua = UserAgent()

# Set the cookies (copied from the browser after logging in)
# cookie = "_lxsdk_cuid=162760423dfc8-0801f141cb0731-3b60490d-e1000-162760423dfc8; _lxsdk=162760423dfc8-0801f141cb0731-3b60490d-e1000-162760423dfc8; _hc.v=af7219c3-2b99-8bb8-f9b2-7b1d9be7f29e.1522398406; s_ViewType=10; ua=%E4%BB%A4%E7%8B%90%E5%86%B2; ctu=029e953356caf94d20233d299a70d285a03cb64585c371690b17d3e59c4c075c; cye=guangzhou; Hm_lvt_e6f449471d3527d58c46e24efb4c343e=1531964746; cy=4; dper=8c6ae023e893759ea57ce154028f1800be56b69450806b893b9cf5c6b6c3e3ba3c986c9a603bcbf9a7fb18dcd2038cf704b3e3baba3532bc7dffec965fe5e6c3b2479ca21c6577a1f5636088acbba8936df6ac994e02a923a907907a938559f9; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=1661889a264-50e-66f-22a%7C%7C276"
cookie="_lxsdk_cuid=16a4347ce37c8-0a8bfc61a70c74-5a442916-15f900-16a4347ce3750; _lxsdk=16a4347ce37c8-0a8bfc61a70c74-5a442916-15f900-16a4347ce3750; Hm_lvt_e6f449471d3527d58c46e24efb4c343e=1555906941; _hc.v=0f9cc4f8-5fa4-ea4c-262c-306a344b9a8e.1555906941; cy=2; cye=beijing; _dp.ac.v=72a808e1-1de4-45eb-8962-eefb0a179eb7; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_0295126037; ctu=255d09ddf69958c91b07ce9c01164c9c8c6144674a4190c49410595ebe1a95d7; uamo=17010209086; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=16a6820f2d8-3ed-bfa-b9e%7C%7C192"
# Build the request headers
headers = {
        'User-Agent':ua.random,
        'Cookie':cookie,
        'Connection':'keep-alive',
        'Host':'www.dianping.com',
        'Referer': 'http://www.dianping.com/shop/521698/review_all/p6'
}

# Load the IP proxy pool (skipping blank lines) and pick a random IP from it
ips = [line.strip() for line in open('proxies.txt', 'r') if line.strip()]

def get_random_ip():
    ip = random.choice(ips)
    pxs = {ip.split(':')[0]: ip}
    return pxs

# Fetch an HTML page
def getHTMLText(url,code="utf-8"):
    try:
        time.sleep(random.random()*6 + 2)
        r=requests.get(url, timeout = 5, headers=headers, 
                      proxies=get_random_ip()
                       )
        r.raise_for_status()
        r.encoding = code
        return r.text
    except:
        print("产生异常")
        return "产生异常"

# Reviews may contain emoji, which are 4-byte characters; the MySQL table (utf8) cannot store them, so they are filtered out
def remove_emoji(text):
    try:
        highpoints = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return highpoints.sub(u'',text)

# Extract the required fields from the HTML
def parsePage(html,shopID):
    infoList = []  # each item of this list is a dict holding one review's fields
    soup = BeautifulSoup(html, "html.parser")
    
    for item in soup('div','main-review'):
        cus_id = item.find('a','name').text.strip()
        comment_time = item.find('span','time').text.strip()
        try:
            comment_star = item.find('span',re.compile('sml-rank-stars')).get('class')[1]
        except:
            comment_star = 'NAN'
        cus_comment = item.find('div',"review-words").text.strip()
        scores = str(item.find('span','score'))
        try:
            kouwei = re.findall(r'口味:([\u4e00-\u9fa5]*)',scores)[0]
            huanjing = re.findall(r'环境:([\u4e00-\u9fa5]*)',scores)[0]
            fuwu = re.findall(r'服务:([\u4e00-\u9fa5]*)',scores)[0]
        except:
            kouwei = huanjing = fuwu = '无'
        
        infoList.append({'cus_id':cus_id,
                         'comment_time':comment_time,
                         'comment_star':comment_star,
                         'cus_comment':remove_emoji(cus_comment),
                         'kouwei':kouwei,
                         'huanjing':huanjing,
                         'fuwu':fuwu,
                         'shopID':shopID})
    return infoList

# Build the URL of each review page and store the crawled information
def getCommentinfo(shop_url, shopID, page_begin, page_end):
    for i in range(page_begin, page_end):
        try:
            url = shop_url + 'p' + str(i)
            html = getHTMLText(url)
            infoList = parsePage(html,shopID)
            print('成功爬取第{}页数据,有评论{}条'.format(i,len(infoList)))
            for info in infoList:
                mysqls.save_data(info)
            # Record the breakpoint (last successfully crawled page) for resuming
            if (html != "产生异常") and (len(infoList) != 0):
                with open('xuchuan.txt','a') as file:
                    duandian = str(i)+'\n'
                    file.write(duandian)
            else:
                print('休息60s...')
                time.sleep(60)
        except:
            print('跳过本次')
            continue
    return

def xuchuan():
    if os.path.exists('xuchuan.txt'):
        file = open('xuchuan.txt','r')
        nowpage = int(file.readlines()[-1])
        file.close()
    else:
        nowpage = 0
    return nowpage

# Crawl a shop's reviews given its ID and number of review pages
def craw_comment(shopID='521698',page = 53):
    shop_url = "http://www.dianping.com/shop/" + shopID + "/review_all/"
    # Read the resume point left by a previous run
    nowpage = xuchuan()
    getCommentinfo(shop_url, shopID, page_begin=nowpage+1, page_end=page+1)
    mysqls.close_sql()
    return

if __name__ == "__main__":
    craw_comment()
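When switching to another shop (note 5.5), only the call in the __main__ block above needs to change; a minimal sketch with a hypothetical shop ID and page count:

if __name__ == "__main__":
    # Hypothetical example: a different shop with 30 pages of reviews.
    # Delete xuchuan.txt before switching shops (note 5.5).
    craw_comment(shopID='98765432', page=30)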
        

6. The data stored in the MySQL database can then be read in a variety of ways ~
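For example, a minimal sketch that reads the crawled reviews back into a pandas DataFrame (assuming pandas is installed and the same localhost/root/root credentials as above):

import pandas as pd
import pymysql

db = pymysql.connect(host='localhost', user='root', password='root', database='dianping')
df = pd.read_sql('SELECT * FROM DZDP', con=db)
db.close()
print(df.shape)
print(df.head())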

7. If you want to do follow-up text mining on the data, you can refer to two more posts on my blog: an exploratory analysis at https://blog.csdn.net/weixin_40903057/article/details/89705923 and a sentiment analysis of Dianping reviews at https://blog.csdn.net/weixin_40903057/article/details/89706111