Meituan Hotels
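A small crawler for Meituan hotel listings: it walks every city stored in MongoDB, queries the hotel search API for each hotel type, and uploads the raw JSON responses to Aliyun OSS.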

#!/root/.pyenv/shims/python3.6
# -*- coding:utf-8 -*-
# Author  : zhibo.wang
# E-mail  : [email protected]
# Date    : 18/04/12 16:11:28
# Desc    : Meituan hotels crawler


import time
import hashlib
import socket
import random
import json
import requests
from bs4 import BeautifulSoup
from data_utils.time_convert import get_time_stamp
from data_utils.conmongodb import mongo_con_keepalive
from data_utils.ali_oss import OSS2


class Crawl:
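    """Crawl Meituan hotel listings for every city stored in MongoDB and
    upload the raw JSON search results to Aliyun OSS."""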
    is_proxy = True  # route requests through the proxy pool below
    proxyMeta = "http://xxxx:[email protected]:9020"
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    start_url = "http://hotel.meituan.com/"
    time_stamp = get_time_stamp()
    path_dir = "hotel/meituan/{0}/".format(time_stamp)
    time_local = time.localtime(int(time_stamp))
    date = time.strftime("%Y%m%d", time_local)

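    # Search API URL template; "cityId=cityId" and "poi_attr_20022=poi_attr_20022"
    # are placeholders that start() substitutes per city and hotel type,
    # while startDay/endDay are filled with today's date.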
    data_url = "https://ihotel.meituan.com/hbsearch/HotelSearch" \
               "?utm_medium=pc" \
               "&version_name=999.9" \
               "&cateId=20" \
               "&attr_28=129" \
               "&uuid=" \
               "&cityId=cityId" \
               "&offset=0" \
               "&limit=20" \
               "&startDay={0}" \
               "&endDay={1}" \
               "&q=" \
               "&sort=defaults" \
               "&poi_attr_20022=poi_attr_20022".format(date, date)
    params_citys = "params_citys"  # MongoDB collection holding city parameters (cityname, meituan_id, meituan_code)
    website = "hotel_meituan"
    timeout = 20  # request timeout in seconds
    if is_proxy:
        wait_time = [0.16, 0.17]           # shorter delays when requests go through the proxy pool
    else:
        wait_time = [1, 1.1, 1.2, 1.3]     # delay between requests in seconds
    headers = {
            "Host": "hotel.meituan.com",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control" : "max-age=0",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Content-Type": "text/html"
        }

    def __init__(self):
        self.db = mongo_con_keepalive()
        self.db.get_collection('pathdir_dict').insert_one({'pathdir': self.path_dir, 'website': self.website, 'flag': False})
        self.oss = OSS2()
        super(Crawl, self).__init__()


    def req(self, url, headers, pattern=True, num=3):
        # Fetch a URL with up to `num` attempts; return a BeautifulSoup object
        # when pattern is True, otherwise the parsed JSON response.
        time.sleep(random.choice(self.wait_time))
        soup = None

        if not num:
            return soup
        try:
            if self.is_proxy:
                r = requests.get(url, headers=headers, timeout=self.timeout, proxies=self.proxies)
            else:
                r = requests.get(url, headers=headers, timeout=self.timeout)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                if pattern:
                    soup = BeautifulSoup(r.text, "html.parser")
                else:
                    soup = r.json()
            else:
                # non-200 response: retry with one fewer attempt left
                num -= 1
                return self.req(url, headers, pattern, num)

        except Exception as e:
            print("fun req error: ", e)
        return soup


    def get_hotel_type_code(self, city_data):
        # Scrape the hotel-type filter links from the city page and return them as {name, poi_attr} dicts
        city_url = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
        headers = self.headers
        soup = self.req(city_url, headers, pattern=True)
        end_data = None
        if soup:
            txt = soup.find_all("div", class_="search-row-content")[2]
            end_data = [{"name": i.get_text().strip(), "poi_attr": i.get("href").split("/")[-2].replace("c", "")} for i in txt.find_all("a")]

        return end_data


    def create_filename(self, url):
        # Build a unique file name from hostname, domain, URL hash and timestamp
        fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                          url.split('//')[-1].split('/')[0].replace('.', '-'),
                                          hashlib.md5(url.encode()).hexdigest(),
                                          str(time.time()).split('.')[0])
        return fname

    def get_data_totalcount(self, tot_url, headers):
        # Get the total result count for this query
        data = self.req(tot_url, headers, pattern=False)
        count = None
        if data:
            count = data.get("data").get("totalcount")
        return count


    def start(self):
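        # Walk every city from MongoDB, then every hotel type in that city,
        # page through the search API 20 results at a time, and upload each
        # JSON response to OSS.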
        city_datas = self.db.get_collection(self.params_citys).find({})
        for city_data in city_datas:
            cityname = city_data.get("cityname")
            if city_data.get("meituan_code"):
                referer = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
                hotel_type_codes = self.get_hotel_type_code(city_data)
                # hotel type filters available for this city
                if hotel_type_codes:
                    headers = self.headers
                    headers["Content-Type"] = "application/json, text/plain, */*"
                    headers["Host"] = "ihotel.meituan.com"
                    headers["Origin"] = "http://hotel.meituan.com"
                    headers["Referer"] = referer
                    for hotel_code in hotel_type_codes:
                        hotel_type_name = hotel_code.get("name")
                        hotel_type_code = hotel_code.get("poi_attr")
                        tot_url = self.data_url.replace("cityId=cityId", "cityId={0}".format(city_data.get("meituan_id")))\
                            .replace("poi_attr_20022=poi_attr_20022","poi_attr_20022={0}".format(hotel_code.get("poi_attr")))
                        totalcount = self.get_data_totalcount(tot_url, headers)
                        # total number of results for this hotel type
                        if totalcount:
                            all_url = [tot_url.replace("offset=0", "offset={0}".format(c)) for c in range(0, totalcount+1, 20)]
                            # build every paginated URL (20 results per page) from the total count
                            for url_ in all_url:
                                data = self.req(url_, headers, pattern=False)
                                if data:
                                    file_ = "{0}{1}".format(self.path_dir, self.create_filename(url_))
                                    data["cityname"] = cityname
                                    data["hotel_type_name"] = hotel_type_name
                                    data["hotel_type_code"] = hotel_type_code
                                    self.oss.uploadfiledata(file_, json.dumps(data))


if __name__ == "__main__":
    C = Crawl()
    C.start()

  


Reposted from www.cnblogs.com/dockers/p/9238473.html