携程酒店爬虫——多线程版。

# _*_coding: utf-8 _*_
from fake_useragent import UserAgent
import requests
from requests.exceptions import RequestException
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv
from lxml import etree
import threading
from queue import Queue



ua = UserAgent()

class XieCheng():


    def __init__(self):
        self.start_url = "http://hotels.ctrip.com/hotel/beijing1/p{}"
        self.details_url = "http://hotels.ctrip.com"     # 详情页url
        self.headers = {"User-Agent":ua.random}
        self.data_list = []
        self.details_list = []
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()
        # self.details_url_queue= Queue()
        self.details_html_queue = Queue()
        self.details_content_queue = Queue()


    # 构造列表页url列表
    def list_url(self):
        # url_list = [self.start_url.format(i) for i in range(1,500)]
        # return url_list
        for i in range(1,567):
            self.url_queue.put(self.start_url.format(i))
            # s = self.url_queue.get()
            # print(s)


    # 请求列表页
    def get_url(self):
        try:
            while True:
                url = self.url_queue.get()
                print(url)
                response = requests.get(url=url,headers=self.headers)
                if response.status_code ==200:
                    # return response.content.decode()
                    self.html_queue.put(response.content.decode())
                    self.url_queue.task_done()
                return None
        except RequestException:
            return None
    #提取详情页url
    def extract_data(self):
        while True:
            html_str = self.html_queue.get()
            print('----'*20)
            html_list = etree.HTML(html_str)
            div_list = html_list.xpath("//div[@id='hotel_list']/div/ul/li[2]/h2/a/@href")
            for div_url in div_list:
                url_str = self.details_url + div_url
                print(url_str)
                self.content_queue.put(url_str)
            self.html_queue.task_done()

     # 使用selenium请求详情页
    def driver_get(self):
        while True:
            details_url_s = self.content_queue.get()
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            # 添加请求头
            dcap["phantomjs.page.settings.userAgent"] = (ua.random)
            # 取消图片加载
            dcap["phantomjs.page.settings.loadImages"] = False
            driver = webdriver.PhantomJS(desired_capabilities=dcap)
            driver.get(details_url_s)
            print(details_url_s)
            print("数据请求中。。。。。")
            time.sleep(3)
            details_html_str = driver.page_source
            driver.quit()
            self.details_html_queue.put(details_html_str)
            self.content_queue.task_done()
    # 提取详情页数据
    def driver_data(self,):
        while True:
            details_html_str= self.details_html_queue.get()
            html_str = etree.HTML(details_html_str)
            details_data_list= []
            item_dict = {}
            html_str_list = html_str.xpath(".//tr[@data-disable='0']")
            item_dict['name'] = html_str.xpath("//*[@id='J_htl_info']/div[1]/h2[1]/text()") if len(
                html_str.xpath("//*[@id='J_htl_info']/div[1]/h2[1]/text()")) > 0 else None
            item_dict['id'] = html_str.xpath("//a[@id='linkViewMap']/@data-hotelid") if len(
                html_str.xpath("//a[@id='linkViewMap']/@data-hotelid")) > 0 else None
            details_data_list.append(item_dict)
            for html in html_str_list:
                item = {}
                item['price'] = html.xpath(".//span[@class='base_price']/text()") if len(
                    html.xpath(".//span[@class='base_price']/text()")) > 0 else None
                item['bed'] = html.xpath(".//td[@class='col3']/text()") if len(
                    html.xpath(".//td[@class='col3']/text()")) >0 else None
                details_data_list.append(item)
            print(details_data_list)
            self.details_content_queue.put(details_data_list)
            self.details_html_queue.task_done()
    #保存
    def save_data(self):
        while True:
            details_data=self.details_content_queue.get()
            title = ['id', 'name', 'price', 'bed']
            with open('xiechen.csv', 'a+', encoding='utf-8') as f:
                f_csv = csv.DictWriter(f,title)
                f_csv.writeheader()
                f_csv.writerows(details_data)
                print("数据保存完成。。。。。。")
            self.details_content_queue.task_done()

    # 主函数
    def run(self):
        thread_list = []
        # 构造url列表
        t_url=threading.Thread(target=self.list_url)
        thread_list.append(t_url)
        # 请求列表页
        for i in range(567):
            t_g_url=threading.Thread(target= self.get_url)
            thread_list.append(t_g_url)
        # 提取详情页url
        for i in range(10):
            t_extract=threading.Thread(target=self.extract_data)
            thread_list.append(t_extract)
        # 请求详情页
        for i in range(7):
            t_details=threading.Thread(target=self.driver_get)
            thread_list.append(t_details)

        #提取详情页数据
        for i in range(5):
            t_details_data=threading.Thread(target=self.driver_data)
            thread_list.append(t_details_data)
        #保存
        for i in range(3):
            t_save=threading.Thread(target=self.save_data)
            thread_list.append(t_save)

        for t in thread_list:
            t.setDaemon(True)  # 设置子线程守护
            t.start()

        for q in [self.url_queue,
                  self.html_queue,
                  self.content_queue,
                  # self.details_url_queue,
                  self.details_html_queue,
                  self.details_content_queue,]:
            q.join()   # 阻塞

        print("主线程结束")
if __name__ == '__main__':
    xc = XieCheng()
    xc.run()

感觉多线程和 PhantomJS 的处理都不够完善,想多写一些判断和反爬措施。

猜你喜欢

转载自www.cnblogs.com/ittop/p/9471729.html