Daily crawler practice: scraping used-car listings from Guazi

Crawler exercise for 2020-03-31

Target site: Guazi used cars (guazi.com)

Requirements:

1. Scrape the basic information and asking price of each used-car listing on Guazi.
2. Volume: 500 records; a quick test confirmed that 500 records can be collected without problems.

Libraries used:

1. requests
2. lxml

import requests
from lxml import etree
import time, random

class Guazi:
    def __init__(self):
        # Request headers
        self.headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'uuid=53a834f1-558a-4419-d444-57c7c0b08501; antipas=83297R9Z1320t999704Fg2958w647; clueSourceCode=%2A%2300; ganji_uuid=3436671079935985333508; sessionid=c64aa9ce-2daa-40e6-f0c9-6170a5aac324; lg=1; lng_lat=114.4273_30.38912; gps_type=1; close_finance_popup=2020-03-31; user_city_id=194; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A61315524923%7D; cityDomain=wh; track_id=60778091018690560; preTime=%7B%22last%22%3A1585645826%2C%22this%22%3A1585645268%2C%22pre%22%3A1585645268%7D; guazitrackersessioncadata=%7B%22ca_kw%22%3A%22%25e7%2593%259c%25e5%25ad%2590%25e4%25ba%258c%25e6%2589%258b%25e8%25bd%25a6%22%7D; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22%25e7%2593%259c%25e5%25ad%2590%25e4%25ba%258c%25e6%2589%258b%25e8%25bd%25a6%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2260778091018690560%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%2253a834f1-558a-4419-d444-57c7c0b08501%22%2C%22sessionid%22%3A%22c64aa9ce-2daa-40e6-f0c9-6170a5aac324%22%2C%22ca_city%22%3A%22wh%22%7D',
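            # NOTE: this Cookie value is tied to the author's browser session; replace it with one copied from your own browser if requests start failing.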
            'Host': 'www.guazi.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.guazi.com/wh/buy/',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

        }

        # Proxy IPs
        self.all_proxies = [{'http': 'http://115.28.148.192:8118'}, {'http': 'http://221.4.150.7:8181'},
                            {'http': 'http://112.85.131.172:9999'}, {'http': 'http://122.193.245.159:9999'},
                            {'http': 'http://101.231.234.38:8080'}]  # find your own free proxies; these examples will likely be dead
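        # Tip: free proxies go stale quickly; testing each entry with a short
        # timeout request before the crawl and dropping failures saves headaches.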

    # Fetch a URL through a random proxy and yield the parsed HTML tree
    def parse_url(self, url):
        result = requests.get(url=url, headers=self.headers,
                              proxies=random.choice(self.all_proxies),
                              timeout=10)  # a timeout keeps a dead proxy from hanging the crawl
        if result.status_code == 200:
            text = result.content.decode('utf-8')
            html = etree.HTML(text)
            yield html

    # Extract each detail-page URL from a listing page
    def get_detail_urls(self, html):
        hrefs = html.xpath('//ul[@class="carlist clearfix js-top"]/li/a/@href')
        for href in hrefs:
            page_url = 'https://www.guazi.com' + href
            yield page_url

    # Extract the car's details from a detail page
    def parse_detail_page(self, html, url):
        title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
        car_title = title.replace('\r', '').replace('\n', '').strip()  # strip newlines and surrounding whitespace from the title
        car_info = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')
        car_price = html.xpath('//div[@class="price-main"]/span/text()')  # price (full payment / loan)
        yield (car_title, car_info[0], car_info[1], car_info[2], car_info[3], car_price[0], url)

    # Append one car's record to a local CSV file (the ./rest directory must already exist)
    def save_data(self, data):
        with open('./rest/guazi_cs.csv', 'a', encoding='utf-8') as f:
            f.write('{},{},{},{},{},{},{}\n'.format(data[0], data[1], data[2], data[3], data[4], data[5], data[6]))
            print('{} saved.'.format(data[0]))

    # Main crawl logic: walk the listing pages and scrape every detail page
    def func(self, url, count):
        for x in range(count):  # iterate over the listing pages
            list_url = url + str(x)  # build each page URL from the base; reassigning url here would accumulate suffixes
            for html in self.parse_url(list_url):
                time.sleep(2)  # throttle requests; crawling too fast trips the anti-bot defenses
                for page_url in self.get_detail_urls(html):
                    for page_html in self.parse_url(page_url):
                        for data in self.parse_detail_page(page_html, page_url):
                            self.save_data(data)


if __name__ == '__main__':
    obj = Guazi()
    obj.func('https://www.guazi.com/wh/buy/', 2)
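One weakness of save_data above: fields are joined with bare commas, so a title or spec that itself contains a comma will corrupt the row. A minimal sketch of a safer writer using the standard csv module (same file path and field order; save_data_csv is a hypothetical replacement name):

import csv
import os

def save_data_csv(data, path='./rest/guazi_cs.csv'):
    os.makedirs(os.path.dirname(path), exist_ok=True)  # create ./rest if it is missing
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(data)  # csv handles quoting of embedded commas
        print('{} saved.'.format(data[0]))

Dropping this in for Guazi.save_data changes only how rows are serialized; the crawl flow stays the same.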

Reposted from blog.csdn.net/weixin_42444693/article/details/105235097