Crawling the latest housing listings for every region on Lianjia

Using Scrapy: build the listing URLs by string concatenation, find the pagination parameter, and save the results as JSON.
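Each city's listing URL follows a fixed pattern: the city root plus ershoufang/pg{n}co32/, where pg{n} is the page number and co32 appears to be the sort-by-newest parameter. A minimal sketch of the concatenation (the city root here is a hypothetical example; the real ones are scraped from the city index page):

city_root = 'https://hf.lianjia.com/'  # hypothetical example; real roots come from https://www.lianjia.com/city/
page_urls = [city_root + 'ershoufang/pg{}co32/'.format(n) for n in range(1, 4)]
# -> ['https://hf.lianjia.com/ershoufang/pg1co32/', 'https://hf.lianjia.com/ershoufang/pg2co32/', ...]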

lj.py

# -*- coding: utf-8 -*-

import copy
import re
import time
import scrapy

from lianjia.items import LianjiaItem

class LjSpider(scrapy.Spider):
    name = 'lj'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    cookies_str = """lianjia_uuid=47a174e2-625a-4b22-a3c2-25fd1ec31b81; _ga=GA1.2.1898977159.1552302637; _gid=GA1.2.96347100.1552302637; lianjia_ssid=95f26562-a063-4c30-a718-28df4f75bc9c; _smt_uid=5c864d54.1796444c; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1552305492; gr_user_id=f98cfb4a-0fd7-48b6-bd92-84b72f7cda35; gr_session_id_a1a50f141657a94e=50c2d270-7331-4957-9f3c-ea51bf6dc831; gr_session_id_a1a50f141657a94e_50c2d270-7331-4957-9f3c-ea51bf6dc831=true; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1552307519; select_city=341100; lj_newh_session=eyJpdiI6IjRmZGdtYjR6Q3FEK2RoVVVBbGRib3c9PSIsInZhbHVlIjoicGR4N3hyZVwvRHN2dEFaR3pBY1Jodm1QVkZ2QVFuSTdia0RcL1wvVUpCU2JjZnpxTWRNWE9JTWxWOG1OZUZWMU52bXdZQ1wveGk0cUViK1hrZFVYblwvVlpiQT09IiwibWFjIjoiN2ZhOGU5N2Y0YWVmOGIyYjRmM2I4YTdmNzQzNDMxMzk5N2ZlYjQzNmU1MzI3OTQ0YTM3YjE4NDhlMDRkZTM2NyJ9"""
    # split each cookie on the first '=' only, since cookie values can contain '=' themselves
    cookies_dict = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookies_str.split('; ')}

    def parse(self, response):
        # every city link on the national city index page
        city_list = response.xpath('//div/ul/li/div/div/ul/li/a')
        for data in city_list:
            # NOTE: time.sleep() blocks Scrapy's event loop; DOWNLOAD_DELAY in
            # settings.py is the idiomatic way to throttle requests
            time.sleep(1)
            city_dict = LianjiaItem()
            city_dict['name'] = data.xpath('./text()').extract_first()
            # pg1 = first page, co32 appears to sort listings by newest first
            city_dict['link'] = data.xpath('./@href').extract_first() + 'ershoufang/pg1co32/'
            yield scrapy.Request(city_dict['link'], encoding='utf-8', cookies=self.cookies_dict,
                                 callback=self.parse_city, meta={'city_h': copy.deepcopy(city_dict)})

    def parse_city(self, response):
        city_dict = response.meta['city_h']
        # the info block of each listing on the page
        house_list = response.xpath('//div[1]/ul/li/div[1]')
        for house in house_list:
            housing_area = house.xpath('./div[3]/div/a/text()').extract_first()
            if housing_area:
                city_dict['housing_area'] = housing_area
                city_dict['housing_total_price'] = house.xpath('./div[6]/div[1]/span/text()').extract_first()
                city_dict['housing_unit_price'] = house.xpath('./div[6]/div[2]/span/text()').extract_first()
                # yield a copy so later iterations don't overwrite items already queued
                yield copy.deepcopy(city_dict)
        # city root link from the nav bar, used to build the URLs of the remaining pages
        city_url = response.xpath('//body/div[1]/div/ul/li[2]/a/@href').extract_first()
        # the total page count is embedded in the page's JavaScript as totalPage":N
        page_total = re.findall(r'totalPage":(\d{1,3})', response.text)
        print(page_total)
        if page_total:
            pages = int(page_total[0])
            # request the remaining pages
            for city_page in range(2, pages + 1):
                if city_page > 3:  # capped at page 3 for testing; remove to crawl every page
                    return
                page_url = city_url + '/pg{}co32/'.format(city_page)
                yield scrapy.Request(page_url, callback=self.parse_city, meta={'city_h': city_dict})
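
The spider is then started from the project root with the standard Scrapy command:

scrapy crawl lj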

pipelines.py

If there is a lot of data to save, JsonLinesItemExporter is the better choice (a sketch follows the pipeline code below).

from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter

class AqiJsonPipeline(object):
    def open_spider(self, spider):
        self.file = open('lianjia2.json', 'wb')
        # JsonItemExporter buffers items and writes them out as a single JSON array
        self.writer = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.writer.start_exporting()

    def process_item(self, item, spider):
        self.writer.export_item(item)
        return item

    def close_spider(self, spider):
        self.writer.finish_exporting()
        self.file.close()
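
As the note above says, JsonLinesItemExporter is the better fit when there is a lot of data: it writes one JSON object per line as each item arrives, instead of buffering everything into a single JSON array. A minimal sketch with the same pipeline structure (the class name and .jl filename are my own choices):

class LianjiaJsonLinesPipeline(object):
    def open_spider(self, spider):
        self.file = open('lianjia2.jl', 'wb')  # .jl is the usual extension for JSON Lines
        self.writer = JsonLinesItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.writer.start_exporting()

    def process_item(self, item, spider):
        self.writer.export_item(item)  # serialized and written immediately
        return item

    def close_spider(self, spider):
        self.writer.finish_exporting()
        self.file.close()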

items.py

import scrapy

class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    link = scrapy.Field()
    housing_area = scrapy.Field()
    housing_total_price = scrapy.Field()
    housing_unit_price = scrapy.Field()
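
A pipeline only runs once it is enabled in settings.py. A minimal sketch, assuming the default lianjia project layout (the priority value 300 is arbitrary):

ITEM_PIPELINES = {
    'lianjia.pipelines.AqiJsonPipeline': 300,
}
DOWNLOAD_DELAY = 1  # non-blocking throttle; replaces the time.sleep(1) in parse()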


Reposted from www.cnblogs.com/WhiteCoder/p/10520518.html