Scraping Lianjia Nanjing housing data with Python

Spider code

# -*- coding: utf-8 -*-

import scrapy
from lxml import etree
import time
import random
import requests
from ..items import LianjiananjingItem

class LianjiananjingSpider(scrapy.Spider):
    name = 'lianjiananjing'
    allowed_domains = ['nj.lianjia.com']
    start_urls = 'https://nj.lianjia.com/ershoufang/'  # a single URL string, consumed by start_requests below

    def start_requests(self):
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:60.0) Gecko/20100101 Firefox/60.0'
        headers = {'User-Agent': user_agent}
        yield scrapy.Request(url=self.start_urls, headers=headers, method='GET', callback=self.parse)

    def parse(self, response):
        print(response)
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:60.0) Gecko/20100101 Firefox/60.0'
        headers = {'User-Agent': user_agent}
        lists = response.body.decode('utf-8')
        selector = etree.HTML(lists)
        # When scraping, first inspect the page to locate the HTML nodes you need:
        # read the document in, parse it into a tree, then address each node by its path.
        area_list = selector.xpath('/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div/a')
        # the tree returned by etree.HTML can be queried with xpath directly
        for area in area_list:
            try:
                area_hanzi = area.xpath('text()').pop()
                print(area_hanzi)
                # hrefs look like '/ershoufang/<district-pinyin>/'
                area_pinyin = area.xpath('@href').pop().split('/')[2]
                area_url = 'http://nj.lianjia.com/ershoufang/{}/'.format(area_pinyin)
                print(area_url)
                yield scrapy.Request(url=area_url, headers=headers, callback=self.detail_url,
                                     meta={"id1": area_hanzi, "id2": area_pinyin})
            except Exception:
                print('44444444444444')  # crude debug marker from the original post

    def detail_url(self, response):
        for i in range(1, 40):  # pages 1 through 39
            url = 'http://nj.lianjia.com/ershoufang/{}/pg{}/'.format(response.meta["id2"], str(i))
            time.sleep(random.randint(1, 5))  # wait a random 1-5 seconds between pages
            try:
                # note: this bypasses Scrapy's downloader and sends no custom headers
                contents = requests.get(url)
                contents = etree.HTML(contents.content.decode('utf-8'))
                houselist = contents.xpath('/html/body/div[4]/div[1]/ul/li')
                for house in houselist:
                    try:
                        item = LianjiananjingItem()
                        item['page'] = i
                        item['title'] = house.xpath('div[1]/div[1]/a/text()').pop()
                        item['community'] = house.xpath('div[1]/div[2]/div/a/text()').pop()
                        item['model'] = house.xpath('div[1]/div[2]/div/text()').pop().split('|')[1]
                        item['area'] = house.xpath('div[1]/div[2]/div/text()').pop().split('|')[2]
                        item['focus_num'] = house.xpath('div[1]/div[4]/text()').pop().split('/')[0]
                        item['watch_num'] = house.xpath('div[1]/div[4]/text()').pop().split('/')[1]
                        item['time'] = house.xpath('div[1]/div[4]/text()').pop().split('/')[2]
                        item['price'] = house.xpath('div[1]/div[6]/div[1]/span/text()').pop()
                        item['average_price'] = house.xpath('div[1]/div[6]/div[2]/span/text()').pop()
                        item['link'] = house.xpath('div[1]/div[1]/a/@href').pop()
                        item['city'] = response.meta["id1"]
                        self.url_detail = house.xpath('div[1]/div[1]/a/@href').pop()
                        print(item)
                        yield item
                    except Exception:
                        print('2222222')  # crude debug marker from the original post
            except Exception:
                print('2222222')
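
The comments in parse() above describe the lxml workflow: etree.HTML parses the raw markup into an element tree, and xpath expressions locate nodes in it. Here is a minimal standalone sketch of that same pattern; the HTML fragment is made up for illustration and much simpler than the real Lianjia markup.

from lxml import etree

# Hypothetical fragment standing in for Lianjia's district filter bar.
html = '''
<div><dl><dd><div><div>
    <a href="/ershoufang/gulou/">鼓楼</a>
    <a href="/ershoufang/jianye/">建邺</a>
</div></div></dd></dl></div>
'''

selector = etree.HTML(html)          # parse the string into an element tree
for a in selector.xpath('//a'):      # locate every anchor node
    name = a.xpath('text()').pop()           # anchor text, e.g. '鼓楼'
    pinyin = a.xpath('@href').pop().split('/')[2]  # e.g. 'gulou'
    print(name, 'https://nj.lianjia.com/ershoufang/{}/'.format(pinyin))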

In settings.py, remember to set ROBOTSTXT_OBEY = False, because otherwise the https requests are blocked by the site's robots.txt rules, and to enable the pipeline:

ITEM_PIPELINES = {
    'Lianjiananjing.pipelines.LianjiananjingPipeline': 300,
}
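
For context, a minimal settings.py sketch with those entries in place. The USER_AGENT and DOWNLOAD_DELAY lines are assumed additions, not from the original post; they mirror what the spider already does by hand with headers and time.sleep().

# settings.py -- sketch, only the entries relevant here
BOT_NAME = 'Lianjiananjing'  # project name inferred from the pipeline path above

ROBOTSTXT_OBEY = False  # don't let robots.txt block the requests

ITEM_PIPELINES = {
    'Lianjiananjing.pipelines.LianjiananjingPipeline': 300,
}

# Assumed additions, not in the original post:
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:60.0) Gecko/20100101 Firefox/60.0'
DOWNLOAD_DELAY = 2  # seconds between requests, analogous to the time.sleep() in detail_url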

Item code

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class LianjiananjingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = scrapy.Field()
    title = scrapy.Field()
    community = scrapy.Field()
    model = scrapy.Field()
    area = scrapy.Field()
    focus_num = scrapy.Field()
    watch_num = scrapy.Field()
    time = scrapy.Field()
    price = scrapy.Field()
    average_price = scrapy.Field()
    link = scrapy.Field()
    city = scrapy.Field()

Pipeline code

import csv

class LianjiananjingPipeline(object):
    def process_item(self, item, spider):
        # append one CSV row per item (the file is reopened on every item)
        f = open('V:/Python/scapyProject/Lianjiananjing/lianjiananjing.csv', 'a+')
        write = csv.writer(f)
        write.writerow((item['title'], item['community'], item['model'], item['area'],
                        item['focus_num'], item['watch_num'], item['time'], item['price'],
                        item['average_price'], item['link'], item['city'], item['page']))
        f.close()
        return item
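
Reopening the CSV file for every item works, but it is wasteful and fragile. Here is a sketch of a common alternative (my rewrite, not from the original post) that uses the pipeline's open_spider/close_spider hooks so the file is opened once per crawl; the relative path is illustrative, substitute the absolute path above if preferred.

import csv

class LianjiananjingPipeline(object):
    def open_spider(self, spider):
        # open once per crawl; newline='' avoids blank rows on Windows
        self.f = open('lianjiananjing.csv', 'a+', newline='', encoding='utf-8')
        self.writer = csv.writer(self.f)

    def process_item(self, item, spider):
        self.writer.writerow((item['title'], item['community'], item['model'],
                              item['area'], item['focus_num'], item['watch_num'],
                              item['time'], item['price'], item['average_price'],
                              item['link'], item['city'], item['page']))
        return item

    def close_spider(self, spider):
        self.f.close()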

main.py, used for debugging and the like

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from scrapy.cmdline import execute
import os
import sys

# add the project's absolute path to sys.path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# run Scrapy's built-in execute() with the crawl command so the spider can be
# debugged; the last argument, 'lianjiananjing', is the spider's name
execute(['scrapy', 'crawl', 'lianjiananjing'])
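
With this file at the project root, running python main.py is equivalent to typing scrapy crawl lianjiananjing on the command line, except that you can now set breakpoints and step through the spider in an IDE.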

Reposted from blog.csdn.net/qq_22994783/article/details/81871990