I recently needed to grab a batch of data with Scrapy, so I used Lianjia as the test case.
Environment setup
pip install scrapy
Basic commands
Create a project:
scrapy startproject myproject
Run a spider inside a project:
scrapy crawl myspider
How do you run a Scrapy project from PyCharm?
Create a file with the following code:
# run.py
from scrapy import cmdline
cmdline.execute('scrapy crawl myspider'.split())  # put your own spider's name here
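Running run.py from PyCharm executes the same code path as typing the scrapy crawl command in a terminal at the project root, so breakpoints set inside the spider are hit as usual.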
Create a spider (you can also add the file by hand):
- scrapy genspider myspider baidu.com
- This generates a spider named myspider.
List all spiders in the current project, one per line:
- scrapy list
- spider1
- spider2
Run a spider defined in a single Python file, without creating a project:
- scrapy runspider myspider.py
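A file passed to runspider just needs a complete spider class. A minimal sketch (quotes.toscrape.com is only a stand-in target):

# myspider.py -- minimal standalone spider for `scrapy runspider myspider.py`
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # yield one dict per quote found on the page
        for text in response.xpath('//span[@class="text"]/text()').extract():
            yield {'quote': text}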
A quick XPath demo
For example, given the following HTML:
...
<div class="demo-class"> this is the div text </div>
<div class="demo2-class"><a href="www.china.com">this is the a text</a> this is the div text</div>
response.xpath('//div[@class="demo-class"]/text()')
# outputs "this is the div text"
response.xpath('//div[@class="demo2-class"]/a/@href')
# outputs www.china.com
temp = response.xpath('//div[@class="demo2-class"]')
ou = temp.xpath('a/@href')
# This is a step-by-step (relative) lookup. Note that "a/@href" has no leading "//": "//" searches from the document root, while omitting it searches relative to the current node, so this also outputs www.china.com.
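These snippets can be tried outside a spider with Scrapy's standalone Selector. A minimal sketch of the demo above:

from scrapy.selector import Selector

html = '''
<div class="demo-class"> this is the div text </div>
<div class="demo2-class"><a href="www.china.com">this is the a text</a> this is the div text</div>
'''
sel = Selector(text=html)
print(sel.xpath('//div[@class="demo-class"]/text()').extract())    # [' this is the div text ']
print(sel.xpath('//div[@class="demo2-class"]/a/@href').extract())  # ['www.china.com']
temp = sel.xpath('//div[@class="demo2-class"]')
print(temp.xpath('a/@href').extract())                             # relative lookup, same ['www.china.com']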
Scraping Lianjia data (Beijing as the example)
- Get all the districts (e.g. Dongcheng, Xicheng, Haidian, ...)
# Fetch all districts
import scrapy

class LjregionSpider(scrapy.Spider):
    name = 'ljRegion'
    allowed_domains = ['bj.lianjia.com']
    start_urls = ['https://bj.lianjia.com/ershoufang']

    def parse(self, response):
        # every district link lives under the div with data-role="ershoufang"
        regions = response.xpath('//div[@data-role="ershoufang"]/div/a')
        for r in regions:
            href = r.xpath('@href').extract()[0]
            name = r.xpath('text()').extract()[0]
            # djLjRegion appears to be a Django-style model defined elsewhere
            # in this project; each district is persisted straight to the database
            dj_ljRegion = djLjRegion(href=href, name=name)
            dj_ljRegion.save()
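Run this one with scrapy crawl ljRegion. One caveat that applies to both spiders here: extract()[0] raises an IndexError when an XPath matches nothing, while Scrapy's extract_first() returns None instead, which is the safer choice if the page layout ever changes:

name = r.xpath('text()').extract_first()  # None instead of IndexError on no match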
- Get the summary info for each listing
# -*- coding: utf-8 -*-
import scrapy
from lianjia.items import *
import json
import codecs

class ershoufang(scrapy.Spider):
    name = "ershoufang"
    allowed_domains = ['bj.lianjia.com']
    start_urls = ['https://bj.lianjia.com/ershoufang/dongcheng/']
    # log file recording every follow-up URL the spider schedules
    file = codecs.open("scrapyUrl.txt", "w", encoding="utf-8")

    def parse(self, response):
        try:
            # one <li> per listing in the left-hand result column
            houseDetailClear = response.xpath('//div[@class="content "]/div[@class="leftContent"]/ul/li')
            for item in houseDetailClear:
                ljItem = LianjiaItem()
                ljItem['houseCode'] = item.xpath('a[@class="img "]/@data-housecode').extract()[0]
                ljItem['href'] = item.xpath('a[@class="img "]/@href').extract()[0]
                ljItem['title'] = item.xpath('div[@class="info clear"]/div[@class="title"]/a/text()').extract()[0]
                ljItem['houseInfoRegion'] = item.xpath('div[@class="info clear"]/div[@class="address"]/div/a/text()').extract()[0]
                ljItem['houseInfo'] = ljItem['houseInfoRegion'] + item.xpath('div[@class="info clear"]/div[@class="address"]/div/text()').extract()[0]
                ljItem['houseInfoRegionHref'] = item.xpath('div[@class="info clear"]/div[@class="address"]/div/a/@href').extract()[0]
                ljItem['positionInfo'] = item.xpath('div[@class="info clear"]/div[@class="flood"]/div/text()').extract()[0]
                ljItem['positionInfoRegion'] = item.xpath('div[@class="info clear"]/div[@class="flood"]/div/a/text()').extract()[0]
                ljItem['followInfo'] = item.xpath('div[@class="info clear"]/div[@class="followInfo"]/text()').extract()[0]
                # the three tags are optional, so fall back to '' when absent
                tagSubway = item.xpath('div[@class="info clear"]/div[@class="tag"]/span/text()').extract()
                ljItem['tagSubway'] = tagSubway[0] if tagSubway else ''
                tagTaxfree = item.xpath('div[@class="info clear"]/div[@class="tag"]/span[@class="taxfree"]/text()').extract()
                ljItem['tagTaxfree'] = tagTaxfree[0] if tagTaxfree else ''
                tagHaskey = item.xpath('div[@class="info clear"]/div[@class="tag"]/span[@class="haskey"]/text()').extract()
                ljItem['tagHaskey'] = tagHaskey[0] if tagHaskey else ''
                ljItem['totalPrice'] = item.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()').extract()[0]
                ljItem['unitPrice'] = item.xpath('div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/@data-price').extract()[0]
                yield ljItem

            # pagination: the page carries its own paging state as JSON
            # in @page-data, e.g. {"totalPage":100,"curPage":1}
            ershoufangRegions = response.xpath('//div[@data-role="ershoufang"]/div/a/@href').extract()
            selectRegion = response.xpath('//div[@data-role="ershoufang"]/div/a[@class="selected"]')
            selectRegion = selectRegion.xpath('@href').extract()[0]
            resPageInfo = response.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0].extract()
            pgInfo = json.loads(resPageInfo)
            totalPage = pgInfo['totalPage']
            curPage = pgInfo['curPage']
            if curPage < totalPage:
                # still inside the current district: go to the next result page
                next_href = 'https://bj.lianjia.com%spg%d/' % (selectRegion, curPage + 1)
                self.file.write('\n' + next_href + '\n')
                # urljoin builds the absolute URL for the next page
                next_page = response.urljoin(next_href)
                # feed the next page back into parse
                yield scrapy.Request(next_page, callback=self.parse)
            else:
                # current district exhausted: move on to the next one
                regionIndex = ershoufangRegions.index(selectRegion)
                if regionIndex < len(ershoufangRegions) - 1:
                    selectRegion = ershoufangRegions[regionIndex + 1]
                    # selectRegion already starts with '/', so don't add another slash
                    next_href = 'https://bj.lianjia.com%s' % selectRegion
                    self.file.write("\n====================================\n")
                    self.file.write('\n')
                    self.file.write(next_href + '\n')
                    self.file.write('\n')
                    self.file.write("====================================\n")
                    next_page = response.urljoin(next_href)
                    # feed the next district back into parse
                    yield scrapy.Request(next_page, callback=self.parse)
                else:
                    return
        except Exception as e:
            print(str(e))
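The post never shows items.py, so here is a sketch of LianjiaItem reconstructed from the fields the spider assigns, which is enough for the code above to run:

# items.py -- field list read straight off the keys assigned in parse()
import scrapy

class LianjiaItem(scrapy.Item):
    houseCode = scrapy.Field()
    href = scrapy.Field()
    title = scrapy.Field()
    houseInfo = scrapy.Field()
    houseInfoRegion = scrapy.Field()
    houseInfoRegionHref = scrapy.Field()
    positionInfo = scrapy.Field()
    positionInfoRegion = scrapy.Field()
    followInfo = scrapy.Field()
    tagSubway = scrapy.Field()
    tagTaxfree = scrapy.Field()
    tagHaskey = scrapy.Field()
    totalPrice = scrapy.Field()
    unitPrice = scrapy.Field()

Start the crawl with scrapy crawl ershoufang, or append -o ershoufang.json to dump the collected items to a file.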