# -*- coding: utf-8 -*-
"""Scrapy spider for Tencent HR job postings: listing pages plus detail pages."""
import scrapy
# BUG FIX: ``urljoin`` lives in ``urllib.parse``; the original imported
# ``urllib.request`` and called ``request.urljoin``, which does not exist
# and raised AttributeError at runtime.
from urllib.parse import urljoin

from Py06_2018_3_16.items import TencentItem


class TencentSpider(scrapy.Spider):
    """Crawl https://hr.tencent.com job listings and follow each detail link."""

    name = 'tencent'
    allowed_domains = ['hr.tencent.com']

    # The listing is paginated 10 jobs per page via the ``start`` query param.
    base_url = 'https://hr.tencent.com/position.php?&start=%s#a'
    start_urls = []
    for i in range(0, 1):
        # BUG FIX: the original ``base_url % (i - 1) * 10`` parsed as
        # ``(base_url % (i - 1)) * 10`` — string repetition, producing the
        # whole URL concatenated 10 times with a -10 offset.  The page
        # offset (i * 10) is what belongs inside the template.
        start_urls.append(base_url % (i * 10))

    def parse(self, response):
        """Parse one listing page; yield a detail-page request per job row.

        Each request carries the partially-filled :class:`TencentItem` in
        ``meta['data']`` so ``parse_detail`` can complete it.
        """
        # Job rows alternate between class="even" and class="odd"; merge both.
        job_even = response.xpath('//tr[@class="even"]')
        job_odd = response.xpath('//tr[@class="odd"]')
        for job in job_even + job_odd:
            item = TencentItem()
            # Publication date.
            item['date'] = job.xpath('.//td[5]/text()').extract()[0]
            # Work location.
            item['location'] = job.xpath('.//td[4]/text()').extract()[0]
            # Number of openings.
            item['num'] = job.xpath('.//td[3]/text()').extract()[0]
            # Job category (local renamed from ``type`` to avoid shadowing
            # the builtin; the item key is unchanged).
            item['type'] = job.xpath('.//td[2]/text()').extract()[0]
            # Job title.
            item['name'] = job.xpath('.//td[1]/a/text()').extract()[0]
            # Detail link is relative — resolve it against the page URL.
            href = job.xpath('.//td[1]/a/@href').extract()[0]
            url = urljoin(response.url, href)
            item['url'] = url
            # Second-level crawl: fetch the detail page for this job.
            yield scrapy.Request(url=url, callback=self.parse_detail,
                                 meta={'data': item})

    def parse_detail(self, response):
        """Parse a job detail page; fill in duty/requirements and yield."""
        item = response.meta['data']
        # Job duties: bullet list inside the first <tr class="c"> block.
        duty = response.xpath('//tr[@class="c"][1]//li/text()').extract()
        item['duty'] = ''.join(duty)
        # Job requirements: bullet list inside the second <tr class="c"> block.
        rq = response.xpath('//tr[@class="c"][2]//li/text()').extract()
        item['rq'] = ''.join(rq)
        yield item
Python3~scrapy项目之爬取当前页和详细页
猜你喜欢
转载自blog.csdn.net/zbrj12345/article/details/80511456
今日推荐
周排行