Crawling Tencent recruitment postings with Python 3 and Scrapy

Installing Scrapy is not covered here.

In the console, run scrapy startproject tencent to create a crawler project named tencent.

Then cd tencent.

Open the tencent project in PyCharm.
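
After startproject runs, the generated layout should look roughly like this (shown only for orientation; newer Scrapy versions may also add a middlewares.py):

tencent/
    scrapy.cfg
    tencent/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py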

Build the item file (items.py):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    positionname = scrapy.Field()
    # detail page link
    positionLink = scrapy.Field()
    # job category
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish date
    publishTime = scrapy.Field()
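
As a quick sanity check (not part of the project, just an illustration in a Python shell), an item defined this way can be filled and read like a dict, which is what the pipeline relies on later when it calls dict(item); the value below is made up for the example:

from tencent.items import TencentItem

item = TencentItem()
item["positionname"] = "测试职位"        # hypothetical value, for illustration only
print(dict(item))                        # {'positionname': '测试职位'}
print(item.get("peopleNum", "空"))       # unset fields can be read with a default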

Next, create a new file tencentPostition.py in the spiders folder. The code is below; the comments explain each step.

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class TencentpostitionSpider(scrapy.Spider):
    # spider name
    name = 'tencent'
    # allowed domain
    allowed_domains = ['tencent.com']
    # base URL
    url = 'http://hr.tencent.com/position.php?&start='
    # page offset
    offset = 0
    # initial URL
    start_urls = [url + str(offset)]

    def parse(self, response):
        # XPath rule: job rows alternate between the "even" and "odd" classes
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # job title
            item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail page link
            item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # job category (sometimes missing, so guard against an empty cell)
            try:
                item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                item["positionType"] = '空'
            # number of openings
            item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
            # publish date
            item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
            # hand the item off to the pipeline
            yield item
        # advance the page offset
        if self.offset < 2620:
            self.offset += 10
        # hand the next request back to the scheduler
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
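
If the XPath rules ever need debugging, running scrapy shell "http://hr.tencent.com/position.php?&start=0" in the console and poking at the response is a convenient way to test them before launching the full spider (a sketch only; the page layout may have changed since this post was written):

# inside the scrapy shell, `response` is already bound to the downloaded page
rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
print(len(rows))                                     # number of job rows on this page
print(rows[0].xpath("./td[1]/a/text()").extract())   # first job title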

Next, configure the pipeline file pipelines.py as follows:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPipeline(object):
    def __init__(self):
        # open the output file when the pipeline is created
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # convert the item to a dict, then serialize it to JSON
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # write to the file, encoded as UTF-8
        self.fileName.write(text.encode("utf-8"))
        # return the item for any later pipelines
        return item

    def close_spider(self, spider):
        # close the file when the spider closes
        self.fileName.close()

Next, configure the settings.py file.

Do not obey the robots.txt rules:

ROBOTSTXT_OBEY = False

  

# download delay
DOWNLOAD_DELAY = 3

  

# default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

 

# which pipeline handles the items: package.module.ClassName
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}

Next, run the following in the console:

scrapy crawl tencent

and the crawl will start.
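
Once the crawl finishes, the pipeline has written one JSON object per line to tencent.json in the project root. A minimal sketch for reading the results back (assuming the file exists and uses the field names from items.py):

import json

with open("tencent.json", encoding="utf-8") as f:
    for line in f:
        job = json.loads(line)
        print(job["positionname"], job["workLocation"], job["publishTime"])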

Source code:

https://github.com/ingxx/scrapy_to_tencent 


Reposted from www.cnblogs.com/zxtceq/p/8984808.html