安装scrapy不再赘述,
在控制台中输入scrapy startproject tencent 创建爬虫项目名字为 tencent
接着cd tencent
用pycharm打开tencent项目
构建item文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TencentItem(scrapy.Item):
    """Item holding one Tencent job posting scraped from hr.tencent.com."""
    # job title
    positionname = scrapy.Field()
    # link to the job detail page
    positionLink = scrapy.Field()
    # job category
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publication date
    publishTime = scrapy.Field()
|
接着在spiders文件夹中新建tencentPostition.py文件,代码如下,注释写得很清楚
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# -*- coding: utf-8 -*-
import scrapy

from tencent.items import TencentItem


class TencentpostitionSpider(scrapy.Spider):
    """Crawl Tencent's HR job-listing pages and yield one TencentItem per row."""
    # spider name, used by `scrapy crawl tencent`
    name = 'tencent'
    # restrict crawling to this domain
    allowed_domains = ['tencent.com']
    # listing URL; the `start` query parameter selects the row offset
    url = 'http://hr.tencent.com/position.php?&start='
    # current row offset (the site shows 10 rows per page)
    offset = 0
    # first page to fetch
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Extract every job row from a listing page, then queue the next page.

        Yields populated TencentItem objects, followed by a Request for the
        next offset until the offset reaches 2620.
        """
        # listing rows alternate between the 'even' and 'odd' CSS classes
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # job title
            item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
            # link to the job detail page
            item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # job category; some rows leave this cell empty, so the xpath
            # result list can be empty -> catch only IndexError, not a bare
            # except that would hide genuine bugs
            try:
                item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                item["positionType"] = '空'
            # number of openings
            item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
            # publication date
            item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
            # hand the populated item to the pipeline
            yield item
        # queue the next listing page until the last known offset
        if (self.offset < 2620):
            self.offset += 10
            # hand the request back to the scheduler; reuse this callback
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
|
接着配置管道文件pipelines.py代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPipeline(object):
    """Serialize every scraped item as one JSON line in tencent.json."""

    def __init__(self):
        # open the output file once, when the pipeline is created
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        """Append the item to the output file as a UTF-8 JSON line; return it."""
        # dict() first so json can serialize the scrapy Item
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # the file is opened in binary mode, so encode explicitly as UTF-8
        self.fileName.write(line.encode("utf-8"))
        # return the item so any later pipeline still receives it
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fileName.close()
|
接下来需要配置settings.py文件
不遵循ROBOTS规则
1
|
# do not honor robots.txt for this crawl
ROBOTSTXT_OBEY = False
|
1
2
|
# delay (seconds) between requests, to avoid hammering the server
DOWNLOAD_DELAY = 3
|
1
2
3
4
5
|
# default request headers: present a real browser User-Agent so the site
# serves the normal HTML listing
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
|
1
2
3
4
|
# route items to this pipeline: '<package>.<module>.<class>' -> priority
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
|
接下来在控制台中输入
scrapy crawl tencent
即可爬取
源码地址
https://github.com/ingxx/scrapy_to_tencent