Scraping 51job data with Scrapy and writing it asynchronously

# Looking at 51job's page source, the listings are server-rendered HTML,
# so XPath alone is enough to extract every field we need.
import scrapy
from scrapy import Request
from datetime import datetime

from ..items import MyItem  # adjust this import path to your project layout

# Create the spider
class ZhaopinSpiderSpider(scrapy.Spider):
    name = 'zhaopin_spider'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/040000%252C010000%252C020000%252C030200%252C170200,000000,0000,00,9,99,Python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        href = response.xpath('//p[@class="t1 "]/span/a/@href').extract()
        for url in href:
            yield Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        resource = '51job'
        url = response.xpath('//p[@class="cname"]/a/@href').extract_first('')
        try:
            name = response.xpath('//h1/text()').extract_first().strip()
        except Exception:
            name = None
        salary = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
        # Salaries come in several formats, so branch on whether a range is given
        if salary and '-' in salary:
            salary_from, salary_to = salary.strip('/月').split('-')[:2]
        else:
            salary_from = 0
            salary_to = salary
        # All of the job's meta fields live in the same <p class="msg ltype"> node,
        # so extract the list once and index into it
        msg = response.xpath('//p[@class="msg ltype"]/text()').extract()
        years_of_work = msg[1].strip()
        if '-' in years_of_work:
            years_of_work_from, years_of_work_to = years_of_work.split('-')[:2]
        else:
            years_of_work_from = 0
            years_of_work_to = years_of_work
        work_place = msg[0].strip()
        degree = msg[2].strip()
        try:
            release_ = msg[4].strip()
        except Exception:
            release_ = '无'
        member = msg[3].strip()
        description = response.xpath('//div[@class="bmsg job_msg inbox"]/p/text()').extract()
        # extract() returns a list, so join it into a single string
        description = ' '.join(description)
        temptation = response.xpath('//span[@class="sp4"]/text()').extract()
        temptation = ' '.join(temptation) if temptation else '无'
        try:
            belong_id = response.xpath('//div[@class="bmsg inbox"]/a/@onclick')[0].extract().split('=')[1].split(',')[0]
        except Exception:
            belong_id = '无'

        # These flags are not on the page; the defaults below are assumptions --
        # adapt them to your own schema
        is_annual_salary = 1 if salary and '年' in salary else 0
        is_negotiable = 1 if (not salary or '面议' in salary) else 0
        is_alive = 1
        is_add = 0
        create_time = modified_time = datetime.now()

        item = MyItem()
        item['resource'] = resource
        item['url'] = url
        item['name'] = name
        item['salary_from'] = salary_from
        item['salary_to'] = salary_to
        item['is_annual_salary'] = is_annual_salary
        item['is_negotiable'] = is_negotiable
        item['years_of_work_from'] = years_of_work_from
        item['years_of_work_to'] = years_of_work_to
        item['work_place'] = work_place
        item['degree'] = degree
        item['release'] = release_
        item['member'] = member
        item['temptation'] = temptation
        item['description'] = description
        item['is_alive'] = is_alive
        item['create_time'] = create_time
        item['modified_time'] = modified_time
        item['is_add'] = is_add
        item['belong_id'] = belong_id

        # Only follow the company page when a link exists; Request() raises on
        # an empty URL
        if url:
            yield Request(url=url,
                          meta={'item': item},
                          callback=self.parse_three)
        else:
            # No company detail page, so emit the item as-is
            yield item

    def parse_three(self, response):
        firm_introduction = response.xpath('//div[@class="con_txt"]/text()')[0].extract()
        firm_name = response.xpath('//h1/text()')[0].extract().strip()
        firm_scale = response.xpath('//p/text()')[15].extract().strip()
        if '-' in firm_scale:
            firm_scale_from, firm_scale_to = firm_scale.split('-')[:2]
        else:
            firm_scale_from = 0
            firm_scale_to = firm_scale

        ltype = response.xpath('//p[@class="ltype"]/text()').extract()
        firm_nature = ltype[0].strip()
        firm_industry = ltype[2].strip()
        firm_website = response.xpath('//span[@class="icon_det"]/text()').extract_first()
        firm_location = response.xpath('//p[@class="fp"]/text()').extract()[1].strip().split('(')[0]

        # Longitude/latitude are not on the page; store zeros as placeholders
        firm_lon = 0
        firm_lat = 0
        item = response.meta.get("item")
        item['firm_place'] = item["work_place"]
        item['firm_introduction'] = firm_introduction
        item['firm_name'] = firm_name
        item['firm_scale_from'] = firm_scale_from
        item['firm_scale_to'] = firm_scale_to
        item['firm_nature'] = firm_nature
        item['firm_industry'] = firm_industry
        item['firm_website'] = firm_website
        item['firm_location'] = firm_location
        item['firm_lon'] = firm_lon
        item['firm_lat'] = firm_lat

        yield item
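
The spider assumes an item class named MyItem, which the original post does not show. A minimal sketch with one scrapy.Field per key the spider sets (field names taken from the assignments above):

import scrapy

class MyItem(scrapy.Item):
    # Job fields
    resource = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    salary_from = scrapy.Field()
    salary_to = scrapy.Field()
    is_annual_salary = scrapy.Field()
    is_negotiable = scrapy.Field()
    years_of_work_from = scrapy.Field()
    years_of_work_to = scrapy.Field()
    work_place = scrapy.Field()
    degree = scrapy.Field()
    release = scrapy.Field()
    member = scrapy.Field()
    temptation = scrapy.Field()
    description = scrapy.Field()
    is_alive = scrapy.Field()
    create_time = scrapy.Field()
    modified_time = scrapy.Field()
    is_add = scrapy.Field()
    belong_id = scrapy.Field()
    # Company fields
    firm_place = scrapy.Field()
    firm_introduction = scrapy.Field()
    firm_name = scrapy.Field()
    firm_scale_from = scrapy.Field()
    firm_scale_to = scrapy.Field()
    firm_nature = scrapy.Field()
    firm_industry = scrapy.Field()
    firm_website = scrapy.Field()
    firm_location = scrapy.Field()
    firm_lon = scrapy.Field()
    firm_lat = scrapy.Field()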

That is the whole spider. The rest happens in the pipeline: define the database connection settings, create the database, and write the scraped items into it.
from twisted.enterprise import adbapi
from pymysql import cursors


class TwistedMysqlPipeline(object):
    @classmethod
    def from_settings(cls, settings):
        db_params = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PW'],
            db=settings['MYSQL_DB'],
            port=3306,
            use_unicode=True,
            charset=settings['MYSQL_CHARSET'],
            cursorclass=cursors.DictCursor
        )
        db_pool = adbapi.ConnectionPool('pymysql', **db_params)
        return cls(db_pool)

    def __init__(self, db_pool):
        self.db_pool = db_pool

    def process_item(self, item, spider):
        # runInteraction runs insert_item on a pooled connection in a worker
        # thread, so MySQL I/O never blocks the Twisted reactor
        query = self.db_pool.runInteraction(self.insert_item, item)
        query.addErrback(self.handle_error, item, spider)
        return item
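
The post never shows the two methods that process_item wires up. Here is a minimal sketch of both, assuming a target table named job_info with one column per item field (the table name is an assumption):

    def insert_item(self, cursor, item):
        # Build the INSERT dynamically from the item's own keys; 'job_info'
        # is an assumed table name -- change it to match your schema
        keys = list(item.keys())
        sql = 'INSERT INTO job_info ({}) VALUES ({})'.format(
            ', '.join(keys), ', '.join(['%s'] * len(keys)))
        cursor.execute(sql, [item[k] for k in keys])

    def handle_error(self, failure, item, spider):
        # Log failed inserts instead of dropping them silently
        spider.logger.error('MySQL insert failed: %s', failure)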

All that remains is creating the table and inserting the rows into it.
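
For completeness, these are the settings keys that from_settings reads, plus the pipeline registration. Every value below is a placeholder, and the job_info table needs one column per item field:

# settings.py -- values are placeholders, substitute your own
MYSQL_HOST = 'localhost'
MYSQL_USER = 'root'
MYSQL_PW = 'your_password'
MYSQL_DB = 'jobs'
MYSQL_CHARSET = 'utf8mb4'

ITEM_PIPELINES = {
    # assumed project/module path -- point this at the pipeline class
    'myproject.pipelines.TwistedMysqlPipeline': 300,
}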

Reposted from blog.csdn.net/wu_tian_hao/article/details/82784294