Crawling Python job postings in Shenzhen from the Zhaopin website with the Scrapy framework.

Fields to crawl: company name, job title, company details link, salary, and required years of work experience.

1. Define the fields to crawl in items.py

import scrapy


class ZhilianzhaopinItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    company_name = scrapy.Field()
    jobName = scrapy.Field()
    company_url = scrapy.Field()
    salary = scrapy.Field()
    workingExp = scrapy.Field()

2. The main spider code

# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urlencode
import json
from zhilianzhaopin.items import ZhilianzhaopinItem


class ZlzpSpider(scrapy.Spider):
    name = 'zlzp'
    # allowed_domains = ['www.zhaopin.com']
    start_urls = ['https://fe-api.zhaopin.com/c/i/sou?']
    data = {
        'start': '0',       # offset of the first result
        'pageSize': '90',   # results per page
        'cityId': '765',    # city id (Shenzhen)
        'kw': 'python',     # search keyword
        'kt': '3'           # keyword search type
    }
    def start_requests(self):
        url = self.start_urls[0] + urlencode(self.data)
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        resp = json.loads(response.text)
        total = int(resp['data']['count'])
        for res in resp['data']['results']:
            item = ZhilianzhaopinItem()
            item['company_name'] = res['company']['name']
            item['jobName'] = res['jobName']
            item['company_url'] = res['company']['url']
            item['salary'] = res['salary']
            item['workingExp'] = res['workingExp']['name']
            yield item

        # Page through the remaining results, 90 per page. This loop runs on
        # every response, but Scrapy's built-in duplicate filter drops the
        # repeated requests it generates.
        for start in range(90, total, 90):
            self.data['start'] = str(start)
            url_i = self.start_urls[0] + urlencode(self.data)
            yield scrapy.Request(url=url_i, callback=self.parse)

3. In settings.py, set the request User-Agent and enable the item pipeline

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'

ITEM_PIPELINES = {
   'zhilianzhaopin.pipelines.ZhilianzhaopinPipeline': 300,
}
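
Note that projects generated by scrapy startproject obey robots.txt by default; if the API requests are being filtered out, you may also need to disable that in settings.py (an assumption, depending on your project template):

ROBOTSTXT_OBEY = False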

4. Create the database and table
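
The original post does not show the SQL for this step. Below is a minimal sketch using pymysql, with the column layout inferred from the pipeline's INSERT statement in step 5 (an auto-increment id followed by five string columns); the column names and sizes are my assumptions:

import pymysql

# Sketch only: table/column names and sizes are assumptions inferred from
# the pipeline's INSERT INTO zlzp VALUES (NULL, ...) statement below.
conn = pymysql.connect(host='172.16.25.4', user='root', password='root')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy DEFAULT CHARACTER SET utf8mb4')
cursor.execute('USE scrapy')
cursor.execute('''
    CREATE TABLE IF NOT EXISTS zlzp (
        id INT AUTO_INCREMENT PRIMARY KEY,
        company_name VARCHAR(255),
        jobName VARCHAR(255),
        company_url VARCHAR(255),
        salary VARCHAR(64),
        workingExp VARCHAR(64)
    )
''')
conn.commit()
cursor.close()
conn.close()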

5. Write items to the database in pipelines.py

import pymysql


# Write each crawled item into MySQL
class ZhilianzhaopinPipeline(object):
    conn = None
    mycursor = None

    def open_spider(self, spider):
        print('Connecting to the database...')
        self.conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
        self.mycursor = self.conn.cursor()

    def process_item(self, item, spider):
        print('Writing to the database...')
        # Parameterized query: the driver escapes the values, which avoids
        # SQL injection and the quoting bugs of plain string formatting
        sql = 'INSERT INTO zlzp VALUES (NULL, %s, %s, %s, %s, %s)'
        self.mycursor.execute(sql, (item['company_name'], item['jobName'],
                                    item['company_url'], item['salary'],
                                    item['workingExp']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        print('Finished writing to the database...')
        self.mycursor.close()
        self.conn.close()
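
With the items, spider, settings, and pipeline in place, run the spider from the project root:

scrapy crawl zlzp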

6. Check that the write succeeded
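
Any MySQL client works for this. A quick sketch with pymysql, reusing the pipeline's connection parameters, that prints the row count and a few sample rows:

import pymysql

# Count the rows the pipeline wrote and show a small sample
conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
cursor = conn.cursor()
cursor.execute('SELECT COUNT(*) FROM zlzp')
print('rows written:', cursor.fetchone()[0])
cursor.execute('SELECT * FROM zlzp LIMIT 3')
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()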

Done.
