# 51job renders all the job data directly in the page HTML, so plain XPath
# extraction is enough — no JSON API or JS rendering needed.
class ZhaopinSpiderSpider(scrapy.Spider):
    """Crawl Python job postings on 51job.

    Flow: search-result list page -> job detail page (``parse_detail``)
    -> company detail page (``parse_three``), accumulating one ``MyItem``
    per posting via ``response.meta``.
    """

    name = 'zhaopin_spider'
    allowed_domains = ['51job.com']
    # NOTE(review): the original URL contained the mojibake "°reefrom" —
    # the "&deg" prefix of "&degreefrom" had been rendered as the HTML
    # entity "°". Restored to "&degreefrom" here.
    start_urls = ['https://search.51job.com/list/040000%252C010000%252C020000%252C030200%252C170200,000000,0000,00,9,99,Python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        """Follow every job link on a search-result page."""
        for url in response.xpath('//p[@class="t1 "]/span/a/@href').extract():
            yield Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract one job posting, then follow the company-page link.

        The partially filled item is passed on via ``meta['item']``.
        """
        resource = '51job'
        # Company detail-page link; '' when the posting has no company page.
        url = response.xpath('//p[@class="cname"]/a/@href').extract_first('')

        try:
            name = response.xpath('//h1/text()').extract_first().strip()
        except Exception:
            name = None

        # Salary appears in several shapes; only "low-high/月" can be split
        # into a range — anything else is kept whole with a 0 lower bound.
        salary = response.xpath('//div[@class="cn"]/strong/text()').extract_first()
        if '-' in salary:
            stripped = salary.strip('/月')
            salary_from = stripped.split('-')[0]
            salary_to = stripped.split('-')[1]
        else:
            salary_from = 0
            salary_to = salary

        # One query for the whole "msg ltype" line; positional fields are:
        # [0] work place, [1] years of experience, [2] degree, [3] headcount,
        # [4] release date (sometimes absent).
        # NOTE(review): in the original this XPath string literal was broken
        # across a line boundary ('//p[@class="msg \n ltype"]...'); rejoined.
        msg = response.xpath('//p[@class="msg ltype"]/text()').extract()
        years_of_work = msg[1].strip()
        if '-' in years_of_work:
            years_of_work_from = years_of_work.split('-')[0]
            years_of_work_to = years_of_work.split('-')[1]
        else:
            years_of_work_from = 0
            years_of_work_to = years_of_work
        work_place = msg[0].strip()
        degree = msg[2].strip()
        try:
            release_ = msg[4].strip()
        except Exception:
            release_ = '无'
        member = msg[3].strip()

        # The description comes back as a list of text nodes; join to a string.
        description = ' '.join(
            response.xpath('//div[@class="bmsg job_msg inbox"]/p/text()').extract())
        try:
            temptation = ' '.join(
                response.xpath('//span[@class="sp4"]/text()').extract())
        except Exception:
            temptation = '无'
        try:
            belong_id = response.xpath(
                '//div[@class="bmsg inbox"]/a/@onclick'
            )[0].extract().split('=')[1].split(',')[0]
        except Exception:
            belong_id = '无'

        # NOTE(review): the following six values were assigned into the item
        # below but never defined anywhere in the visible source, which would
        # raise NameError at runtime. Defaulted here — confirm intended values.
        is_annual_salary = 0
        is_negotiable = 0
        is_alive = 1
        create_time = None
        modified_time = None
        is_add = 0

        item = MyItem()
        item['resource'] = resource
        item['url'] = url
        item['name'] = name
        item['salary_from'] = salary_from
        item['salary_to'] = salary_to
        item['is_annual_salary'] = is_annual_salary
        item['is_negotiable'] = is_negotiable
        item['years_of_work_from'] = years_of_work_from
        item['years_of_work_to'] = years_of_work_to
        item['work_place'] = work_place
        item['degree'] = degree
        item['release'] = release_
        item['member'] = member
        item['temptation'] = temptation
        item['description'] = description
        item['is_alive'] = is_alive
        item['create_time'] = create_time
        item['modified_time'] = modified_time
        item['is_add'] = is_add
        item['belong_id'] = belong_id

        # NOTE(review): when ``url`` is '' (no company page) this request
        # fails and the item is silently dropped — confirm that is intended.
        yield Request(url=url, meta={'item': item}, callback=self.parse_three)

    def parse_three(self, response):
        """Extract company details and emit the completed item."""
        firm_introduction = response.xpath('//div[@class="con_txt"]/text()')[0].extract()
        firm_name = response.xpath('//h1/text()')[0].extract().strip()
        # Brittle positional selector — confirm index 15 against page layout.
        firm_scale = response.xpath('//p/text()')[15].extract().strip()
        if '-' in firm_scale:
            firm_scale_from = firm_scale.split('-')[0]
            # NOTE(review): the original assigned the whole un-split string
            # here (e.g. "150-500人"); fixed to the upper bound, matching the
            # salary/years handling above.
            firm_scale_to = firm_scale.split('-')[1]
        else:
            firm_scale_from = 0
            firm_scale_to = firm_scale
        ltype = response.xpath('//p[@class="ltype"]/text()').extract()
        firm_nature = ltype[0].strip()
        firm_industry = ltype[2].strip()
        firm_website = response.xpath('//span[@class="icon_det"]/text()').extract_first()
        firm_location = response.xpath(
            '//p[@class="fp"]/text()').extract()[1].strip().split('(')[0]
        firm_lon = 0  # coordinates not available on the page
        firm_lat = 0

        item = response.meta.get('item')
        item['firm_place'] = item['work_place']
        item['firm_introduction'] = firm_introduction
        item['firm_name'] = firm_name
        item['firm_scale_from'] = firm_scale_from
        item['firm_scale_to'] = firm_scale_to
        item['firm_nature'] = firm_nature
        item['firm_industry'] = firm_industry
        item['firm_website'] = firm_website
        item['firm_location'] = firm_location
        item['firm_lon'] = firm_lon
        item['firm_lat'] = firm_lat
        yield item

这是爬虫部分,剩下的在pipeline中设计数据库信息,创建数据库并导入
from twisted.enterprise import adbapi
from pymysql import cursors


class TwistedMysqlPipeline(object):
    """Item pipeline that writes to MySQL asynchronously.

    Uses Twisted's ``adbapi.ConnectionPool`` so database inserts run on a
    thread pool and never block the Scrapy reactor.
    """

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from Scrapy settings (called by the framework).

        Reads MYSQL_HOST / MYSQL_USER / MYSQL_PW / MYSQL_DB / MYSQL_CHARSET.
        """
        db_params = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PW'],
            db=settings['MYSQL_DB'],
            port=3306,  # TODO(review): consider a MYSQL_PORT setting
            use_unicode=True,
            charset=settings['MYSQL_CHARSET'],
            cursorclass=cursors.DictCursor,  # rows come back as dicts
        )
        db_pool = adbapi.ConnectionPool('pymysql', **db_params)
        return cls(db_pool)

    def __init__(self, db_pool):
        # Shared Twisted connection pool created in from_settings().
        self.db_pool = db_pool

    def process_item(self, item, spider):
        """Queue an async insert for ``item``; return it for later pipelines.

        NOTE(review): ``insert_item`` and ``handle_error`` are referenced
        here but not defined in the visible source — they must be
        implemented (the insert SQL and error logging, respectively).
        """
        query = self.db_pool.runInteraction(self.insert_item, item)
        query.addErrback(self.handle_error, item, spider)
        return item
剩下的工作就是创建数据表,并在 pipeline 中实现插入数据的 insert_item 方法。