我想分析一下互联网行业全国招聘实习生的情况,通过爬取智联招聘,得到15467条数据,并导入MySQL
在爬虫文件 spiders/recurit.py 里(注意:下面这段是爬虫代码,它从 items.py 导入 ZhaopinzhilianItem):
import scrapy
from scrapy.http import Request
from lxml import etree

from zhaopinzhilian.items import ZhaopinzhilianItem


class RecuritSpider(scrapy.Spider):
    """Crawl internet-industry intern listings from sou.zhaopin.com.

    Result pages 1-90 of a fixed search are requested up front; every
    job-detail link found on a result page is followed and parsed into a
    ``ZhaopinzhilianItem`` (persisted to MySQL by the item pipeline).
    """

    name = 'recurit'
    allowed_domains = ['zhaopin.com']

    # Fixed search query (internet-industry interns, all regions); the
    # 1-based result-page number is appended after "&p=".
    base_url = (
        "http://sou.zhaopin.com/jobs/searchresult.ashx?bj=5006000&sj=299"
        "&in=210500%3b160400%3b160000%3b160500%3b160200%3b300100%3b160100"
        "%3b160600&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sb=2&sm=0"
        "&isfilter=0&fl=489&isadv=0&sg=2b24ff0c4e924139b8749ea5a59d2dbb&p="
    )

    # Desktop Chrome UA so the site serves the regular HTML layout.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/59.0.3071.115 Safari/537.36"
    }

    def start_requests(self):
        # Yield all 90 result pages once here, instead of re-yielding the
        # whole pagination from every parse() call (the old code produced
        # ~89 duplicate requests per page and relied on the dupefilter to
        # absorb them).
        for page in range(1, 91):
            yield Request(
                self.base_url + str(page),
                callback=self.parse,
                headers=self.header,
                dont_filter=True,
            )

    def parse(self, response):
        """Extract job-detail links from one search-result page."""
        res = etree.HTML(response.text)
        for table in res.xpath('//table[@class="newlist"]'):
            for link in table.xpath('.//td[@class="zwmc"]//a[1]//@href'):
                yield Request(link, callback=self.next, headers=self.header)

    def next(self, response):
        """Parse one job-detail page into a ``ZhaopinzhilianItem``.

        On any extraction failure (the absolute /html/body/... paths are
        tied to the page layout at crawl time and break on redesigns) the
        item is skipped with a logged warning instead of a silent print().
        """
        try:
            item = ZhaopinzhilianItem()
            item["zwmc"] = response.xpath(
                "//div[@class='inner-left fl']/h1/text()").extract()
            item["gsmc"] = response.xpath(
                "//div[@class='inner-left fl']/h2/a[@target='_blank']/text()"
            ).extract()
            res = etree.HTML(response.text)
            item["gsgm"] = res.xpath(
                "/html/body/div[6]/div[2]/div[1]/ul/li[1]/strong/text()")
            # Strip the "元/月" (yuan per month) suffix, keeping the range.
            zwyx = res.xpath("/html/body/div[6]/div[1]/ul/li[1]/strong/text()")
            item["zwyx"] = [zwyx[0].replace(u'元/月\xa0', u' ')]
            item["gzdd"] = res.xpath(
                "/html/body/div[6]/div[1]/ul/li[2]/strong/a/text()")
            # Strip the "人" (persons) suffix from the headcount.
            zprs = res.xpath("/html/body/div[6]/div[1]/ul/li[7]/strong/text()")
            item["zprs"] = [zprs[0].replace(u'人', u' ')]
            item["minxueli"] = res.xpath(
                "/html/body/div[6]/div[1]/ul/li[6]/strong/text()")
            yield item
        except IndexError:
            # Page layout not recognised — drop this posting, keep crawling.
            self.logger.warning("Unrecognised detail page, skipped: %s",
                                response.url)
之后在 pipelines.py 里对数据进行导入数据库的操作:
import pymysql


class ZhaopinzhilianPipeline(object):
    """Persist scraped job items into the MySQL table ``zhilian.zhaopin``.

    One database connection is opened for the whole crawl (in
    ``open_spider``) instead of a connect/close round-trip per item,
    which the original implementation did ~15k times.
    """

    def open_spider(self, spider):
        # Scrapy calls this once when the spider starts.
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    passwd="root", db="zhilian",
                                    charset="utf8")
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes.
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert every scraped row of *item* and commit.

        The seven field lists are parallel (one entry per posting);
        ``zip`` stops at the shortest list, so a partially-filled item
        can no longer raise IndexError mid-insert.
        """
        sql = ("insert into zhaopin(zwmc,gsmc,zwyx,zprs,gzdd,gsgm,minxueli)"
               " values(%s,%s,%s,%s,%s,%s,%s);")
        rows = zip(item["zwmc"], item["gsmc"], item["zwyx"], item["zprs"],
                   item["gzdd"], item["gsgm"], item["minxueli"])
        for params in rows:
            # Parameterized query — values are escaped by the driver.
            self.cursor.execute(sql, params)
        self.conn.commit()
        return item
最后记得在 settings.py 里打开 pipelines:
# Enable the MySQL pipeline; 300 is its run-order priority
# (pipelines with lower numbers run first, valid range 0-1000).
ITEM_PIPELINES = { 'zhaopinzhilian.pipelines.ZhaopinzhilianPipeline': 300, }