Crawling data across multiple page levels with the Scrapy framework and writing it to a database or file

Requirements

Our company's app has a module that needs to look up phone number segments, so that phone numbers for a given city can be imported.
Reference third-party site: http://m.jihaoba.com/tools/haoduan/

Using the Scrapy framework, crawl the cities and their number segments and store them in a database. Table structure:

create table `fcxlt_fans_data` (
    `id` bigint NOT NULL AUTO_INCREMENT,
    `city_id` int(11) NOT NULL,
    `segment_num_3` char(3) NOT NULL,
    `segment_num_7` mediumtext NOT NULL,
    PRIMARY KEY (`id`),
    KEY `city_id` (`city_id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;

phones.py

# -*- coding: utf-8 -*-
import scrapy
from phones.items import PhonesItem
import re
class PhonesSpider(scrapy.Spider):
    name = 'phones'
    allowed_domains = ['m.jihaoba.com']
    start_urls = ['http://m.jihaoba.com/tools/haoduan/']

    def parse(self, response):
        # first level: the start page lists all cities
        city_list = response.xpath("//ul[@class='city_lst']/li")
        for c in city_list:
            phones_item = PhonesItem()
            s = c.xpath(".//a/text()").extract()[0]
            phones_item['cname'] = s.strip()
            phref = c.xpath(".//a//@href").extract()
            curl = "http://m.jihaoba.com" + phref[0]
            # crawl each city's detail page, passing the item along via meta
            yield scrapy.Request(url=curl, meta={'phones_item': phones_item, 'curl': curl},
                                 callback=self.gettitle, dont_filter=True)

    def gettitle(self, response):
        # second level: the city page lists the 3-digit number segments
        phones_item = response.meta['phones_item']
        curl = response.meta['curl']
        hd = response.xpath("//ul[@class='city_lst']/a")
        for i in hd:
            te = i.xpath(".//font/text()").extract()
            u = "http://m.jihaoba.com" + i.xpath(".//@href").extract()[0]
            dh_3 = re.findall(r'\d+', te[0])
            # crawl the 3-digit segment's detail page
            yield scrapy.Request(url=u, meta={'phones_item': phones_item, 'url': u,
                                              'dh_3': dh_3[0], 'curl': curl},
                                 callback=self.gettels, dont_filter=True)

    def gettels(self, response):
        # third level: collect the full 7-digit prefixes for this segment
        phones_item = response.meta['phones_item']
        phones_item['dh_url'] = response.meta['url']
        phones_item['city_url'] = response.meta['curl']
        phones_item['dh_3'] = response.meta['dh_3']
        p = response.xpath(".//li[@class='city-hd01']/a/text()").extract()
        phones_item['dh_7'] = ",".join(p)
        yield phones_item
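
The spider imports PhonesItem from phones.items. The original post doesn't show items.py, but judging from the fields the spider assigns, a minimal sketch would look like this:

items.py (sketch)

# -*- coding: utf-8 -*-
# inferred from the fields used in phones.py; not shown in the original post
import scrapy

class PhonesItem(scrapy.Item):
    cname = scrapy.Field()     # city name
    city_url = scrapy.Field()  # city detail-page URL
    dh_url = scrapy.Field()    # segment detail-page URL
    dh_3 = scrapy.Field()      # 3-digit number segment
    dh_7 = scrapy.Field()      # comma-separated 7-digit prefixes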

Pipeline for writing to the database (pipelines.py)

import json
import pymysql
import time

class PhonesPipeline(object):
    def __init__(self):
        # connect to the database (keyword arguments; newer pymysql versions
        # no longer accept positional connect() arguments)
        self.connect = pymysql.connect(host='localhost', user='root', password='root',
                                       database='appjjr', use_unicode=True, charset='utf8')
        # get cursor
        self.cursor = self.connect.cursor()
        print("connecting mysql success!")

    def process_item(self, item, spider):
        item = dict(item)
        if len(item['dh_7']) > 0:
            # look up the city id by name prefix (level 2 = city)
            sql = """select id from fcxlt_areas where name like %s and level = %s"""
            city_name = item['cname'] + "%"
            self.cursor.execute(sql, (city_name, 2))
            res = self.cursor.fetchone()
            if res:
                # parameterized insert; the column is city_id, matching the DDL above
                sqlstr = "insert into fcxlt_fans_data(city_id, segment_num_3, segment_num_7) values (%s, %s, %s)"
                self.cursor.execute(sqlstr, (res[0], item['dh_3'], item['dh_7']))
                self.connect.commit()
                time.sleep(0.5)
        return item
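
For a pipeline to run, it must be enabled in the project's settings.py. A minimal sketch, assuming the project is named phones as above (the priority value 300 is illustrative):

# settings.py -- enable the pipeline (the priority value is illustrative)
ITEM_PIPELINES = {
    'phones.pipelines.PhonesPipeline': 300,
}

The crawl is then started with scrapy crawl phones.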

Writing to a JSON file instead (a separate pipeline class; the class name below is illustrative, the original post only shows the methods)

class JsonWriterPipeline(object):  # class name is illustrative; not in the original post
    def __init__(self):
        self.file = open('fenghua.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line, each followed by a comma
        str_data = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(str_data)
        return item

    def __del__(self):
        self.file.close()
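
Note that the file written this way is a sequence of JSON objects each followed by a comma, not a valid JSON document by itself. A minimal sketch for reading it back (assuming the fenghua.json produced above):

import json

# wrap the comma-terminated objects in brackets to form a valid JSON array
with open('fenghua.json', encoding='utf-8') as f:
    data = json.loads('[' + f.read().rstrip().rstrip(',') + ']')
print(len(data), "items loaded")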

Reposted from blog.csdn.net/u013252962/article/details/100158830