Scraping Corporate Yellow Pages Data with the Scrapy Framework

I had some free time today and realized it had been a while since I last wrote a crawler with the Scrapy framework, so I put together a spider for corporate yellow pages: it scrapes company listings from Shunqi (11467.com) and saves them to MongoDB. Without further ado, let's look at the code.

The spider code is below (I've gotten used to BeautifulSoup, so that's what it uses for parsing instead of Scrapy's own selectors).

import scrapy
from bs4 import BeautifulSoup
from shunqi.items import ShunqiItem
import re


class EpregSpider(scrapy.Spider):
    name = 'epreg'
    allowed_domains = ['11467.com']
    start_url = "http://b2b.11467.com/"

    def start_requests(self):
        # Entry point: fetch the B2B home page, then walk province -> city -> category -> company pages.
        yield scrapy.Request(url=self.start_url, callback=self.get_city)

    def get_city(self, response):
        # Parse the "按城市浏览全国公司黄页" directory into (province, city) pairs.
        soup = BeautifulSoup(response.text, "lxml")
        dl_listtxt = soup.find(text="按城市浏览全国公司黄页").findNext("div").find_all("dl", attrs={"class": "listtxt"})
        for dl in dl_listtxt:
            province = dl.find("dt").text.strip()
            dd_list = dl.find_all("dd")
            for dd in dd_list:
                city = dd.text.strip()
                # The hrefs are protocol-relative ("//..."), so prepend the scheme.
                city_link = "https:" + dd.find("a")["href"]
                yield scrapy.Request(url=city_link, callback=self.get_class,
                                     meta={"city": city, "province": province})


    def get_class(self, response):
        # On each city page, follow every industry-category link listed under "公司分类".
        city = response.meta["city"]
        province = response.meta["province"]
        soup = BeautifulSoup(response.text, "lxml")
        class_all = soup.find(text=re.compile("公司分类")).findNext("ul").find_all("li")
        for li in class_all:
            dd_all = li.find("dl").find_all("dd")
            for dd in dd_all:
                cor_link = "https:" + dd.find("a")["href"]
                yield scrapy.Request(url=cor_link, callback=self.get_cor,
                                     meta={"city": city, "province": province})


    def get_cor(self, response):
        # Category listing page: queue every company detail page, then follow pagination.
        city = response.meta["city"]
        province = response.meta["province"]
        soup = BeautifulSoup(response.text, "lxml")
        company_list = soup.find("ul", attrs={"class": "companylist"})
        if company_list:
            company_list1 = company_list.find_all("div", attrs={"class": "f_l"})
            for li in company_list1:
                whole_url = "https:" + li.find("h4").findNext("a")["href"]
                # "qiye" detail pages keep the contact block behind an anchor, so jump straight to it.
                if "qiye" in whole_url:
                    whole_url = whole_url + "#contact"
                yield scrapy.Request(url=whole_url, callback=self.get_info,
                                     meta={"url": whole_url, "city": city, "province": province})
        is_pages = soup.find("div", attrs={"class": "pages"})
        if is_pages:
            # The "尾页" (last page) link looks like ".../xxx-N.htm"; pull N out and enumerate pages 2..N.
            page_href = is_pages.find(text="尾页").findParent("a")["href"]
            max_page = page_href.split("-")[1].replace(".htm", "")
            for page in range(2, int(max_page) + 1):
                page_url = page_href.split("-")[0] + "-" + str(page) + ".htm"
                yield scrapy.Request(url=page_url, callback=self.get_cor,
                                     meta={"city": city, "province": province})

    def deal_item(self, soup, text, label):
        # Find the node whose text matches `text` and return the stripped text of the
        # next `label` element; return None when the field is missing from the page.
        try:
            data = soup.find(text=re.compile(text)).findNext(label).text.strip().replace("\r\n", "")
        except AttributeError:
            data = None
        return data


    def get_info(self, response):
        # Company detail page: pull the contact and registration fields into a ShunqiItem.
        item = ShunqiItem()
        whole_url = response.meta["url"]
        # The numeric company id lives in either a /qiye/<id>.htm or a /co/<id>.htm URL.
        if "qiye" in whole_url:
            id = re.search(r"qiye/(\d+)\.htm", whole_url, re.S).group(1)
        else:
            id = re.search(r"co/(\d+)\.htm", whole_url, re.S).group(1)
        city = response.meta["city"]
        province = response.meta["province"]
        soup = BeautifulSoup(response.text, "lxml")
        adress = self.deal_item(soup, "公司地址:", "dd")
        phone = self.deal_item(soup, "固定电话:", "dd")
        manager = self.deal_item(soup, "经理:", "dd")
        # Some pages label the mobile number "手机号码", others "经理手机"; take whichever exists.
        if self.deal_item(soup, "手机号码:", "dd") is None:
            link_phone = self.deal_item(soup, "经理手机:", "dd")
        else:
            link_phone = self.deal_item(soup, "手机号码:", "dd")
        email = self.deal_item(soup, "电子邮件:", "dd")
        cor_name = self.deal_item(soup, "法人名称:", "td")
        business = self.deal_item(soup, "经营范围:", "td")
        main_product = self.deal_item(soup, "主要经营产品:", "td")
        # Every field defined on ShunqiItem is filled from the local variable of the same name,
        # so the locals above (including the "adress" spelling) must match the item's field names.
        for field in item.fields:
            item[field] = eval(field)
        yield item
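
The spider imports ShunqiItem from shunqi.items, but the post never shows that file. Because get_info fills the item with item[field] = eval(field), every field name has to match one of the local variables in that method. Here is a minimal sketch of what items.py could look like; the field list (and keeping the "adress" spelling) is inferred from those locals and is my assumption, not the original author's definition.

# shunqi/items.py -- hedged sketch; field names are guessed from the locals
# that get_info() exposes to eval(), not taken from the original post.
import scrapy


class ShunqiItem(scrapy.Item):
    id = scrapy.Field()            # numeric id parsed from the company URL
    province = scrapy.Field()
    city = scrapy.Field()
    adress = scrapy.Field()        # spelled to match the local variable in get_info()
    phone = scrapy.Field()
    manager = scrapy.Field()
    link_phone = scrapy.Field()
    email = scrapy.Field()
    cor_name = scrapy.Field()
    business = scrapy.Field()
    main_product = scrapy.Field()
    whole_url = scrapy.Field()     # assumption: also keep the detail-page URL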

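The intro says the data goes into MongoDB, but the pipeline isn't shown either. Below is a minimal pymongo-based sketch under my own assumptions (the MongoPipeline class name, the MONGO_URI / MONGO_DATABASE settings keys, and the "company" collection are all chosen for illustration); treat it as one way to wire things up rather than the original author's code.

# shunqi/pipelines.py -- hedged sketch; names and settings keys are assumptions.
import pymongo


class MongoPipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection settings from settings.py, with assumed defaults.
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://localhost:27017"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "shunqi"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert on the company id so re-crawled pages don't create duplicates.
        self.db["company"].update_one({"id": item.get("id")}, {"$set": dict(item)}, upsert=True)
        return item

With the pipeline registered in settings.py (for example ITEM_PIPELINES = {"shunqi.pipelines.MongoPipeline": 300}), the spider is started the usual way with scrapy crawl epreg.
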
Below is a screenshot of some of the collected data:
(screenshot not reproduced here)
That's all of the basic code. This article is only meant to share one approach to collecting this kind of data; it isn't very complicated, so feel free to write your own version if you're interested. You're also welcome to visit my personal blog homepage… …

