用python爬取网贷之家p2p平台数据

网贷之家中的p2p平台数据比较容易获取，重要的就是如何分析网页的源代码然后从里面提取自己需要的信息，也不需要用户登录，该网站的爬虫比较简单，主要用了urllib包来获取网页信息，用BeautifulSoup来解析网页，最后用正则表达式提取数据。这里就直接上源码了：

# -*- coding: utf-8 -*-
"""
Created on Wed Aug  8 18:22:26 2018

@author: 95647
"""
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
lists =[]
domains = "https://www.wdzj.com"

def get_platform_site(url,lists):
    """获取所有的平台网址"""
#    global lists
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html,'lxml')
    title = bsObj.findAll("div",{'class':'itemTitle'})  
    for titles in title:
        links =  titles.findAll("a",{'target':'_blank'})
        for link in links:
            if 'href' in link.attrs:
                lists.append(link.attrs['href'])
#                print(link.attrs['href'])
    return lists                   #用utf-8进行解码

def pages_num(url):
    """获取各类平台的页面总数"""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html,'lxml')    
    pages= bsObj.findAll("a",text = '尾页')
    for page in pages:
        if "currentnum" in page.attrs:
            pages_num = page.attrs["currentnum"]
        else:
            return None
    return pages_num


def conditions(i):
    """获取各个平台的运营状态，生成包含各类平台的列表"""
#    global lists
    lists =[]
    url_ = r"""https://www.wdzj.com/dangan/search?filter=e%s"""%str(i)
    all_pages_num = int(pages_num(url_))
    for num in range(1, all_pages_num +1):
        url = url_ + "&currentPage=%s"%str(num)
        lists = get_platform_site(url,lists)
    return lists

operations = conditions(1)   #正常运营平台
#close_transitions = conditions(2)  #停业或转型平台
#in_problems= conditions(3)   #问题平台

def plat_profile(lists):
    """抓取平台的数据"""
    global domains
    plat_profile=[]
    for site in lists:
        plat_info =[]
        url = domains + site
        req = urllib.request.Request(url, headers=headers)
        html = urlopen(req)
        bsObj = BeautifulSoup(html,'lxml')
        plat_name = bsObj.findAll('h1')[0].attrs["alt"]               #平台名称
        t_l = bsObj.findAll("div",{"class":"pt-info"})[0].get_text()
        time_s=""
        location =""
        if len(t_l)>0:
            t_l = re.split("上线",t_l)                                     
            time_s = t_l[0].strip()                                  #上线时间
            location= t_l[1].strip()                                 #平台所属地域 
        common_data = bsObj.findAll("b",{"class":"tab_common_data"}) 
        yield0 =""  #给出变量值
        duration = "" #给出变量值
        for data in common_data:
            text = data.parent.get_text()
            if len(re.findall(".*月.*",text)) > 0:
                duration = re.findall(".*月.*",text)[0]
                duration = text.strip()                              #平均期限
            if len(re.findall(".*%.*",text)) > 0:
                yield0 = re.findall(".*%.*",text)[0]
                yield0 = text.strip()                                #平均收益率 
        rates_ = bsObj.find("div",{"class":"dpxximg"})
        if "data-pl" in rates_.attrs:
            rates = bsObj.find("div",{"class":"dpxximg"}).attrs["data-pl"] #获取评分
        plat_pro = bsObj.findAll("div",{"class":"bgbox-bt zzfwbox"})
        plat_pro = BeautifulSoup(str(plat_pro),"lxml")
        L1 =[]
        L2 =[]
        zzzj = ""
        gqss = ""
        yhtg = ""           
        rzjl = ""            
        jgxh = ""            
        ICP = ""
        zdtb = ""
        zqzr = ""
        tbbz = ""
        bzms = ""    
        for div in plat_pro.findAll("div",{"class":"l"}):
            L1.append(div.get_text().strip())
        for div in plat_pro.findAll("div",{"class":"r"}):
            L2.append(div.get_text().strip())
        for slzz in L1:              #获取平台的备案信息
            if slzz =="注册资金":
                zzzj = L2[L1.index(slzz)]
            if slzz =="股权上市":
                gqss = L2[L1.index(slzz)].replace(" ","")
            if slzz =="银行存管":
                yhtg = L2[L1.index(slzz)]            
            if slzz =="融资记录":
                rzjl = L2[L1.index(slzz)].replace(" ","")            
            if slzz =="监管协会":
                jgxh = L2[L1.index(slzz)].replace(" ","")            
            if slzz =="ICP号":
                ICP = L2[L1.index(slzz)]
            if slzz =="自动投标":
                zdtb = L2[L1.index(slzz)]
            if slzz =="债券转让":
                zqzr = L2[L1.index(slzz)]            
            if slzz =="投标保障":
                tbbz = L2[L1.index(slzz)]
            if slzz =="保障模式":
                bzms = L2[L1.index(slzz)]         
        plat_info.append(plat_name)  #这个地放用了很笨的方法，一个个的添加元素到列表中，存在优化空间
        plat_info.append(time_s)
        plat_info.append(location)
        plat_info.append(duration)
        plat_info.append(yield0)
        plat_info.append(rates)
        plat_info.append(zzzj)
        plat_info.append(gqss)
        plat_info.append(yhtg)
        plat_info.append(rzjl)
        plat_info.append(jgxh)
        plat_info.append(ICP)
        plat_info.append(zdtb)
        plat_info.append(zqzr)
        plat_info.append(tbbz)
        plat_info.append(bzms)
        plat_profile.append(plat_info)
        print("------------->"+plat_name+str(lists.index(site)))  #打印爬取的平台信息
    return plat_profile

plat_profile = plat_profile(conditions(1))  #conditions根据平台类型的不同来设置，为1则表示正常运营平台
name = ['平台名称','上线时间','区域','投资期限','平均收益率','评分',
                  '注册资金', '股权上市', '银行存管', '融资记录', '监管协会', 
                  'ICP号', '自动投标', '债权转让', '投标保障', '保障模式']
operating = pd.DataFrame(columns=name,data= plat_profile)
operating.to_csv(r"""C:\Users\95647\Desktop\operating.csv""")   #path to save csvfile

写该代码之前，还没有学习panda的用法，所以简单粗暴的用列表和字典来解决问题，会使用pandas的朋友可以用pandas来进行优化。爬虫运行过程如下：

该爬虫运行速度很快，我爬取了100个正常运营平台的信息，才用了5分钟左右，部分爬虫结果如下：
爬虫结果展示

有问题的和其他想法的朋友，欢迎加QQ：956471511交流，这里还有一篇关于如何爬取人人贷网站数据的博文，有兴趣的朋友可以看一下。手把手教你用python爬取人人贷借款人数据

用python爬取网贷之家p2p平台数据

猜你喜欢