1. Loan dataset introduction
Import the libraries used
`plotly.graph_objs` is a Plotly sub-module that exposes all of Plotly's figure objects. After importing it, you define a figure object from the data to be presented plus any custom layout parameters, and then pass that figure to `plotly.offline.iplot()` for the final rendering.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette() # default seaborn color palette
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
# NOTE(review): plotly.offline is imported a second time under another alias and
# notebook mode is initialised again; both aliases (py, offline) are used below.
import plotly.offline as offline
offline.init_notebook_mode()
import plotly.tools as tls
import squarify
from mpl_toolkits.basemap import Basemap
from numpy import array
from matplotlib import cm
# Suppress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings("ignore")
# Print all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
The situation of the dataset:
# CSV files to load from the working directory, listed in the same order as
# the variables they are unpacked into below.
# (loans.csv is intentionally not loaded.)
_csv_files = (
    "kiva_loans.csv",                 # loan dataset
    "kiva_mpi_region_locations.csv",  # MPI region / location information
    "loan_theme_ids.csv",             # loan themes (main purpose of the loan)
    "loan_themes_by_region.csv",      # location information joined with loan themes
    "lenders.csv",                    # lender data
    "loans_lenders.csv",              # loan-to-lender mapping
    "country_stats.csv",              # country statistics
    "MPI_national.csv",               # national multidimensional poverty index
    "MPI_subnational.csv",            # sub-national multidimensional poverty index
)
(
    kiva_loans_data,
    kiva_mpi_locations_data,
    loan_theme_ids_data,
    loan_themes_by_region_data,
    lenders_data,
    loans_lenders_data,
    country_stats_data,
    mpi_national_data,
    mpi_subnational_data,
) = (pd.read_csv(csv_file) for csv_file in _csv_files)
The data volume of the dataset:
# Report the (rows, columns) shape of every loaded table, grouped under the
# same banners as before.
for _label, _frame in (
    ("kiva_loans_data", kiva_loans_data),
    ("kiva_mpi_locations_data", kiva_mpi_locations_data),
    ("loan_theme_ids_data", loan_theme_ids_data),
    ("loan_themes_by_region_data", loan_themes_by_region_data),
):
    print(f"Size of {_label}", _frame.shape)

print("***** Additional kiva snapshot******")
# loans_data is not loaded, so its size is not reported.
for _label, _frame in (
    ("lenders_data", lenders_data),
    ("loans_lenders_data", loans_lenders_data),
    ("country_stats_data", country_stats_data),
):
    print(f"Size of {_label}", _frame.shape)

print("*****Multidimensional Poverty Measures Data set******")
for _label, _frame in (
    ("mpi_national_data", mpi_national_data),
    ("mpi_subnational_data", mpi_subnational_data),
):
    print(f"Size of {_label}", _frame.shape)
Dataset overview:
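`include=["O"]` (the capital letter O, i.e. the object dtype) makes `describe()` summarize the non-numeric columns; use `include="all"` to display the indicators for every column.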
Check for missing values, calculate the number of all missing values, sort them, and calculate the proportion of missing values
It can be seen that, apart from `tags`, the columns have few missing values.
There are many missing values in the region and poverty index data
The loan dataset has fewer missing values
In the data of loan purpose and region, geocode_old data and mpi_geo data have more missing values, and other missing values are less
2. Data visualization
1. The main purpose of the loan
# Horizontal bar chart of the number of loans per sector
# (value_counts() returns the sectors sorted by frequency, largest first).
plt.figure(figsize=(15, 8))
sector_name = kiva_loans_data['sector'].value_counts()
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally.
sns.barplot(x=sector_name.values, y=sector_name.index)
# Write the exact count at the left edge of each bar (bar i sits at y == i).
for i, v in enumerate(sector_name.values):
    plt.text(0.8, i, v, color='k', fontsize=19)
plt.xticks(rotation='vertical')
plt.xlabel('Number of loans were given')
plt.ylabel('Sector Name')
plt.title("Top sectors in which more loans were given")
plt.show()
Agriculture, food, retail, services, housing, clothing, education and other necessities are at the top of the list
A more intuitive image:
# Treemap of the same per-sector loan counts: each tile's area is proportional
# to the number of loans, and the tile is annotated with sector name and count.
count = kiva_loans_data['sector'].value_counts()
tile_sizes = count.values
tile_labels = count.index
plt.figure(figsize=(15, 8))
squarify.plot(sizes=tile_sizes, label=tile_labels, value=tile_sizes)
plt.title('Distribution of sectors')
Detailed use:
# Horizontal bar chart of the ten most frequent free-text loan uses.
plt.figure(figsize=(15, 8))
count = kiva_loans_data['use'].value_counts().head(10)
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally.
sns.barplot(x=count.values, y=count.index)
# Write the exact count at the left edge of each bar (bar i sits at y == i).
for i, v in enumerate(count.values):
    plt.text(0.8, i, v, color='k', fontsize=19)
plt.xlabel('Count', fontsize=12)
plt.ylabel('uses of loans', fontsize=12)
plt.title("Most popular uses of loans", fontsize=16)
The most popular uses are buying water, food and medicine.
2. Repayment situation
Monthly repayment and irregular repayment (paying back whenever money is available) are the most common schedules.
4. Which countries borrow the most
Lower-income countries such as the Philippines and Kenya have the highest demand for loans.
5. How much is the loan
# Plot every loan's funded amount in ascending order against its rank, which
# makes the long tail of large loans visible.
# NOTE(review): the y-axis label says 'loan_amount' but the plotted column is
# funded_amount - confirm which one is intended.
funded_sorted = np.sort(kiva_loans_data.funded_amount.values)
loan_ranks = range(kiva_loans_data.shape[0])

plt.figure(figsize=(12, 8))
plt.scatter(loan_ranks, funded_sorted)
plt.title("Loan Amount Distribution")
plt.xlabel('index', fontsize=12)
plt.ylabel('loan_amount', fontsize=12)
plt.show()
Most loans are relatively small (below 20,000), while a very small number of loans are much larger.
6. Demand in each region
The demand in sub-Saharan Africa is relatively large, and there is basically no demand in Europe and Central Asia
7. Number distribution of lenders
1 lender, more than 5-10
8. Detailed purpose of the loan
General-store and farming loans are the most common.
9. How long will it take to repay the loan?
There are more repayments in 8 months and 14 months.
9. Gender ratio
# Pie chart of borrower genders across all loans.
# borrower_genders holds a comma-separated list with one entry per borrower,
# so every entry is counted individually.
gender_list = []
for gender in kiva_loans_data["borrower_genders"].values:
    # Skip missing values (NaN, or the string "nan" once the column has been
    # cast to str).
    if str(gender) != "nan":
        gender_list.extend(part.strip() for part in gender.split(","))

temp_data = pd.Series(gender_list).value_counts()
labels = np.array(temp_data.index)
sizes = np.array((temp_data / temp_data.sum()) * 100)  # share in percent

# The chart is rendered by plotly, so no matplotlib figure is opened here
# (doing so only produced an empty plot in the notebook output).
trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(title='Borrower Gender')
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="BorrowerGender")
Most of the borrowers are women.
11. Average quota
# Mean funded amount by the gender of the first listed borrower of each loan.
kiva_loans_data.borrower_genders = kiva_loans_data.borrower_genders.astype(str)

# Take the first comma-separated entry with the .str accessor.  The result
# keeps kiva_loans_data's own index, so the assignment below stays correctly
# aligned even after the frame's index is replaced (this file later sets it to
# posted_time); building a separate RangeIndex DataFrame would then yield NaN
# for every row.
first_gender = kiva_loans_data.borrower_genders.str.split(',').str[0]
# astype(str) turned missing values into the literal 'nan'; restore them to NaN.
kiva_loans_data['sex_borrowers'] = first_gender.where(first_gender != 'nan')

sex_mean = (
    kiva_loans_data.groupby(['sex_borrowers'])['funded_amount']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
print(sex_mean)

g1 = sns.barplot(x='sex_borrowers', y='funded_amount', data=sex_mean)
g1.set_title("Mean funded Amount by Gender ", fontsize=15)
g1.set_xlabel("Gender")
g1.set_ylabel("Average funded Amount(US)", fontsize=12)
On average, male borrowers receive a larger funded amount.
# Compare how borrowers of each gender (sex_borrowers) are distributed over
# the repayment intervals: print the counts, then draw them as grouped bars.
f, ax = plt.subplots(figsize=(15, 5))

for interval in ('monthly', 'weekly', 'bullet', 'irregular'):
    has_interval = kiva_loans_data['repayment_interval'] == interval
    print(f"Genders count with repayment interval {interval}\n",
          kiva_loans_data['sex_borrowers'][has_interval].value_counts())

gender_interval_ax = sns.countplot(x="sex_borrowers", hue='repayment_interval', data=kiva_loans_data)
# Trailing semicolon keeps the notebook from echoing the returned Text object.
gender_interval_ax.set_title('sex borrowers with repayment_intervals');
Most men make one-time repayment
12. Loans in different countries
# World choropleth of the mean funded amount per country.
# The column is selected *before* aggregating so that only funded_amount is
# averaged; a frame-wide groupby().mean() also tries to average the text
# columns, which raises in pandas >= 2.0.
countries_funded_amount = (
    kiva_loans_data.groupby('country')['funded_amount']
    .mean()
    .sort_values(ascending=False)
)
print("Top Countries with funded_amount(Dollar value of loan funded on Kiva.org)(Mean values)\n",countries_funded_amount.head(10))

data = [dict(
    type='choropleth',
    locations=countries_funded_amount.index,
    locationmode='country names',
    z=countries_funded_amount.values,
    text=countries_funded_amount.index,
    # 'Reds' is the built-in sequential red scale ('Red' is not a known
    # colorscale name and was ignored by the renderer).
    colorscale='Reds',
    marker=dict(line=dict(width=0.7)),
    # The obsolete 'autotick' attribute is dropped; ticks are chosen
    # automatically.
    colorbar=dict(tickprefix='', title='Top Countries with funded_amount(Mean value)'),
)]
layout = dict(
    title='Top Countries with funded_amount(Dollar value of loan funded on Kiva.org)',
    geo=dict(
        showframe=False,
        # Projection type names are lowercase in plotly.
        projection=dict(type='mercator'),
    ),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False)
13. Average Loan Situation in Various Situations
14. Which countries are more eye-catching in the data set?
from wordcloud import WordCloud

# Word cloud over the country column: every loan contributes its country name
# once, so frequently occurring names are drawn larger.
names = kiva_loans_data["country"].dropna()
country_text = ' '.join(names)
cloud_builder = WordCloud(max_font_size=50, width=600, height=300)
wordcloud = cloud_builder.generate(country_text)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.title("Wordcloud for country Names", fontsize=35)
plt.show()
15. Changes in repayment methods over time
# Point plot of loan_amount per calendar month, one line per repayment interval.
# Parse the date column in place and derive a monthly period column from it;
# both columns stay on kiva_loans_data.
parsed_dates = pd.to_datetime(kiva_loans_data['date'])
kiva_loans_data['date'] = parsed_dates
kiva_loans_data['date_month_year'] = kiva_loans_data['date'].dt.to_period("M")

plt.figure(figsize=(8, 10))
g1 = sns.pointplot(
    x='date_month_year',
    y='loan_amount',
    hue='repayment_interval',
    data=kiva_loans_data,
)
# Month labels are long; turn them vertical so they do not overlap.
month_labels = g1.get_xticklabels()
g1.set_xticklabels(month_labels, rotation=90)
g1.set_title("Mean Loan by Month Year", fontsize=15)
g1.set_xlabel("")
g1.set_ylabel("Loan Amount", fontsize=12)
plt.show()
There are many ways to repay in one go
14. Changes in the loan situation of different countries over time
# Heat map of the mean loan amount per country (rows) and year (columns).
# Relies on kiva_loans_data['date'] already being a datetime column.
# The 'Century' column actually stores the calendar year of each loan.
kiva_loans_data['Century'] = kiva_loans_data['date'].dt.year

mean_by_country_year = kiva_loans_data.groupby(['country', 'Century'])['loan_amount'].mean()
loan = (
    mean_by_country_year
    .unstack()
    # Order countries by their 2017 mean; countries without 2017 data sort last.
    .sort_values([2017], ascending=False)
    # Country/year combinations without loans are shown as 0.
    .fillna(0)
)

f, ax = plt.subplots(figsize=(15, 20))
temp = sns.heatmap(loan, cmap='Reds')
plt.show()
Some countries have relatively large differences in borrowing over the years, which may be caused by factors such as wars and natural disasters.
15. Comparison of different types of repayment methods
# Contingency table: number of loans for every (sector, repayment_interval)
# pair, shaded with a light-to-red gradient.
sector_repayment = ['sector', 'repayment_interval']
# Use a dedicated name for the colormap so the `cm` module imported from
# matplotlib at the top of the file is not overwritten.
red_gradient = sns.light_palette("red", as_cmap=True)
pd.crosstab(kiva_loans_data[sector_repayment[0]], kiva_loans_data[sector_repayment[1]]).style.background_gradient(cmap=red_gradient)
16. The difference between the loan amount and the approved amount
# Weekly totals of requested (loan_amount) versus funded (funded_amount) money.
# Parse posted_time once and reuse it for both the index and the axis limits.
posted_time = pd.to_datetime(kiva_loans_data['posted_time'])
# resample() needs a datetime index; note that this replaces the index of
# kiva_loans_data itself.
kiva_loans_data.index = posted_time

plt.figure(figsize=(12, 8))
# 'W' is the documented weekly alias (the lowercase form is deprecated in
# recent pandas releases).
ax = kiva_loans_data['loan_amount'].resample('W').sum().plot()
# Draw the second series explicitly on the same axes instead of relying on the
# implicit "current axes".
kiva_loans_data['funded_amount'].resample('W').sum().plot(ax=ax)
ax.set_ylabel('Amount ($)')
ax.set_xlabel('month-year')
# Limits are taken from the parsed timestamps, not from the raw strings.
ax.set_xlim((posted_time.min(), posted_time.max()))
ax.legend(["loan amount", "funded amount"])
plt.title('Trend of loan amount V.S. funded amount')
plt.show()
17. Target individual regions
# Pie chart of the 13 most frequent loan uses among loans whose country is India.
loan_use_in_india = kiva_loans_data['use'][kiva_loans_data['country'] == 'India']
# Share of each use in percent of all Indian loans (rows with a missing use
# are included in the denominator), rounded to two decimals.
percentages = (
    (loan_use_in_india.value_counts() / len(loan_use_in_india) * 100)
    .round(2)
    .head(13)
)
trace = go.Pie(labels=percentages.index, values=percentages.values, hoverinfo='label+percent',
               textfont=dict(size=18, color='#000000'))
data = [trace]
# The title font is set through title.font; the separate top-level `titlefont`
# attribute is deprecated and rejected by current plotly releases.
layout = go.Layout(width=800, height=800,
                   title=dict(text='Top 13 loan uses in India', font=dict(size=20)),
                   legend=dict(x=0.1, y=-0.7))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)
The most common loan use in India is buying a smokeless stove, followed by expanding a tailoring business by buying fabric and a sewing machine.
18. The 7 regions with the most loans
# Plot the top 7 funded regions on a map of India.  Each region is drawn as a
# circle whose size and colour follow its amount.
# assumes top7_cities (built elsewhere) is a DataFrame with the columns
# 'lon', 'lat', 'amount' and 'region' - TODO confirm
plt.subplots(figsize=(20, 15))
# Named india_map rather than `map` so the builtin map() is not shadowed for
# the rest of the notebook.
india_map = Basemap(width=4500000, height=900000, projection='lcc', resolution='l',
                    llcrnrlon=67, llcrnrlat=5, urcrnrlon=99, urcrnrlat=37, lat_0=28, lon_0=77)
india_map.drawmapboundary()
india_map.drawcountries()
india_map.drawcoastlines()

lons = array(top7_cities['lon'])
lats = array(top7_cities['lat'])
region_names = array(top7_cities['region'])
# Calling the Basemap instance projects lon/lat into map coordinates.
x, y = india_map(lons, lats)

# Scale the amounts down to usable marker sizes (truncated to int).
population_sizes = (top7_cities["amount"] / 3000).astype(int)
plt.scatter(x, y, s=population_sizes, marker="o", c=population_sizes, alpha=0.9)

# Label each circle, offset slightly up and to the right of the marker.
for region_name, xpt, ypt in zip(region_names, x, y):
    plt.text(xpt + 60000, ypt + 30000, region_name, fontsize=20, fontweight='bold')
plt.title('Top 7 funded regions in India', fontsize=30)
19. Poverty index
# Scatter map of the MPI locations table: one marker per row, coloured by its
# Multi-dimensional Poverty Index (MPI) value.
data = [dict(
    type='scattergeo',
    lat=kiva_mpi_locations_data['lat'],
    lon=kiva_mpi_locations_data['lon'],
    text=kiva_mpi_locations_data['LocationName'],
    marker=dict(
        size=10,
        line=dict(
            width=1,
            # Three components make this an rgb() colour; the previous
            # 'rgba(...)' spelling lacked the alpha value and is not a valid
            # CSS colour.
            color='rgb(102, 102, 102)',
        ),
        # Colour range runs from 0 to the largest MPI in the table.
        cmin=0,
        color=kiva_mpi_locations_data['MPI'],
        cmax=kiva_mpi_locations_data['MPI'].max(),
        colorbar=dict(
            # Spelling fixed ("dimenstional" -> "dimensional") to match the
            # figure title.
            title="Multi-dimensional Poverty Index"
        ),
    ),
)]
layout = dict(title='Multi-dimensional Poverty Index for different regions')
fig = dict(data=data, layout=layout)
py.iplot(fig)
19. Human Development Index
# World choropleth of the Human Development Index (the 'hdi' column of
# country_stats_data), matched to the map by country name.
data = [dict(
    type='choropleth',
    locations=country_stats_data['country_name'],
    locationmode='country names',
    z=country_stats_data['hdi'],
    text=country_stats_data['country_name'],
    # 'Reds' is the built-in sequential red scale ('Red' is not a known
    # colorscale name and was ignored by the renderer).
    colorscale='Reds',
    marker=dict(line=dict(width=0.7)),
    # The obsolete 'autotick' attribute is dropped; ticks are chosen
    # automatically.
    colorbar=dict(tickprefix='', title='Human Development Index(HDI)'),
)]
layout = dict(title='Human Development Index(HDI) for different countries',)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False)
20. Comparison of poverty in different countries
# World choropleth of the 'population_below_poverty_line' column of
# country_stats_data, matched to the map by country name.
data = [dict(
    type='choropleth',
    locations=country_stats_data['country_name'],
    locationmode='country names',
    z=country_stats_data['population_below_poverty_line'],
    text=country_stats_data['country_name'],
    # 'Reds' is the built-in sequential red scale ('Red' is not a known
    # colorscale name and was ignored by the renderer).
    colorscale='Reds',
    marker=dict(line=dict(width=0.7)),
    # The obsolete 'autotick' attribute is dropped; ticks are chosen
    # automatically.
    colorbar=dict(tickprefix='', title='population_below_poverty_line in %'),
)]
layout = dict(title='Population below poverty line for different countries in % ',)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False)