# -*- coding: utf-8 -*-
"""
Created on
@author:
"""
import pandas as pd
import numpy as np
# Tab-separated iris file; empty strings and '?' are treated as missing values.
data = pd.read_csv('iris.csv', na_values=['', '?'], sep="\t")
# Keep only the variable to bin ('x') and the 0/1 label ('y').
temp = data[['x', 'y']]
# Chi-square (ChiMerge) binning: adjacent intervals with the smallest
# chi-square are merged bottom-up; merging stops once the interval count is
# within `bin` AND every remaining pair exceeds the confidence threshold.
def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10, sample=None):
    """Supervised bottom-up chi-square binning of one variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Contains the column to bin and a 0/1 label column
        (positive samples are 1, negative samples are 0).
    variable : str
        Name of the column to bin.
    flag : str
        Name of the 0/1 label column.
    confidenceVal : float
        Chi-square stopping threshold (3.841 = 95% confidence, 1 dof).
    bin : int
        Maximum number of bins to keep (name kept for backward
        compatibility even though it shadows the builtin).
    sample : int or None
        If given, work on a random sample of this many rows (large inputs
        are slow); None (default) disables sampling.

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns: variable, interval,
        flag_0 (negative count), flag_1 (positive count).
    """
    # Optional sub-sampling for very large inputs.
    if sample is not None:
        df = df.sample(n=sample)

    def _pair_chi2(a, b):
        # Closed-form chi-square (1 dof, no continuity correction) of two
        # adjacent rows shaped [value, positive_count, negative_count].
        return (a[1] * b[2] - a[2] * b[1]) ** 2 \
            * (a[1] + a[2] + b[1] + b[2]) / \
            ((a[1] + a[2]) * (b[1] + b[2]) * (a[1] + b[1]) * (a[2] + b[2]))

    # --- Per-value contingency counts ----------------------------------
    total_num = df.groupby([variable])[flag].count()     # rows per value
    total_num = pd.DataFrame({'total_num': total_num})
    positive_class = df.groupby([variable])[flag].sum()  # positives per value
    positive_class = pd.DataFrame({'positive_class': positive_class})
    regroup = pd.merge(total_num, positive_class,
                       left_index=True, right_index=True, how='inner')
    regroup.reset_index(inplace=True)
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
    regroup = regroup.drop('total_num', axis=1)
    # Columns of np_regroup: [value, positive_class, negative_class].
    np_regroup = np.array(regroup)  # numpy array for speed
    print('已完成数据读入,正在计算数据初处理')

    # --- Pre-merge runs with no positives (or no negatives) ------------
    # Otherwise the chi-square denominator of such a pair would be zero.
    i = 0
    while i <= np_regroup.shape[0] - 2:
        if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or
                (np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
            np_regroup[i, 1] += np_regroup[i + 1, 1]     # positives
            np_regroup[i, 2] += np_regroup[i + 1, 2]     # negatives
            np_regroup[i, 0] = np_regroup[i + 1, 0]      # keep upper edge
            np_regroup = np.delete(np_regroup, i + 1, 0)
            i -= 1                                       # re-check this row
        i += 1

    # --- Chi-square of every adjacent pair ------------------------------
    chi_table = np.array([_pair_chi2(np_regroup[i], np_regroup[i + 1])
                          for i in range(np_regroup.shape[0] - 1)])
    print('已完成数据初处理,正在进行卡方分箱核心操作')

    # --- Core: repeatedly merge the pair with the smallest chi-square ---
    while True:
        # Guard: a single remaining interval has no pairs left to merge.
        # (The original called min() on an empty array here and crashed.)
        if chi_table.size == 0:
            break
        if len(chi_table) <= (bin - 1) and chi_table.min() >= confidenceVal:
            break
        k = int(np.argmin(chi_table))  # first pair with the smallest chi2
        # Merge rows k and k+1.
        np_regroup[k, 1] += np_regroup[k + 1, 1]
        np_regroup[k, 2] += np_regroup[k + 1, 2]
        np_regroup[k, 0] = np_regroup[k + 1, 0]
        np_regroup = np.delete(np_regroup, k + 1, 0)
        chi_table = np.delete(chi_table, k, axis=0)
        # Refresh the chi2 of the pairs that involve the merged row.
        # (The original updated chi_table[k - 1] unconditionally, which
        # silently clobbered the LAST entry whenever k == 0.)
        if k > 0:
            chi_table[k - 1] = _pair_chi2(np_regroup[k - 1], np_regroup[k])
        if k < np_regroup.shape[0] - 1:
            chi_table[k] = _pair_chi2(np_regroup[k], np_regroup[k + 1])
    print('已完成卡方分箱核心操作,正在保存结果')

    # --- Assemble the result table --------------------------------------
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * np_regroup.shape[0]  # variable name
    list_temp = []
    for i in range(np_regroup.shape[0]):
        if i == 0:
            x = '0' + ',' + str(np_regroup[i, 0])               # first bin
        elif i == np_regroup.shape[0] - 1:
            x = str(np_regroup[i - 1, 0]) + '+'                 # open-ended last bin
        else:
            x = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
        list_temp.append(x)
    result_data['interval'] = list_temp       # bin interval labels
    result_data['flag_0'] = np_regroup[:, 2]  # negative counts
    result_data['flag_1'] = np_regroup[:, 1]  # positive counts
    return result_data
# Example invocation: chi-square-bin column 'x' of `temp`, using 'y' as the 0/1 label.
bins = ChiMerge(temp, 'x', 'y', confidenceVal=3.841, bin=10, sample=None)
bins  # notebook-style display of the resulting bin table
# -*- coding: utf-8 -*-
"""
Created on
@author:
"""
#ChiMerge 是监督的、自底向上的(即基于合并的)数据离散化方法。它依赖于卡方分析:具有最小卡方值的相邻区间合并在一起,直到满足确定的停止准则。
#ChiMerge算法包括2部分:
# 1、初始化;
# 2、自底向上合并,当满足停止条件的时候,区间合并停止。
import math
import numpy as np
import pandas as pd
# Load the raw iris data (no header row) and name its five columns.
iris = pd.read_csv('iris.csv', header=None)
iris.columns = ['sepal_length', 'sepal_width', 'petal_length',
                'petal_width', 'target_class']
# Merge the interval(s) carrying the minimum chi-square into their predecessor.
def merge_rows(df, feature):
    """Merge every interval whose predecessor has the minimum chi2.

    df : contingency table with columns [feature, 'Iris-setosa',
         'Iris-versicolor', 'Iris-virginica', 'chi2']; the last row's chi2
         is a placeholder and is excluded when finding the minimum.
    feature : name of the interval/value column.
    Returns a new table with merged class counts and chi2 reset to 0.
    """
    tdf = df[:-1]  # last row has no "next" interval, so no meaningful chi2
    min_chi2 = sorted(set(tdf['chi2']), reverse=False)[0]  # smallest pair chi2
    col_names = [feature, 'Iris-setosa', 'Iris-versicolor',
                 'Iris-virginica', 'chi2']
    updated_df = pd.DataFrame(columns=col_names)  # rebuilt, merged table
    updated_df_index = 0
    for index, row in df.iterrows():
        if index == 0:
            updated_df.loc[len(updated_df)] = df.loc[index]
            updated_df_index += 1
        elif df.loc[index - 1]['chi2'] == min_chi2:
            # Fold this interval's class counts into the previous kept row.
            # FIX: use .loc[row, col]; the original chained
            # .loc[i]['col'] += ... writes into a temporary row copy and
            # silently drops the update on modern pandas.
            for cls in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):
                updated_df.loc[updated_df_index - 1, cls] += df.loc[index][cls]
        else:
            updated_df.loc[len(updated_df)] = df.loc[index]
            updated_df_index += 1
    updated_df['chi2'] = 0.  # stale values; recomputed by the caller
    return updated_df
# Chi-square statistic of a 2-D contingency array.
def calc_chi2(array):
    """Return sum over cells of (observed - expected)^2 / expected.

    Expected counts come from the row/column marginals; cells whose
    expected count is zero contribute nothing (avoids NaN).
    """
    n_rows, n_cols = array.shape
    grand_total = float(array.sum())
    # Row and column marginal totals.
    row_totals = {r: array[r].sum() for r in range(n_rows)}
    col_totals = {c: array[:, c].sum() for c in range(n_cols)}
    stat = 0
    # Standard chi-square accumulation, cell by cell.
    for r in range(n_rows):
        for c in range(n_cols):
            expected = row_totals[r] * col_totals[c] / grand_total
            observed = array[r, c]
            if expected == 0.:
                stat += 0.  # skip empty-marginal cells (would be NaN)
            else:
                stat += math.pow((observed - expected), 2) / float(expected)
    return stat
# Recompute the chi2 of every adjacent interval pair.
def update_chi2_column(contingency_table, feature):
    """Store calc_chi2() of each row and its successor in 'chi2'.

    For every row except the last, builds a 2x3 array from the class
    counts of the row and the next row and writes the statistic into the
    row's 'chi2' column. Mutates and returns `contingency_table`.
    """
    class_cols = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
    for index, row in contingency_table.iterrows():
        if index != contingency_table.shape[0] - 1:
            upper = [contingency_table.loc[index][c] for c in class_cols]
            lower = [contingency_table.loc[index + 1][c] for c in class_cols]
            prep_chi2 = np.array([np.array(upper), np.array(lower)])
            c2 = calc_chi2(prep_chi2)
            # FIX: .loc[row, col] writes through; the original chained
            # .loc[index]['chi2'] = c2 can assign to a temporary copy and
            # leave the table unchanged on modern pandas.
            contingency_table.loc[index, 'chi2'] = c2
    return contingency_table
# Build the initial frequency table: one interval per distinct feature value.
def create_contingency_table(dataframe, feature):
    """Return a table of per-value class counts for `feature`.

    Columns: [feature, 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
    'chi2']; the chi2 column starts at 0 and is filled in later.
    """
    distinct_values = sorted(set(dataframe[feature]))
    col_names = [feature, 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'chi2']
    my_contingency = pd.DataFrame(columns=col_names)
    # One row per distinct value, counting each target class.
    for value in distinct_values:
        subset = dataframe.loc[dataframe[feature] == value]
        counts = subset["target_class"].value_counts().to_dict()
        my_contingency.loc[len(my_contingency)] = [
            value,
            counts.get('Iris-setosa', 0),
            counts.get('Iris-versicolor', 0),
            counts.get('Iris-virginica', 0),
            0,  # chi2 placeholder
        ]
    return my_contingency
#ChiMerge
def chimerge(feature, data, max_interval):
    """Run ChiMerge on one feature and print split points and intervals.

    feature : column name to discretize.
    data : DataFrame containing `feature` and a 'target_class' column.
    max_interval : stop merging once at most this many intervals remain.
    Returns None; results are printed.
    """
    df = data.sort_values(by=[feature], ascending=True).reset_index()
    # Initial frequency table: one interval per distinct value.
    contingency_table = create_contingency_table(df, feature)
    num_intervals = contingency_table.shape[0]
    # Merge the pair(s) with the smallest chi2 until few enough intervals.
    while num_intervals > max_interval:
        chi2_df = update_chi2_column(contingency_table, feature)
        contingency_table = merge_rows(chi2_df, feature)
        num_intervals = contingency_table.shape[0]
    # Report the split points (lower edge of each interval).
    print('The split points for '+feature+' are:')
    for index, row in contingency_table.iterrows():
        print(contingency_table.loc[index][feature])
    print('The final intervals for '+feature+' are:')
    for index, row in contingency_table.iterrows():
        if index != contingency_table.shape[0]-1:
            # Upper bound = largest observed value below the next split.
            # FIX: the original's `else` clause reset the bound to the
            # global maximum whenever any later (sorted) row reached the
            # next split, so every non-final interval printed the overall
            # max as its upper edge.
            next_split = contingency_table.loc[index+1][feature]
            temp = df[feature].iloc[-1]  # fallback, as in the original
            for value in df[feature]:
                if value < next_split:
                    temp = value
            print("["+str(contingency_table.loc[index][feature])+","+str(temp)+"]")
    print(" ")
if __name__ == '__main__':
    # Discretize each numeric iris feature into at most 6 intervals.
    numeric_features = ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    for feature in numeric_features:
        chimerge(feature=feature, data=iris, max_interval=6)