【分箱操作】决策树、卡方、分位数、等距和映射分箱操作代码实现

2023-11-24 13:55:08

1 数据准备
2 决策树分箱
3 分位数分箱
4 等距和等距log映射
5 卡方分箱

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
plt.rcParams["font.sans-serif"] = ["FangSong"] 
plt.rcParams["axes.unicode_minus"] = False 
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier

数据准备

使用x作为待分箱数据
y为分箱的目标标签

# Load the scikit-learn breast-cancer dataset and wrap its feature matrix
# in a DataFrame whose columns carry the feature names.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

# x is the feature that will be binned; y is the target label the bins
# are evaluated against.
variable = "mean radius"
x = df[variable].values
y = data.target

# Two-column working frame that the binning examples operate on.
test = pd.DataFrame({'x':x,'y':y})

test.head()

	x	y
0	17.99	0
1	20.57	0
2	19.69	0
3	11.42	0
4	20.29	0

# Kernel density estimate of the feature that is going to be binned.
sns.kdeplot(test.x)

图：test.x 的核密度估计曲线（原图缺失）

# Box plot of the feature to be binned. The series is passed as `x=` by
# keyword: newer seaborn releases accept only `data` positionally, and a
# bare positional Series would be interpreted as wide-form data instead.
sns.boxplot(x=test.x)

图：test.x 的箱线图（原图 output_10_1.png 转存失败，未能显示）

woe和iv的计算过程和理解可见：
https://blog.csdn.net/xiezhen_zheng/article/details/82888653

定义计算函数

def woe_iv(data, x_col='x', y_col='y'):
    """Compute the per-bin WOE and IV table for a binned variable.

    Args:
        data: DataFrame holding the bin label column and the 0/1 target.
        x_col: Name of the column containing the bin labels.
        y_col: Name of the target column; rows with value 1 are counted
            as "bad", the remaining rows of each bin as "good".

    Returns:
        DataFrame indexed by bin label with the columns ``total``,
        ``bad``, ``good``, ``p_bad``, ``p_good``, ``iv`` and ``woe``.
        A bin with no bad (or no good) rows yields an infinite WOE/IV,
        because the logarithm of a zero ratio is taken as-is.
    """
    # Group the frame that was passed in (not a notebook global) so the
    # function works on any DataFrame.
    rate_table = (
        data.groupby(x_col)[y_col]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
    )
    rate_table['good'] = rate_table['total'] - rate_table['bad']

    total_bad = rate_table['bad'].sum()
    total_good = rate_table['good'].sum()

    # Share of all bad / good rows that fall into each bin.
    rate_table['p_bad'] = rate_table['bad'] / total_bad
    rate_table['p_good'] = rate_table['good'] / total_good

    woe = np.log(rate_table['p_bad'] / rate_table['p_good'])
    iv = (rate_table['p_bad'] - rate_table['p_good']) * woe

    rate_table['iv'] = iv
    rate_table['woe'] = woe
    return rate_table

决策树分箱

def optimal_binning_boundary(data, x_col='x', y_col='y', criterion='gini',
                             max_leaf_nodes=6, min_samples_leaf=0.05) -> tuple:
    """Bin a numeric column at the split thresholds of a decision tree.

    A single-feature ``DecisionTreeClassifier`` is fitted on ``x_col``
    against ``y_col``; the thresholds of its split nodes, framed by the
    column's minimum and maximum, become the bin edges.

    Args:
        data: DataFrame containing the feature and the target columns.
        x_col: Name of the numeric column to bin.
        y_col: Name of the classification target column.
        criterion: Split criterion forwarded to the decision tree.
        max_leaf_nodes: Maximum number of leaves, forwarded to the tree.
        min_samples_leaf: Minimum samples per leaf, forwarded to the tree
            (the default 0.05 is passed through unchanged).

    Returns:
        A ``(boundary, data)`` tuple: ``boundary`` is the sorted list of
        bin edges, and ``data`` is the input frame, which is modified in
        place by adding an integer ``'bins_dtc'`` column.
    """
    x0 = data[x_col]
    x = x0.values
    y = data[y_col].values

    clf = DecisionTreeClassifier(
        criterion=criterion,
        max_leaf_nodes=max_leaf_nodes,
        min_samples_leaf=min_samples_leaf,
    )
    clf.fit(x.reshape(-1, 1), y)

    tree = clf.tree_
    # Keep only nodes whose left and right child ids differ; these are the
    # nodes that carry a split threshold (leaves share one sentinel id).
    is_split = tree.children_left != tree.children_right
    boundary = sorted(tree.threshold[is_split])

    # Frame the thresholds with the observed range so every row gets a bin.
    boundary = [x.min()] + boundary + [x.max()]

    data['bins_dtc'] = pd.cut(x0, bins=boundary, include_lowest=True, labels=False)
    return boundary, data

# Bin `test` with the decision-tree method: `boundary` receives the bin
# edges and `bins` the frame carrying the new 'bins_dtc' column.
boundary, bins = optimal_binning_boundary(test)

# WOE / IV table for the decision-tree bins.
woe_iv(bins,'bins_dtc')

	total	bad	good	p_bad	p_good	iv	woe
bins_dtc
0	150	147	3	0.411765	0.014151	1.340225	3.370671
1	115	105	10	0.294118	0.047170	0.451970	1.830226
2	49	39	10	0.109244	0.047170	0.052131	0.839827
3	83	55	28	0.154062	0.132075	0.003385	0.153979
4	54	10	44	0.028011	0.207547	0.359566	-2.002754
5	118	1	117	0.002801	0.551887	2.900997	-5.283323

分位数分箱

使用pd.qcut函数

# Quantile binning: cut the feature into 5 quantile-based bins;
# labels=False makes qcut return integer bin codes.
bins = pd.qcut(df[variable] ,5,labels=False,)

test['bins_q'] = bins

# WOE / IV table for the quantile bins.
woe_iv(test,'bins_q')

	total	bad	good	p_bad	p_good	iv	woe
bins_q
0	114	112	2	0.313725	0.009434	1.066299	3.504202
1	114	106	8	0.296919	0.037736	0.534655	2.062848
2	113	91	22	0.254902	0.103774	0.135814	0.898668
3	114	47	67	0.131653	0.316038	0.161465	-0.875695
4	114	1	113	0.002801	0.533019	2.782868	-5.248537

等距和等距log映射

np.floor_divide(a, b)：a 除以 b 后向下取整
np.floor(x)：对 x 向下取整

# Equal-width binning: floor-dividing by 6 gives every value the index
# of the width-6 interval it falls into.
bins = np.floor_divide(df[variable] ,6)

test['bins_divide'] = bins

# WOE / IV table for the equal-width bins.
woe_iv(test,'bins_divide')

	total	bad	good	p_bad	p_good	iv	woe
bins_divide
1.0	169	163	6	0.456583	0.028302	1.190981	2.780841
2.0	308	194	114	0.543417	0.537736	0.000060	0.010510
3.0	85	0	85	0.000000	0.400943	inf	-inf
4.0	7	0	7	0.000000	0.033019	inf	-inf

# Log-scale binning: the bin label is the floor of log2 of the value.
bins = np.floor(np.log2(df[variable]))
test['bins_log'] = bins 
# WOE / IV table for the log2 bins.
woe_iv(test,'bins_log')

	total	bad	good	p_bad	p_good	iv	woe
bins_log
2.0	4	4	0	0.011204	0.000000	inf	inf
3.0	424	347	77	0.971989	0.363208	0.599266	0.984370
4.0	141	6	135	0.016807	0.636792	2.253440	-3.634665

卡方分箱

卡方分箱的具体思路为：
https://mp.weixin.qq.com/s?__biz=MzA5Njc1MDA2Ng%3D%3D&idx=1&mid=2651650083&sn=a24381efa404500ae96ccfcc3716a614

def Chi2(df, total_col, bad_col, overallRate):
    """Return the chi-square statistic of observed vs. expected bad counts.

    Args:
        df: DataFrame with one row per value/group.
        total_col: Column holding the total row count of each group.
        bad_col: Column holding the bad row count of each group.
        overallRate: Overall bad rate; the expected bad count of a group
            is its total count multiplied by this rate.

    Returns:
        The sum over all groups of (expected - observed)^2 / expected.
    """
    expected_counts = [total * overallRate for total in df[total_col]]

    chi2 = 0
    for expected, observed in zip(expected_counts, df[bad_col]):
        chi2 += (expected - observed) ** 2 / expected
    return chi2

def chiMerge(data, x_col='x', y_col='y', max_bins=5):
    """Bin a numeric column by bottom-up chi-square merging.

    Every distinct value of ``x_col`` starts as its own group. While there
    are more than ``max_bins`` groups, the adjacent pair with the smallest
    summed chi-square value is merged and the merged group's chi-square is
    recomputed from its pooled observed/expected bad counts.

    Args:
        data: DataFrame containing the feature and the 0/1 target.
        x_col: Name of the numeric column to bin.
        y_col: Name of the target column (its mean per group is used as
            the group's bad rate).
        max_bins: Maximum number of groups to keep; must be at least 1.

    Returns:
        A ``(bins, interval_)`` tuple. ``bins`` is a Series aligned with
        ``data`` holding the 0-based index of the merged group each row
        belongs to (NaN where ``x_col`` is missing). ``interval_`` lists
        the smallest value of every merged group followed by the column
        maximum.

    Raises:
        ValueError: If ``max_bins`` is smaller than 1.
    """
    if max_bins < 1:
        raise ValueError("max_bins must be at least 1")

    gro = data.groupby(x_col)[y_col].agg(['mean', 'count'])
    gro['bad'] = gro['count'] * gro['mean']
    total_rate_bad = gro['bad'].sum() / gro['count'].sum()
    gro['exp_bad'] = gro['count'] * total_rate_bad
    gro['chi2'] = (gro['exp_bad'] - gro['bad']) ** 2 / gro['exp_bad']

    # Row k holds [observed bad, expected bad] of group k.
    chi_c = gro[['bad', 'exp_bad']].to_numpy(dtype=float)
    interval = [[value] for value in gro.index]
    chi2 = list(gro['chi2'].values)

    while len(interval) > max_bins:
        # Score each adjacent pair by the sum of the two groups' chi-square
        # values and merge the first pair with the lowest score.
        between_sum = [left + right for left, right in zip(chi2, chi2[1:])]
        k = between_sum.index(min(between_sum))

        absorbed = interval.pop(k + 1)
        interval[k] = interval[k] + absorbed

        chi_c[k] = chi_c[k] + chi_c[k + 1]
        chi_c = np.delete(chi_c, k + 1, 0)

        observed, expected = chi_c[k]
        chi2[k] = (observed - expected) ** 2 / expected
        chi2.pop(k + 1)

    x = data[x_col]
    lower_bounds = [min(group) for group in interval]
    interval_ = lower_bounds + [x.max()]

    # Assign every row to the group whose lower bound is the largest one
    # not exceeding its value. Unlike right-closed pd.cut on these edges,
    # this keeps a group's smallest value inside that group and does not
    # fail when the last group consists of the maximum value alone.
    codes = np.searchsorted(np.asarray(lower_bounds), x.to_numpy(), side='right') - 1
    bins = pd.Series(codes, index=x.index, name=x.name)

    missing = x.isna()
    if missing.any():
        bins = bins.where(~missing)
    return bins, interval_

# Chi-merge binning with the default settings; the call returns a
# (bin codes, bin edges) pair.
inter = chiMerge(test)

# Keep only the per-row bin codes.
bins = inter[0]

test['bins_kafang'] = bins

# WOE / IV table for the chi-merge bins.
woe_iv(test,'bins_kafang')

	total	bad	good	p_bad	p_good	iv	woe
bins_kafang
0	425	351	74	0.983193	0.349057	0.656694	1.035572
1	55	6	49	0.016807	0.231132	0.561792	-2.621210
2	29	0	29	0.000000	0.136792	inf	-inf
3	31	0	31	0.000000	0.146226	inf	-inf
4	29	0	29	0.000000	0.136792	inf	-inf

本文来自互联网用户投稿，文章观点仅代表作者本人，不代表本站立场，不承担相关法律责任。如若转载，请注明出处。 如若内容造成侵权/违法违规/事实不符，请点击【内容举报】进行投诉反馈！

标签：技术

上一篇 > Android消息机制 Handler，Looper，MessageQueue
下一篇 > Android广播机制-Broadcast Receiver

Duilib中list控件支持ctrl和shif多行选中的实现

[ICML2015]Batch Normalization:Accelerating Deep Network Training by Reducing Internal Covariate Shif

win10系统微软输入法于eclipse ctrl+shif+f冲突间接处理办法

Codeforces Round #259 (Div. 2) B. Little Pony and Sort by Shif

读LDD3，内存映射与DMA--PAGE_SHIF…

VMware虚拟机安装XP【要先分区，再设置BOOT 启动CD，shif+上移】

更换iBus五笔的左与右Shif

sublime ctrl+shif+f 没用解决办法

idea 对 ctrl + z 的撤销是 ctrl + shif + z

计算机最早的设计师应用于,计算机应用基础选择题doc.doc

win10自带截图神器：Win+Shift+S

Python基础之文件目录操作

python简述目录_Python基础之文件目录操作(示例代码)

tp5 如何做数据采集

任务2-7(服务器字体+阿里巴巴矢量库)

html标签（1)：h1~h6,p,br,pre,hr

TI 电量计介绍与芯片选型指南

几款TI电源芯片简介

TI DSP芯片C2000系列读取FLASH数据

德州仪器(Ti)平台嵌入式开发基础

TI三相电机智能栅极驱动芯片特点分类

省选模拟（12.08） T3 圈圈圈圈圈圈圈圈

Hadoop生态圈技术栈（上）

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之6.Impala交互式查询

小猿圈之Linux下Mysql 操作命令

大数据Hadoop生态圈常用面试题

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之4.Hive DDL、DQL和数据操作

备战Noip2018模拟赛11（B组）T3 Monogatari 物语

【智能优化算法-圆圈搜索算法】基于圆圈搜索算法Circle Search Algorithm求解单目标优化问题附matlab代码

NYOJ 78 圈水池

递归问题跑道汽车绕圈问题 Python实现

Hadoop生态圈（三）：MapReduce