第一步,X、Y 的基础处理(特征变量与目标变量)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Target column and the feature columns analysed against it.
target_column = ['SeriousDlqin2yrs']
feature_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
# Aliases of the same list object; both names are kept because later code
# refers to `features`.
x_list = feature_columns
features = x_list

df = pd.read_csv("cs-training.csv")

# Cast every feature and the target to float so all columns share one
# numeric dtype before binning and aggregation.
for column in x_list + target_column:
    df[column] = df[column].astype(float)

# Nothing in this section appends to this list.
# NOTE(review): presumably reserved for information-value results — confirm.
iv_values = []

# Expose the target under the short name 'Y', which the example call passes
# to plot_feature_analysis.
df['Y'] = df['SeriousDlqin2yrs']
第二步,我们封装一个计算KS的函数
def calculate_ks(y_true, y_score):
    """Return the KS statistic separating bad (1) from good (0) samples.

    Samples are ranked by descending score, and the cumulative share of bad
    and of good samples is compared at every distinct score value. The KS
    statistic is the largest absolute gap between the two cumulative shares.

    Tied scores are treated as a single threshold. Measuring the gap inside
    a run of ties would make the result depend on the arbitrary order of
    rows sharing a score, and would overstate KS for coarse scores such as
    per-bin bad rates.

    Args:
        y_true: Sequence of 0/1 labels, where 1 marks a bad sample.
        y_score: Sequence of scores, same length as ``y_true``; a higher
            score is ranked first.

    Returns:
        The KS value as a float, or NaN when the input is empty or contains
        only one class (the statistic is undefined in that case).
    """
    frame = pd.DataFrame({'y': y_true, 'score': y_score})
    n_bad = frame['y'].sum()
    n_good = len(frame) - n_bad
    if len(frame) == 0 or n_bad == 0 or n_good == 0:
        return float('nan')

    # One row per distinct score; dropna=False keeps rows whose score is NaN
    # so the cumulative shares still reach 1.
    per_score = (
        frame.groupby('score', dropna=False)['y']
        .agg(['sum', 'size'])
        .sort_index(ascending=False)
    )
    cum_bad = per_score['sum'].cumsum() / n_bad
    cum_good = (per_score['size'] - per_score['sum']).cumsum() / n_good
    return (cum_bad - cum_good).abs().max()
第三步,我们定义信贷特征分析可视化函数 plot_feature_analysis(外置指标面板完整版)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def plot_feature_analysis(df, features, Y, n_bins=10, figsize=(20, 100)):
    """Draw one bin-count / bad-rate chart per feature with a metrics panel.

    Each feature gets its own subplot row: bars show the sample count per
    bin, a dashed line on a secondary axis shows the bad rate per bin, and a
    text panel to the right of the axes lists the bin count, coverage, AUC,
    KS and overall bad rate. A feature whose analysis raises is reported via
    ``print`` and its axes are hidden; the remaining features are still
    drawn.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features: List of feature column names to analyse.
        Y: Name of the target column. It is treated as a 0/1 bad flag: its
            mean is reported as the bad rate and its sum as the bad count.
        n_bins: Number of bins requested (default 10). Fewer bins may result
            because duplicate quantile edges are dropped.
        figsize: Figure size as ``(width, height)``.

    Returns:
        None. The figure is created through ``plt.subplots``; the caller
        displays it (for example with ``plt.show()``).
    """
    n_features = len(features)
    fig, axes = plt.subplots(
        n_features, 1, figsize=figsize, constrained_layout=False, squeeze=False
    )
    # Keep the right 30% of the canvas free for the metrics panels.
    fig.subplots_adjust(right=0.7)

    # Global style settings.
    # NOTE(review): this runs after plt.subplots, so 'figure.dpi' presumably
    # only affects figures created later — confirm that this is intended.
    plt.rcParams.update({
        'font.sans-serif': 'SimHei',
        'axes.unicode_minus': False,
        'axes.titlesize': 20,
        'axes.titlepad': 20,
        'figure.dpi': 400,
    })

    for feature, ax in zip(features, axes.flat):
        ax2 = ax.twinx()  # secondary y-axis for the bad rate
        try:
            # === Data cleaning: drop rows missing the feature or target ===
            df_clean = df[[feature, Y]].dropna()
            coverage = len(df_clean) / len(df)
            y = df_clean[Y]
            overall_bad_rate = y.mean()

            # === Binning: quantile bins first, equal-width as fallback ===
            ser = df_clean[feature]
            try:
                bins = pd.qcut(ser, q=n_bins, duplicates='drop')
            except Exception:
                # Deliberate best-effort: any qcut failure falls back to
                # equal-width bins instead of abandoning the feature.
                bins = pd.cut(ser, bins=n_bins, include_lowest=True)
            bin_categories = bins.cat.categories

            # === Bin labels ===
            bin_labels = []
            for interval in bin_categories:
                if pd.isnull(interval):
                    bin_labels.append('Missing')
                else:
                    left = round(interval.left, 2)
                    right = round(interval.right, 2)
                    bin_labels.append(f"{left}-{right}")

            # === Per-bin statistics, one row per bin in category order ===
            grouped = (
                df_clean.assign(bin=bins)
                .groupby('bin', observed=False)
                .agg(count=(Y, 'count'), bad=(Y, 'sum'))
                .reindex(bin_categories)  # make sure every bin is present
                .fillna({'count': 0, 'bad': 0})
                .assign(
                    bad_rate=lambda x: x['bad'] / x['count'].replace(0, np.nan),
                    lift=lambda x: x['bad_rate'] / overall_bad_rate,
                )
                .reset_index()
                # Only these two columns can still hold NaN (empty bins or a
                # zero overall bad rate); the interval-valued bin column is
                # left untouched.
                .fillna({'bad_rate': 0, 'lift': 0})
            )

            # Integer x positions keep every bin as its own bar even when two
            # rounded labels happen to be identical.
            positions = np.arange(len(bin_labels))

            # === Bars: sample count per bin ===
            bars = ax.bar(
                positions,
                grouped['count'],
                width=0.8,
                alpha=0.7,
                color='#1f77b4',
                label='样本量',
            )

            # === Line: bad rate per bin ===
            ax2.plot(
                positions,
                grouped['bad_rate'],
                color='#d62728',
                marker='o',
                markersize=20,
                linewidth=3,
                linestyle='--',
                label='逾期率',
            )

            # === Value annotations ===
            for rect, br, lift, count, bad in zip(
                bars,
                grouped['bad_rate'],
                grouped['lift'],
                grouped['count'],
                grouped['bad'],
            ):
                x_center = rect.get_x() + rect.get_width() / 2
                # Sample and bad counts inside the bar.
                ax.text(
                    x_center,
                    rect.get_height() * 0.6,
                    f"All: {count:,}\nBad: {bad:,}",
                    ha='center',
                    va='center',
                    color='white',
                    fontsize=13,
                    fontweight='bold',
                    linespacing=1.2,
                )
                # Bad rate just above the line marker.
                ax2.text(
                    x_center,
                    br + 0.005,
                    f'{br:.2%}',
                    ha='center',
                    va='bottom',
                    color='#d62728',
                    fontsize=18,
                    fontweight='bold',
                )
                # Lift near the top of the bar.
                ax.text(
                    x_center,
                    rect.get_height() * 0.95,
                    f'Lift: {lift:.2f}',
                    ha='center',
                    va='top',
                    color='#FFFF00',
                    fontsize=15,
                    fontweight='bold',
                )

            # === Metrics: score each row with the bad rate of its own bin ===
            # bins.cat.codes indexes bins in category order, which is the row
            # order of `grouped`. pd.factorize numbers bins by first
            # appearance and would pair rows with the wrong bin's bad rate.
            bin_codes = bins.cat.codes.to_numpy()
            y_score = grouped['bad_rate'].to_numpy()[bin_codes]
            auc = roc_auc_score(y, y_score)
            ks = calculate_ks(y, y_score)

            # === Y axes ===
            ax.set_ylabel('样本量', color='#1f77b4', fontsize=20)
            ax2.set_ylabel('逾期率', color='#d62728', fontsize=20)
            ax.tick_params(axis='y', colors='#1f77b4', labelsize=14)
            ax2.tick_params(axis='y', colors='#d62728', labelsize=14)

            # === X axis ===
            ax.set_xticks(positions)
            ax.set_xticklabels(bin_labels, rotation=45, ha='right', fontsize=16)

            # === External metrics panel (figure coordinates) ===
            ax_bbox = ax.get_position()
            panel_x = ax_bbox.x1 + 0.03  # 3% to the right of the axes
            panel_y = ax_bbox.y0 + ax_bbox.height * 0.6  # slightly above centre
            fig.text(
                x=panel_x,
                y=panel_y,
                s=f"特征分析指标\n━━━━━━━━━━━━\n"
                  f"特征名称: {feature}\n"
                  f"分箱数量: {len(bin_labels)}\n"
                  f"特征覆盖率: {coverage:.2%}\n"
                  f"AUC: {auc:.3f}\n"
                  f"KS值: {ks:.3f}\n"
                  f"全局逾期率: {overall_bad_rate:.2%}",
                fontsize=15,
                linespacing=1.8,
                va='top',
                ha='left',
                fontfamily='SimHei',
                bbox=dict(
                    boxstyle='round',
                    facecolor='#f8f9fa',
                    edgecolor='#ced4da',
                    alpha=0.95,
                    pad=0.8,
                ),
            )

            # === Grid lines ===
            ax.grid(True, axis='y', linestyle=':', alpha=0.7)
            ax2.grid(True, axis='y', linestyle=':', alpha=0.3)

        except Exception as e:
            # Best-effort per feature: report the failure and hide its axes
            # so the other features are still drawn.
            print(f"特征 {feature} 分析失败: {str(e)}")
            ax.set_visible(False)
            ax2.set_visible(False)
第四步,使用示例
if __name__ == "__main__":
    # Run the analysis. The author notes that all of the code above ran
    # successfully on a personal computer using open-source data.
    plot_feature_analysis(df, features, Y='Y', n_bins=8, figsize=(22, 80))
    plt.show()