@@ -25,8 +25,14 @@ def __init__(self, df: pd.DataFrame, exclude_cols: List[str] = None, target_col:
2525 amount_col: 金额字段名,默认为None
2626 ovd_bal_col: 逾期金额字段名,默认为None
2727 """
28- self .df = df .copy ().reset_index (drop = True )
28+ if df is None or df .empty :
29+ raise ValueError ("输入的数据集不能为空" )
30+
31+ self .df = df .copy (deep = False ).reset_index (drop = True )
2932 self .target_col = target_col
33+
34+ if self .target_col not in self .df .columns :
35+ raise ValueError (f"目标字段 '{ self .target_col } ' 不在数据集中" )
3036 self .amount_col = amount_col
3137 self .ovd_bal_col = ovd_bal_col
3238
@@ -346,35 +352,7 @@ def calculate_loss_lift(self, feature: str, amount_col: str = None, ovd_bal_col:
346352 返回:
347353 float,损失率提升度
348354 """
349- if amount_col is None or amount_col not in self .df .columns :
350- return 0.0
351-
352- if ovd_bal_col is None or ovd_bal_col not in self .df .columns :
353- return 0.0
354-
355- # 仅删除amount和ovd_bal的缺失值
356- df = self .df [[feature , self .target_col , amount_col , ovd_bal_col ]].dropna (subset = [amount_col , ovd_bal_col ])
357-
358- if len (df ) == 0 :
359- return 0.0
360-
361- total_amount = df [amount_col ].sum ()
362- if total_amount == 0 :
363- return 0.0
364-
365- # 计算当前特征的损失率
366- total_ovd_bal_bad = df [df [self .target_col ] == 1 ][ovd_bal_col ].sum ()
367- feature_loss_rate = total_ovd_bal_bad / total_amount
368-
369- # 计算整体损失率(只统计坏样本的ovd_bal)
370- df_bad = df [df [self .target_col ] == 1 ]
371- total_ovd_bal_all = df_bad [ovd_bal_col ].sum ()
372- overall_loss_rate = total_ovd_bal_all / total_amount
373-
374- # 计算损失率提升度
375- loss_lift = feature_loss_rate / overall_loss_rate if overall_loss_rate > 0 else 0.0
376-
377- return loss_lift
355+ return 0.0
378356
379357 def analyze_all_variables (self ,psi_dt : str = None , date_col : str = None ) -> pd .DataFrame :
380358 """
@@ -386,38 +364,45 @@ def analyze_all_variables(self,psi_dt: str = None, date_col: str = None) -> pd.D
386364 results = []
387365
388366 for feature in self .features :
389- # 计算各指标
390- iv = self .calculate_iv (feature )
391- ks = self .calculate_ks (feature )
392- auc = self .calculate_auc (feature )
393- missing_rate = self .calculate_missing_rate (feature )
394- single_value_rate = self .calculate_single_value_rate (feature )
395- mean_diff = self .calculate_mean_diff (feature )
396- corr_with_target = self .calculate_corr_with_target (feature )
397- psi = self .calculate_psi (feature , psi_dt = psi_dt , date_col = date_col )
398-
399- # 计算统计信息
400- feature_data = self .df [feature ]
401- min_value = feature_data .min ()
402- max_value = feature_data .max ()
403- median_value = feature_data .median ()
404-
405- # 添加到结果列表
406- results .append ({
407- 'variable' : feature ,
408- 'iv' : iv ,
409- 'ks' : ks ,
410- 'auc' : auc ,
411- 'missing_rate' : missing_rate ,
412- 'single_value_rate' : single_value_rate ,
413- 'min_value' : min_value ,
414- 'max_value' : max_value ,
415- 'median_value' : median_value ,
416- 'mean_diff' : mean_diff ,
417- 'corr_with_target' : corr_with_target ,
418- 'psi' : psi
419- })
367+ try :
368+ # 计算各指标
369+ iv = self .calculate_iv (feature )
370+ ks = self .calculate_ks (feature )
371+ auc = self .calculate_auc (feature )
372+ missing_rate = self .calculate_missing_rate (feature )
373+ single_value_rate = self .calculate_single_value_rate (feature )
374+ mean_diff = self .calculate_mean_diff (feature )
375+ corr_with_target = self .calculate_corr_with_target (feature )
376+ psi = self .calculate_psi (feature , psi_dt = psi_dt , date_col = date_col )
377+
378+ # 计算统计信息
379+ feature_data = self .df [feature ]
380+ min_value = feature_data .min ()
381+ max_value = feature_data .max ()
382+ median_value = feature_data .median ()
383+
384+ # 添加到结果列表
385+ results .append ({
386+ 'variable' : feature ,
387+ 'iv' : iv ,
388+ 'ks' : ks ,
389+ 'auc' : auc ,
390+ 'missing_rate' : missing_rate ,
391+ 'single_value_rate' : single_value_rate ,
392+ 'min_value' : min_value ,
393+ 'max_value' : max_value ,
394+ 'median_value' : median_value ,
395+ 'mean_diff' : mean_diff ,
396+ 'corr_with_target' : corr_with_target ,
397+ 'psi' : psi
398+ })
399+ except Exception as e :
400+ print (f"分析变量 { feature } 时发生错误: { str (e )} " )
401+ continue
420402
403+ if not results :
404+ return pd .DataFrame ()
405+
421406 return pd .DataFrame (results ).sort_values (by = 'iv' , ascending = False )
422407
423408 def analyze_single_variable (self , variable : str , n_bins : int = 10 ) -> pd .DataFrame :
0 commit comments