Skip to content

Commit 49b8881

Browse files
committed
V1.3.0
1 parent b69e5cf commit 49b8881

24 files changed

Lines changed: 1115 additions & 909 deletions

README.md

Lines changed: 244 additions & 26 deletions
Large diffs are not rendered by default.

__init__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
generate_rule_report
2525
)
2626

27-
__version__ = '1.2.4'
27+
__version__ = '1.3.0'
2828
__all__ = [
2929
# Utils
3030
'load_example_data',
@@ -41,7 +41,7 @@
4141
'SingleFeatureRuleMiner',
4242
'MultiFeatureRuleMiner',
4343
'DecisionTreeRuleExtractor',
44-
'XGBoostRuleMiner',
44+
'XGBoostRuleMiner', # Deprecated: 请使用 TreeRuleExtractor(algorithm='gbdt')
4545
'TreeRuleExtractor',
4646

4747
# Visualization
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.
-1.6 KB
Binary file not shown.
-343 Bytes
Binary file not shown.
-163 Bytes
Binary file not shown.

analysis/variable_analysis.py

Lines changed: 46 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,14 @@ def __init__(self, df: pd.DataFrame, exclude_cols: List[str] = None, target_col:
2525
amount_col: 金额字段名,默认为None
2626
ovd_bal_col: 逾期金额字段名,默认为None
2727
"""
28-
self.df = df.copy().reset_index(drop=True)
28+
if df is None or df.empty:
29+
raise ValueError("输入的数据集不能为空")
30+
31+
self.df = df.copy(deep=False).reset_index(drop=True)
2932
self.target_col = target_col
33+
34+
if self.target_col not in self.df.columns:
35+
raise ValueError(f"目标字段 '{self.target_col}' 不在数据集中")
3036
self.amount_col = amount_col
3137
self.ovd_bal_col = ovd_bal_col
3238

@@ -346,35 +352,7 @@ def calculate_loss_lift(self, feature: str, amount_col: str = None, ovd_bal_col:
346352
返回:
347353
float,损失率提升度
348354
"""
349-
if amount_col is None or amount_col not in self.df.columns:
350-
return 0.0
351-
352-
if ovd_bal_col is None or ovd_bal_col not in self.df.columns:
353-
return 0.0
354-
355-
# 仅删除amount和ovd_bal的缺失值
356-
df = self.df[[feature, self.target_col, amount_col, ovd_bal_col]].dropna(subset=[amount_col, ovd_bal_col])
357-
358-
if len(df) == 0:
359-
return 0.0
360-
361-
total_amount = df[amount_col].sum()
362-
if total_amount == 0:
363-
return 0.0
364-
365-
# 计算当前特征的损失率
366-
total_ovd_bal_bad = df[df[self.target_col] == 1][ovd_bal_col].sum()
367-
feature_loss_rate = total_ovd_bal_bad / total_amount
368-
369-
# 计算整体损失率(只统计坏样本的ovd_bal)
370-
df_bad = df[df[self.target_col] == 1]
371-
total_ovd_bal_all = df_bad[ovd_bal_col].sum()
372-
overall_loss_rate = total_ovd_bal_all / total_amount
373-
374-
# 计算损失率提升度
375-
loss_lift = feature_loss_rate / overall_loss_rate if overall_loss_rate > 0 else 0.0
376-
377-
return loss_lift
355+
return 0.0
378356

379357
def analyze_all_variables(self,psi_dt: str = None, date_col: str = None) -> pd.DataFrame:
380358
"""
@@ -386,38 +364,45 @@ def analyze_all_variables(self,psi_dt: str = None, date_col: str = None) -> pd.D
386364
results = []
387365

388366
for feature in self.features:
389-
# 计算各指标
390-
iv = self.calculate_iv(feature)
391-
ks = self.calculate_ks(feature)
392-
auc = self.calculate_auc(feature)
393-
missing_rate = self.calculate_missing_rate(feature)
394-
single_value_rate = self.calculate_single_value_rate(feature)
395-
mean_diff = self.calculate_mean_diff(feature)
396-
corr_with_target = self.calculate_corr_with_target(feature)
397-
psi = self.calculate_psi(feature, psi_dt=psi_dt, date_col=date_col)
398-
399-
# 计算统计信息
400-
feature_data = self.df[feature]
401-
min_value = feature_data.min()
402-
max_value = feature_data.max()
403-
median_value = feature_data.median()
404-
405-
# 添加到结果列表
406-
results.append({
407-
'variable': feature,
408-
'iv': iv,
409-
'ks': ks,
410-
'auc': auc,
411-
'missing_rate': missing_rate,
412-
'single_value_rate': single_value_rate,
413-
'min_value': min_value,
414-
'max_value': max_value,
415-
'median_value': median_value,
416-
'mean_diff': mean_diff,
417-
'corr_with_target': corr_with_target,
418-
'psi': psi
419-
})
367+
try:
368+
# 计算各指标
369+
iv = self.calculate_iv(feature)
370+
ks = self.calculate_ks(feature)
371+
auc = self.calculate_auc(feature)
372+
missing_rate = self.calculate_missing_rate(feature)
373+
single_value_rate = self.calculate_single_value_rate(feature)
374+
mean_diff = self.calculate_mean_diff(feature)
375+
corr_with_target = self.calculate_corr_with_target(feature)
376+
psi = self.calculate_psi(feature, psi_dt=psi_dt, date_col=date_col)
377+
378+
# 计算统计信息
379+
feature_data = self.df[feature]
380+
min_value = feature_data.min()
381+
max_value = feature_data.max()
382+
median_value = feature_data.median()
383+
384+
# 添加到结果列表
385+
results.append({
386+
'variable': feature,
387+
'iv': iv,
388+
'ks': ks,
389+
'auc': auc,
390+
'missing_rate': missing_rate,
391+
'single_value_rate': single_value_rate,
392+
'min_value': min_value,
393+
'max_value': max_value,
394+
'median_value': median_value,
395+
'mean_diff': mean_diff,
396+
'corr_with_target': corr_with_target,
397+
'psi': psi
398+
})
399+
except Exception as e:
400+
print(f"分析变量 {feature} 时发生错误: {str(e)}")
401+
continue
420402

403+
if not results:
404+
return pd.DataFrame()
405+
421406
return pd.DataFrame(results).sort_values(by='iv', ascending=False)
422407

423408
def analyze_single_variable(self, variable: str, n_bins: int = 10) -> pd.DataFrame:
0 Bytes
Binary file not shown.
4 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)