Add sklearn API wrapper by slundberg · Pull Request #9 · boredbird/woe · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add sklearn API wrapper #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@
'numpy>=1.11.3',
'scipy>=0.18.1',
'matplotlib>=2.0.0',
'tqdm'
]
)
1 change: 1 addition & 0 deletions woe/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .sklearn import WOEEncoder
16 changes: 9 additions & 7 deletions woe/feature_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def woe_trans(dvar,civ):

return var

def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01):
def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01, silent=False):
'''
process woe transformation of discrete variables
:param df:
Expand All @@ -391,10 +391,11 @@ def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01):
:param min_sample:
:return:
'''
s = 'process discrete variable:'+str(var)
print(s.center(60, '-'))
if not silent:
s = 'process discrete variable:'+str(var)
print(s.center(60, '-'))

df = df[[var,'target']]
df = df[[var,'target']].copy()
div = DisInfoValue()
div.var_name = var
rdict = {}
Expand Down Expand Up @@ -448,7 +449,7 @@ def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01):
return civ


def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01):
def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01, silent=False):
'''
process woe transformation of discrete variables
:param df:
Expand All @@ -458,8 +459,9 @@ def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01):
:param min_sample:
:return:
'''
s = 'process continuous variable:'+str(var)
print(s.center(60, '-'))
if not silent:
s = 'process continuous variable:'+str(var)
print(s.center(60, '-'))
df = df[[var,'target']]
iv_tree = binning_data_split(df, var,global_bt,global_gt,min_sample,alpha)

Expand Down
72 changes: 72 additions & 0 deletions woe/sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# -*- coding:utf-8 -*-
__author__ = 'slundberg'

# this creates an sklearn API style wrapper
from tqdm import tqdm
from .feature_process import proc_woe_continuous, proc_woe_discrete, woe_trans

class WOEEncoder(object):
    """sklearn-style wrapper around the WoE helpers in ``feature_process``.

    ``fit`` learns one WoE transformation per column of a DataFrame and
    ``transform`` applies the learned transformations to another DataFrame.

    Attributes set by ``fit``:
        continuous_features: column names treated as continuous.
        dataset_len: number of rows in the training frame.
        min_sample: 5% of ``dataset_len`` (truncated to int), passed to the
            binning helpers as their ``min_sample`` argument.
        global_bt: ``sum(y)`` (the number of positives for 0/1 labels).
        global_gt: ``dataset_len - global_bt``.
        rst: list with one result object per column, in column order, as
            returned by ``proc_woe_continuous`` / ``proc_woe_discrete``.
    """

    def __init__(self):
        pass

    def _fill_nulls(self, frame, column):
        """Replace nulls in ``frame[column]`` in place.

        Continuous columns get ``-1`` and every other column gets the
        string ``'missing'``, so training and inference see the same
        placeholder values.
        """
        if column in self.continuous_features:
            placeholder = -1
        else:
            placeholder = 'missing'
        frame.loc[frame[column].isnull(), column] = placeholder

    def fit(self, X, y, continuous_features=None):
        """ Learn a WoE transform from a dataset and binary label.

        :param X: pandas DataFrame of features; must not contain a column
            named ``'target'``.
        :param y: binary labels, one per row of ``X``.
        :param continuous_features: optional list of column names to treat
            as continuous. When ``None``, every column whose dtype name
            starts with ``"float"`` is treated as continuous.
        :return: ``self``, so calls can be chained as in sklearn.
        :raises AssertionError: if ``X`` has a column named ``'target'``.
        """

        # Raised explicitly (not via ``assert``) so the check is still
        # enforced when Python runs with -O; otherwise the label would
        # silently overwrite the feature column below.
        if "target" in X.columns:
            raise AssertionError(
                "'target' is a reserved name, and can't be a feature name!"
            )

        # auto-detect continuous features; pairing names with dtypes
        # positionally avoids label-based lookups on ``X.dtypes``, which pick
        # the wrong entry when column labels are themselves integers.
        if continuous_features is None:
            continuous_features = [
                name for name, dtype in zip(X.columns, X.dtypes)
                if str(dtype).startswith("float")
            ]
        self.continuous_features = continuous_features

        # set the target in the joint dataset (on a copy, X is left untouched)
        joint_data = X.copy()
        joint_data["target"] = y

        # compute some parameters
        self.dataset_len = X.shape[0]
        self.min_sample = int(self.dataset_len * 0.05)
        self.global_bt = sum(y)
        self.global_gt = self.dataset_len - self.global_bt

        # build transformations, one per feature column
        self.rst = []
        for c in tqdm(X.columns):
            self._fill_nulls(joint_data, c)
            if c in self.continuous_features:
                process = proc_woe_continuous
            else:
                process = proc_woe_discrete
            self.rst.append(process(
                joint_data, c, self.global_bt, self.global_gt,
                self.min_sample, alpha=0.05, silent=True
            ))

        return self

    def transform(self, X):
        """ Apply the learned WoE transform to a new dataset.

        :param X: pandas DataFrame containing the columns seen in ``fit``.
        :return: a copy of ``X`` in which nulls are replaced by the same
            placeholders used in ``fit`` and every fitted column is replaced
            by the output of ``woe_trans``. ``X`` itself is not modified.
        """

        X_new = X.copy()
        for c in X.columns:
            self._fill_nulls(X_new, c)

        # apply each learned transformation to its own column
        for r in self.rst:
            X_new[r.var_name] = woe_trans(X_new[r.var_name], r)

        return X_new


0