Data Mining Competition Handbook

Posted on 2019-08-08

A summary of lessons learned from participating in data mining competitions.

Common Pandas Operations

Deduplication

# drop exact duplicate rows, keeping the first occurrence
BASE_EXCG = BASE_EXCG.drop_duplicates(subset=None, keep='first', inplace=False)

Merge

# left join the exchange-rate table on the currency code
IDV_TD = pd.merge(IDV_TD, BASE_EXCG, left_on='CCY_CD', right_on='CCY_LETE_CD', how='left')

Drop columns

IDV_TD.drop(['RAT_CTG', 'FXDI_SA_ACCM'], axis=1, inplace=True)

Column arithmetic

# derive an RMB-converted balance with DataFrame.eval
IDV_TD = IDV_TD.eval('CRBAL_RMB = CRBAL * RMB_MID_PRIC')

Reset index

# reset_index returns a new frame; keep the result
IDV_TD_after = IDV_TD_after.reset_index()

Rename columns

# prefix every column except CUST_NO, then write the result to CSV
IDV_TD_after_after.rename(columns=lambda x: x if x == 'CUST_NO' else 'IDV_TD_' + x).to_csv("./data/IDV_TD_out.csv", index=False)

Change column dtype

IDV_CUST_BASIC['OCP_CD'] = IDV_CUST_BASIC['OCP_CD'].astype(str)

Fill NaN

IDV_CUST_BASIC['OCP_CD'].fillna('NULL', inplace=True)
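
Note: run fillna before astype(str). Casting first turns missing values into the literal string 'nan', which fillna no longer sees. The safe order in one line:

# fill missing values first, then cast to string
IDV_CUST_BASIC['OCP_CD'] = IDV_CUST_BASIC['OCP_CD'].fillna('NULL').astype(str)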

groupby+join

# group by customer and date; combine the per-group sums, the row count,
# and the per-group max of CRBAL_RMB into a single frame
IDV_TD_group = IDV_TD.groupby(['CUST_NO', 'DATA_DAT'])
IDV_TD_after = IDV_TD_group.sum().join(IDV_TD_group.size().to_frame(name='count')).join(IDV_TD_group['CRBAL_RMB'].max(), rsuffix='max')
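
For a single column, the same shape of result reads more clearly with named aggregation (pandas >= 0.25); a sketch for CRBAL_RMB only (the version above sums every numeric column):

IDV_TD_after = IDV_TD.groupby(['CUST_NO', 'DATA_DAT']).agg(
    CRBAL_RMB=('CRBAL_RMB', 'sum'),     # per-group sum
    count=('CRBAL_RMB', 'size'),        # rows per group
    CRBAL_RMBmax=('CRBAL_RMB', 'max'),  # per-group max
)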

Add a new column

# flag every row of this table before joining it back to the customer base
IDV_TD_after_after['HAS_IDV_TD'] = 1

Common Encoding Methods

One-hot

# one-hot encoding: expand each categorical column into indicator columns,
# append them, then drop the original columns
n_columns = ['CCY_CD', 'RDEP_IND_CD', 'ACCT_STS_CD', 'DP_DAY_CD', 'RDEP_DP_DAY_CD']
dummy_df = pd.get_dummies(IDV_TD[n_columns], columns=n_columns)
IDV_TD = pd.concat([IDV_TD, dummy_df], axis=1)
IDV_TD.drop(n_columns, axis=1, inplace=True)
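
One pitfall worth noting: if the test set is missing some category values, get_dummies produces fewer columns there. A minimal sketch that aligns a hypothetical test frame (IDV_TD_test) to the training columns:

# add indicator columns missing from test as 0; drop columns unseen in train
IDV_TD_test = IDV_TD_test.reindex(columns=IDV_TD.columns, fill_value=0)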

Label

# label encoding: map each category value to an integer
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
DP_DAY_CD_lset = ["Y005", "Y003", "Y002", "Y001", "M006", "M003", "M001", "D7", "D1", "#", "NULL"]
le.fit(DP_DAY_CD_lset)
IDV_TD["DP_DAY_CD"] = le.transform(IDV_TD["DP_DAY_CD"])
IDV_TD['RDEP_DP_DAY_CD'] = le.transform(IDV_TD['RDEP_DP_DAY_CD'])
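
One caveat: transform raises a ValueError for any value outside the fitted list, so unseen codes need mapping to the 'NULL' fallback before the transform calls above. A minimal sketch:

# run before le.transform: replace values the encoder has never seen with 'NULL'
known = set(le.classes_)
IDV_TD['DP_DAY_CD'] = IDV_TD['DP_DAY_CD'].where(IDV_TD['DP_DAY_CD'].isin(known), 'NULL')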

Mean encoding

# mean encoding: replace each category with a smoothed, out-of-fold target mean
import itertools
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the names of the categorical columns to encode
        :param n_splits: the number of splits used in mean encoding
        :param target_type: str, 'regression' or 'classification'
        :param prior_weight_func:
            a function that takes the number of observations and outputs the prior weight;
            when a dict is passed, the default exponential decay function is used:
            k: the number of observations needed for the posterior to be weighted equally with the prior
            f: larger f --> smaller slope
        """
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression

        prior = X_train['pred_temp'].mean()
        # named aggregation (the old dict-of-str agg form was removed in pandas 1.0)
        col_avg_y = X_train.groupby(by=variable)['pred_temp'].agg(mean='mean', beta='size')
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  itertools.product(self.categorical_features, self.target_values)}
            for variable, target in itertools.product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            for variable, target in itertools.product(self.categorical_features, self.target_values[:-1]):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new

Usage: fit_transform learns per-fold statistics on the training set and returns the encoded training frame (the out-of-fold scheme limits target leakage); transform then applies the averaged fold statistics to new data.

# mean-encoding
MeanEncodeFeature = ['PROV_CD', 'NATN_CD', 'CULT_DGR_CD', 'SPEC_TECH_PRFN_QUA_CD', 'OCP_CD', 'RES_CD']
ME = MeanEncoder(MeanEncodeFeature)
IDV_CUST_BASIC_train = ME.fit_transform(IDV_CUST_BASIC_train, IDV_y)  # keep the returned frame
IDV_CUST_BASIC = ME.transform(IDV_CUST_BASIC)

Models

Train/validation split

from sklearn.model_selection import train_test_split
X_dtrain, X_deval, Y_dtrain, Y_deval = train_test_split(X_train, Y_train, test_size=0.3, random_state=1024, stratify=Y_train)

LightGBM

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

lgb_train = lgb.Dataset(X_dtrain, Y_dtrain)
lgb_eval = lgb.Dataset(X_deval, Y_deval)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 20,
    'max_depth': 5,
    'min_data_in_leaf': 20,
    'learning_rate': 0.02,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,  # important for the skewed class distribution
    'random_state': 1024
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=lgb_eval,
                early_stopping_rounds=200)
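
After training, the held-out AUC can be checked directly (this is what the roc_auc_score import above is for); a minimal sketch:

# score the validation set at the early-stopped best iteration
pred_eval = gbm.predict(X_deval, num_iteration=gbm.best_iteration)
print('validation AUC:', roc_auc_score(Y_deval, pred_eval))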

Export feature importance

importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
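
A sorted view is often more useful when scanning for weak features; a small sketch using pandas (assumes pd is already imported):

# rank features by importance, highest first
imp_df = pd.DataFrame({'feature': names, 'importance': importance})
print(imp_df.sort_values('importance', ascending=False).head(20))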

Save the model

gbm.save_model('./model.txt')
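
The saved file can be loaded back as a Booster for later inference:

# reload the saved model for prediction
gbm = lgb.Booster(model_file='./model.txt')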

KFOLD

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import lightgbm as lgb
import numpy as np
import pandas as pd
import gc

# Extract feature names
feature_names = list(X_train.columns)
# Create the kfold object
k_fold = KFold(n_splits=5, shuffle=True, random_state=50)

# Empty array for feature importances
feature_importance_values = np.zeros(len(feature_names))

# Empty array for test predictions ('target' is the test-set feature frame)
target_predictions = np.zeros(target.shape[0])

# Empty array for out-of-fold validation predictions
out_of_fold = np.zeros(X_train.shape[0])

valid_f1 = []
valid_threshold = []
valid_auc = []
# Iterate through each fold
for train_indices, valid_indices in k_fold.split(X_train):
    print('starting a fold')
    # Training data for the fold
    train_features, train_labels = X_train.iloc[train_indices], Y_train.iloc[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = X_train.iloc[valid_indices], Y_train.iloc[valid_indices]
    lgb_train = lgb.Dataset(train_features, train_labels)
    lgb_eval = lgb.Dataset(valid_features, valid_labels)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 30,
        'max_depth': 6,
        'min_data_in_leaf': 20,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 5,
        'lambda_l1': 1,
        'lambda_l2': 0.001,
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True,  # important for the skewed class distribution
        'random_state': 1024
    }
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    verbose_eval=False,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=200)

    # Search for the F1-optimal probability threshold on this fold
    pred_train = gbm.predict(valid_features, num_iteration=gbm.best_iteration)
    pred_train = pd.Series(pred_train)
    maxf1 = 0
    maxi = 0
    for i in np.arange(0.2, 0.8, 0.001):
        temp = (pred_train > i).astype(int)
        score = f1_score(valid_labels, temp)
        if score > maxf1:
            maxf1 = score
            maxi = i
    valid_f1.append(maxf1)
    valid_threshold.append(maxi)
    # Record the best iteration
    best_iteration = gbm.best_iteration
    valid_auc.append(gbm.best_score['valid_0']['auc'])
    # Record the feature importances
    feature_importance_values += gbm.feature_importance() / k_fold.n_splits
    # Accumulate averaged predictions on the test set
    target_predictions += gbm.predict(target, num_iteration=best_iteration) / k_fold.n_splits
    # Record the out-of-fold predictions
    out_of_fold[valid_indices] = gbm.predict(valid_features, num_iteration=best_iteration)
    # Clean up memory
    gc.enable()
    del gbm, train_features, valid_features
    gc.collect()

print(valid_f1, np.mean(valid_f1))
print(valid_threshold, np.mean(valid_threshold))
print(valid_auc, np.mean(valid_auc))
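The averaged test probabilities can then be binarized at the mean of the per-fold thresholds; a minimal sketch (final_labels is a hypothetical name for the submission column):

# binarize averaged test probabilities at the cross-validated mean threshold
final_threshold = np.mean(valid_threshold)
final_labels = (target_predictions > final_threshold).astype(int)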

StackNet

import lightgbm as lgb

lightgbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=20,
    max_depth=5,
    min_data_in_leaf=20,
    learning_rate=0.02,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=5,
    lambda_l1=1,
    lambda_l2=0.001,
    min_gain_to_split=0.2,
    verbose=5,
    is_unbalance=True,
    random_state=1024)

import xgboost as xgb

xgboost_gbtree = xgb.XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=2,
    max_depth=7
)

from catboost import CatBoostClassifier

param_cb = {
    'learning_rate': 0.15,
    'bagging_temperature': 0.1,
    'l2_leaf_reg': 4,
    'depth': 7,
    'iterations': 300,
    'task_type': 'CPU',
    'loss_function': "CrossEntropy",
    'eval_metric': "AUC",
    # 'bootstrap_type': 'Bayesian',
    # 'random_seed': 42,
    # 'early_stopping_rounds': 100,
}
clf_ctb = CatBoostClassifier(**param_cb)

models = [
    ######## First level ########
    [xgboost_gbtree, clf_ctb, lightgbm],
    ######## Second level ########
    [lightgbm],
]

from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models,
    metric="auc",
    folds=3,
    restacking=True,
    use_retraining=True,
    use_proba=True,
    random_state=42,
    verbose=1,
)

model.fit(X_dtrain, Y_dtrain)
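
StackNetClassifier follows the scikit-learn interface, so held-out scores come from predict_proba; a minimal sketch:

# column 1 holds the positive-class probability
stack_pred = model.predict_proba(X_deval)[:, 1]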