Data Mining Competition Handbook

Posted on 2019-08-08

A summary of lessons learned from participating in data mining competitions.

Common Pandas Operations

Deduplication

# drop exact duplicate rows, keeping the first occurrence
BASE_EXCG = BASE_EXCG.drop_duplicates(subset=None, keep='first', inplace=False)

Merge

# left join the exchange-rate table on the currency code
IDV_TD = pd.merge(IDV_TD, BASE_EXCG, left_on='CCY_CD', right_on='CCY_LETE_CD', how='left')

Drop columns

IDV_TD.drop(['RAT_CTG', 'FXDI_SA_ACCM'], axis=1, inplace=True)

Column arithmetic

# derive an RMB-converted balance with DataFrame.eval
IDV_TD = IDV_TD.eval('CRBAL_RMB = CRBAL * RMB_MID_PRIC')

Reset index

# reset_index returns a new frame; keep the result
IDV_TD_after = IDV_TD_after.reset_index()

Rename columns

# prefix every column except CUST_NO, then write the result to CSV
IDV_TD_after_after.rename(columns=lambda x: x if x == 'CUST_NO' else 'IDV_TD_' + x).to_csv("./data/IDV_TD_out.csv", index=False)

Change column dtype

IDV_CUST_BASIC['OCP_CD'] = IDV_CUST_BASIC['OCP_CD'].astype(str)

Fill NaN

IDV_CUST_BASIC['OCP_CD'].fillna('NULL', inplace=True)
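
Note: run fillna before astype(str). Casting first turns missing values into the literal string 'nan', which fillna no longer sees. The safe order in one line:

# fill missing values first, then cast to string
IDV_CUST_BASIC['OCP_CD'] = IDV_CUST_BASIC['OCP_CD'].fillna('NULL').astype(str)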

groupby+join

# group by customer and date; combine the per-group sums, the row count,
# and the per-group max of CRBAL_RMB into a single frame
IDV_TD_group = IDV_TD.groupby(['CUST_NO', 'DATA_DAT'])
IDV_TD_after = IDV_TD_group.sum().join(IDV_TD_group.size().to_frame(name='count')).join(IDV_TD_group['CRBAL_RMB'].max(), rsuffix='max')
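
For a single column, the same shape of result reads more clearly with named aggregation (pandas >= 0.25); a sketch for CRBAL_RMB only (the version above sums every numeric column):

IDV_TD_after = IDV_TD.groupby(['CUST_NO', 'DATA_DAT']).agg(
    CRBAL_RMB=('CRBAL_RMB', 'sum'),     # per-group sum
    count=('CRBAL_RMB', 'size'),        # rows per group
    CRBAL_RMBmax=('CRBAL_RMB', 'max'),  # per-group max
)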

Add a new column

# flag every row of this table before joining it back to the customer base
IDV_TD_after_after['HAS_IDV_TD'] = 1

Common Encoding Methods

One-hot

# one-hot encoding: expand each categorical column into indicator columns,
# append them, then drop the original columns
n_columns = ['CCY_CD', 'RDEP_IND_CD', 'ACCT_STS_CD', 'DP_DAY_CD', 'RDEP_DP_DAY_CD']
dummy_df = pd.get_dummies(IDV_TD[n_columns], columns=n_columns)
IDV_TD = pd.concat([IDV_TD, dummy_df], axis=1)
IDV_TD.drop(n_columns, axis=1, inplace=True)
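
One pitfall worth noting: if the test set is missing some category values, get_dummies produces fewer columns there. A minimal sketch that aligns a hypothetical test frame (IDV_TD_test) to the training columns:

# add indicator columns missing from test as 0; drop columns unseen in train
IDV_TD_test = IDV_TD_test.reindex(columns=IDV_TD.columns, fill_value=0)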

Label

# label encoding: map each category value to an integer
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
DP_DAY_CD_lset = ["Y005", "Y003", "Y002", "Y001", "M006", "M003", "M001", "D7", "D1", "#", "NULL"]
le.fit(DP_DAY_CD_lset)
IDV_TD["DP_DAY_CD"] = le.transform(IDV_TD["DP_DAY_CD"])
IDV_TD['RDEP_DP_DAY_CD'] = le.transform(IDV_TD['RDEP_DP_DAY_CD'])
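
One caveat: transform raises a ValueError for any value outside the fitted list, so unseen codes need mapping to the 'NULL' fallback before the transform calls above. A minimal sketch:

# run before le.transform: replace values the encoder has never seen with 'NULL'
known = set(le.classes_)
IDV_TD['DP_DAY_CD'] = IDV_TD['DP_DAY_CD'].where(IDV_TD['DP_DAY_CD'].isin(known), 'NULL')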

Mean encoding

# mean encoding: replace each category with a smoothed, out-of-fold target mean
import itertools
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the names of the categorical columns to encode
        :param n_splits: the number of splits used in mean encoding
        :param target_type: str, 'regression' or 'classification'
        :param prior_weight_func:
            a function that takes the number of observations and outputs the prior weight;
            when a dict is passed, the default exponential decay function is used:
            k: the number of observations needed for the posterior to be weighted equally with the prior
            f: larger f --> smaller slope
        """
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression

        prior = X_train['pred_temp'].mean()
        # named aggregation (the old dict-of-str agg form was removed in pandas 1.0)
        col_avg_y = X_train.groupby(by=variable)['pred_temp'].agg(mean='mean', beta='size')
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  itertools.product(self.categorical_features, self.target_values)}
            for variable, target in itertools.product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            for variable, target in itertools.product(self.categorical_features, self.target_values[:-1]):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new

Usage: fit_transform learns per-fold statistics on the training set and returns the encoded training frame (the out-of-fold scheme limits target leakage); transform then applies the averaged fold statistics to new data.

# mean-encoding
MeanEncodeFeature = ['PROV_CD', 'NATN_CD', 'CULT_DGR_CD', 'SPEC_TECH_PRFN_QUA_CD', 'OCP_CD', 'RES_CD']
ME = MeanEncoder(MeanEncodeFeature)
IDV_CUST_BASIC_train = ME.fit_transform(IDV_CUST_BASIC_train, IDV_y)  # keep the returned frame
IDV_CUST_BASIC = ME.transform(IDV_CUST_BASIC)

Models

Train/validation split

from sklearn.model_selection import train_test_split
X_dtrain, X_deval, Y_dtrain, Y_deval = train_test_split(X_train, Y_train, test_size=0.3, random_state=1024, stratify=Y_train)

LightGBM

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

lgb_train = lgb.Dataset(X_dtrain, Y_dtrain)
lgb_eval = lgb.Dataset(X_deval, Y_deval)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 20,
    'max_depth': 5,
    'min_data_in_leaf': 20,
    'learning_rate': 0.02,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True,  # important for the skewed class distribution
    'random_state': 1024
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=lgb_eval,
                early_stopping_rounds=200)
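
After training, the held-out AUC can be checked directly (this is what the roc_auc_score import above is for); a minimal sketch:

# score the validation set at the early-stopped best iteration
pred_eval = gbm.predict(X_deval, num_iteration=gbm.best_iteration)
print('validation AUC:', roc_auc_score(Y_deval, pred_eval))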

Export feature importance

importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
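
A sorted view is often more useful when scanning for weak features; a small sketch using pandas (assumes pd is already imported):

# rank features by importance, highest first
imp_df = pd.DataFrame({'feature': names, 'importance': importance})
print(imp_df.sort_values('importance', ascending=False).head(20))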

Save the model

gbm.save_model('./model.txt')
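
The saved file can be loaded back as a Booster for later inference:

# reload the saved model for prediction
gbm = lgb.Booster(model_file='./model.txt')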

KFOLD

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import lightgbm as lgb
import numpy as np
import pandas as pd
import gc

# Extract feature names
feature_names = list(X_train.columns)
# Create the kfold object
k_fold = KFold(n_splits=5, shuffle=True, random_state=50)

# Empty array for feature importances
feature_importance_values = np.zeros(len(feature_names))

# Empty array for test predictions ('target' is the test-set feature frame)
target_predictions = np.zeros(target.shape[0])

# Empty array for out-of-fold validation predictions
out_of_fold = np.zeros(X_train.shape[0])

valid_f1 = []
valid_threshold = []
valid_auc = []
# Iterate through each fold
for train_indices, valid_indices in k_fold.split(X_train):
    print('starting a fold')
    # Training data for the fold
    train_features, train_labels = X_train.iloc[train_indices], Y_train.iloc[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = X_train.iloc[valid_indices], Y_train.iloc[valid_indices]
    lgb_train = lgb.Dataset(train_features, train_labels)
    lgb_eval = lgb.Dataset(valid_features, valid_labels)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 30,
        'max_depth': 6,
        'min_data_in_leaf': 20,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 5,
        'lambda_l1': 1,
        'lambda_l2': 0.001,
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True,  # important for the skewed class distribution
        'random_state': 1024
    }
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    verbose_eval=False,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=200)

    # Search for the F1-optimal probability threshold on this fold
    pred_train = gbm.predict(valid_features, num_iteration=gbm.best_iteration)
    pred_train = pd.Series(pred_train)
    maxf1 = 0
    maxi = 0
    for i in np.arange(0.2, 0.8, 0.001):
        temp = (pred_train > i).astype(int)
        score = f1_score(valid_labels, temp)
        if score > maxf1:
            maxf1 = score
            maxi = i
    valid_f1.append(maxf1)
    valid_threshold.append(maxi)
    # Record the best iteration
    best_iteration = gbm.best_iteration
    valid_auc.append(gbm.best_score['valid_0']['auc'])
    # Record the feature importances
    feature_importance_values += gbm.feature_importance() / k_fold.n_splits
    # Accumulate averaged predictions on the test set
    target_predictions += gbm.predict(target, num_iteration=best_iteration) / k_fold.n_splits
    # Record the out-of-fold predictions
    out_of_fold[valid_indices] = gbm.predict(valid_features, num_iteration=best_iteration)
    # Clean up memory
    gc.enable()
    del gbm, train_features, valid_features
    gc.collect()

print(valid_f1, np.mean(valid_f1))
print(valid_threshold, np.mean(valid_threshold))
print(valid_auc, np.mean(valid_auc))
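The averaged test probabilities can then be binarized at the mean of the per-fold thresholds; a minimal sketch (final_labels is a hypothetical name for the submission column):

# binarize averaged test probabilities at the cross-validated mean threshold
final_threshold = np.mean(valid_threshold)
final_labels = (target_predictions > final_threshold).astype(int)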

StackNet

import lightgbm as lgb

lightgbm = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=20,
    max_depth=5,
    min_data_in_leaf=20,
    learning_rate=0.02,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=5,
    lambda_l1=1,
    lambda_l2=0.001,
    min_gain_to_split=0.2,
    verbose=5,
    is_unbalance=True,
    random_state=1024)

import xgboost as xgb

xgboost_gbtree = xgb.XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=2,
    max_depth=7
)

from catboost import CatBoostClassifier

param_cb = {
    'learning_rate': 0.15,
    'bagging_temperature': 0.1,
    'l2_leaf_reg': 4,
    'depth': 7,
    'iterations': 300,
    'task_type': 'CPU',
    'loss_function': "CrossEntropy",
    'eval_metric': "AUC",
    # 'bootstrap_type': 'Bayesian',
    # 'random_seed': 42,
    # 'early_stopping_rounds': 100,
}
clf_ctb = CatBoostClassifier(**param_cb)

models = [
    ######## First level ########
    [xgboost_gbtree, clf_ctb, lightgbm],
    ######## Second level ########
    [lightgbm],
]

from pystacknet.pystacknet import StackNetClassifier

model = StackNetClassifier(
    models,
    metric="auc",
    folds=3,
    restacking=True,
    use_retraining=True,
    use_proba=True,
    random_state=42,
    verbose=1,
)

model.fit(X_dtrain, Y_dtrain)
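
StackNetClassifier follows the scikit-learn interface, so held-out scores come from predict_proba; a minimal sketch:

# column 1 holds the positive-class probability
stack_pred = model.predict_proba(X_deval)[:, 1]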