Feature Engineering

Dlaiml 2020. 11. 27. 22:56

 

 

import numpy as np
import pandas as pd

 

 
from sklearn.datasets import fetch_california_housing

# Load the California housing features and target as numpy arrays
x, y = fetch_california_housing(return_X_y=True)

 

 
# Inspect the dataset's feature names
k = fetch_california_housing()
k.feature_names
 
print(x.shape)
print(y.shape)
 
# Shuffle, then split 80/20 into train and test sets
np.random.seed(1127)
shuffle_ind = np.random.permutation(x.shape[0])
x = x[shuffle_ind, :]
y = y[shuffle_ind]
split = int(x.shape[0] * 0.8)
x_train = x[:split, :]
y_train = y[:split]
x_test = x[split:, :]
y_test = y[split:]
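
The same shuffle-and-split can also be done with scikit-learn's train_test_split; a minimal equivalent sketch (note the exact rows in each split will differ from the manual permutation above, even with the same seed):

from sklearn.model_selection import train_test_split

# 80/20 split with shuffling handled by sklearn
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1127)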

 

 

 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

def feature_engineering(x_train, x_test, method):
    if method == 'binning':
        # Bin each feature into intervals, then one-hot-encode the bin index
        x_train_binned_list = []
        x_test_binned_list = []
        for f_tr, f_te in zip(x_train.T, x_test.T):
            f_tr = np.expand_dims(f_tr, axis=-1)
            f_te = np.expand_dims(f_te, axis=-1)
            # 11 evenly spaced edges between the 10% and 90% quantiles -> 12 bins
            bins = np.linspace(np.quantile(f_tr, 0.1), np.quantile(f_tr, 0.9), 11)
            which_bin = np.digitize(f_tr, bins=bins)
            # dense matrix; handle_unknown guards against bin indices that
            # appear only in the test split
            # (use sparse=False instead of sparse_output on scikit-learn < 1.2)
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoder.fit(which_bin)
            x_train_binned_list.append(encoder.transform(which_bin))
            which_bin = np.digitize(f_te, bins=bins)
            x_test_binned_list.append(encoder.transform(which_bin))
        transformed_x_train = np.concatenate(x_train_binned_list, axis=1)
        transformed_x_test = np.concatenate(x_test_binned_list, axis=1)
        return transformed_x_train, transformed_x_test

    elif method == 'polynomialization':
        # Degree-6 polynomial expansion of the input features
        poly = PolynomialFeatures(degree=6, include_bias=False)
        poly.fit(x_train)
        return poly.transform(x_train), poly.transform(x_test)

    elif method == 'nonlinear_transformation':
        # log(1 + x) transform for features where it is defined on both splits
        x_train_t = x_train.T.copy()
        x_test_t = x_test.T.copy()
        for i, (f_tr, f_te) in enumerate(zip(x_train_t, x_test_t)):
            if (f_tr.min() > -1) and (f_te.min() > -1):
                x_train_t[i] = np.log(f_tr + 1)
                x_test_t[i] = np.log(f_te + 1)
        return x_train_t.T, x_test_t.T
        
            

 

 

 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
scaler = StandardScaler()
ridge = Ridge()

 

 
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest on the raw features
rf = RandomForestRegressor()
rf.fit(x_train, y_train)
tr_pred = rf.predict(x_train)
te_pred = rf.predict(x_test)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))

 

 
# Baseline: Ridge on the raw features
ridge.fit(x_train, y_train)
tr_pred = ridge.predict(x_train)
te_pred = ridge.predict(x_test)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))

 

 
# Standard scaling (zero mean, unit variance)
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
ridge.fit(x_train_scaled, y_train)
tr_pred = ridge.predict(x_train_scaled)
te_pred = ridge.predict(x_test_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Min-max scaling (features rescaled to [0, 1])
from sklearn.preprocessing import MinMaxScaler
m_scaler = MinMaxScaler()
m_scaler.fit(x_train)
x_train_scaled = m_scaler.transform(x_train)
x_test_scaled = m_scaler.transform(x_test)
ridge.fit(x_train_scaled, y_train)
tr_pred = ridge.predict(x_train_scaled)
te_pred = ridge.predict(x_test_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Binning: each feature one-hot encoded by bin index
x_train_binning, x_test_binning = feature_engineering(x_train, x_test, 'binning')
ridge.fit(x_train_binning, y_train)
tr_pred = ridge.predict(x_train_binning)
te_pred = ridge.predict(x_test_binning)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Binning + log-transformed target
x_train_binning, x_test_binning = feature_engineering(x_train, x_test, 'binning')
y_train_log = np.log(y_train + 1)
y_test_log = np.log(y_test + 1)
ridge.fit(x_train_binning, y_train_log)
tr_pred = ridge.predict(x_train_binning)
te_pred = ridge.predict(x_test_binning)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
 
# Polynomial features, then scaled polynomial features + log target
x_train_poly, x_test_poly = feature_engineering(x_train, x_test, 'polynomialization')
ridge.fit(x_train_poly, y_train)
tr_pred = ridge.predict(x_train_poly)
te_pred = ridge.predict(x_test_poly)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
scaler = StandardScaler()
scaler.fit(x_train_poly)
x_train_poly_scaled = scaler.transform(x_train_poly)
x_test_poly_scaled = scaler.transform(x_test_poly)
ridge.fit(x_train_poly_scaled, y_train_log)
tr_pred = ridge.predict(x_train_poly_scaled)
te_pred = ridge.predict(x_test_poly_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
 
# Nonlinear (log) transformation of the features
x_train_nonlinear, x_test_nonlinear = feature_engineering(x_train, x_test, 'nonlinear_transformation')
ridge.fit(x_train_nonlinear, y_train)
tr_pred = ridge.predict(x_train_nonlinear)
te_pred = ridge.predict(x_test_nonlinear)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
# ...plus a log-transformed target
y_train_log = np.log(y_train + 1)
y_test_log = np.log(y_test + 1)
ridge.fit(x_train_nonlinear, y_train_log)
tr_pred = ridge.predict(x_train_nonlinear)
te_pred = ridge.predict(x_test_nonlinear)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
# ...plus standard scaling
scaler.fit(x_train_nonlinear)
x_train_nonlinear_scaled = scaler.transform(x_train_nonlinear)
x_test_nonlinear_scaled = scaler.transform(x_test_nonlinear)
ridge.fit(x_train_nonlinear_scaled, y_train_log)
tr_pred = ridge.predict(x_train_nonlinear_scaled)
te_pred = ridge.predict(x_test_nonlinear_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
 
# Feature selection: univariate F-test scores
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
skb = SelectKBest(f_regression, k=6)
skb.fit(x_train, y_train)
x_train_skb = skb.transform(x_train)
x_test_skb = skb.transform(x_test)

ridge = Ridge()
ridge.fit(x_train_skb, y_train)
tr_pred = ridge.predict(x_train_skb)
te_pred = ridge.predict(x_test_skb)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Feature selection: mutual information scores
skb = SelectKBest(mutual_info_regression, k=6)
skb.fit(x_train, y_train)
x_train_skb = skb.transform(x_train)
x_test_skb = skb.transform(x_test)

ridge = Ridge()
ridge.fit(x_train_skb, y_train)
tr_pred = ridge.predict(x_train_skb)
te_pred = ridge.predict(x_test_skb)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Feature selection: recursive feature elimination
from sklearn.feature_selection import RFE

rfe = RFE(ridge, n_features_to_select=6, step=1)
rfe.fit(x_train, y_train)
x_train_rfe = rfe.transform(x_train)
x_test_rfe = rfe.transform(x_test)
ridge.fit(x_train_rfe, y_train)
tr_pred = ridge.predict(x_train_rfe)
te_pred = ridge.predict(x_test_rfe)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Feature selection: model-based (Ridge coefficient magnitudes)
from sklearn.feature_selection import SelectFromModel

# With the default threshold (the mean of |coef_|), fewer than
# max_features may actually be kept
sfm = SelectFromModel(ridge, max_features=8)
sfm.fit(x_train, y_train)
x_train_sfm = sfm.transform(x_train)
x_test_sfm = sfm.transform(x_test)

ridge.fit(x_train_sfm, y_train)
tr_pred = ridge.predict(x_train_sfm)
te_pred = ridge.predict(x_test_sfm)
print(x_train_sfm.shape)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
 
# Model-based selection on the scaled polynomial features
sfm = SelectFromModel(ridge, max_features=x_train_poly_scaled.shape[1])
sfm.fit(x_train_poly_scaled, y_train_log)
print(x_train_poly_scaled.shape)
x_train_poly_scaled_sfm = sfm.transform(x_train_poly_scaled)
x_test_poly_scaled_sfm = sfm.transform(x_test_poly_scaled)

ridge.fit(x_train_poly_scaled_sfm, y_train_log)
tr_pred = ridge.predict(x_train_poly_scaled_sfm)
te_pred = ridge.predict(x_test_poly_scaled_sfm)
print(x_train_poly_scaled_sfm.shape)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))

Description

All of the above uses scikit-learn (sklearn).

When regressing with a linear model such as Ridge, performance is poor if the relationship between the features and the target is not linear.

Binning divides each feature into intervals and one-hot-encodes the bin index, converting every feature into a categorical one; this improved performance considerably.
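
As a minimal sketch of the idea on a toy one-dimensional feature (KBinsDiscretizer is scikit-learn's built-in counterpart to the digitize-plus-OneHotEncoder routine above):

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Toy feature: 6 samples, one column
f = np.array([[0.1], [0.4], [1.2], [3.5], [7.0], [9.9]])

# 3 quantile bins, one-hot encoded -> 3 binary columns
kbd = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')
print(kbd.fit_transform(f))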

Polynomial features increase the dimensionality, adding nonlinear terms to the linear model's input so that the relationship between the expanded independent variables and the dependent variable becomes closer to linear; this improved performance.
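
To see how quickly the dimensionality grows, a quick check (degree 6 on the 8 California-housing features, matching the setting above):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=6, include_bias=False)
# 8 features expand to C(8 + 6, 6) - 1 = 3002 polynomial terms
print(poly.fit_transform(np.zeros((1, 8))).shape)  # (1, 3002)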

Log transformation is frequently used because it brings a distribution closer to normal and improves regression accuracy.

Log-transform

Log transformation is applicable when the regression line passes through the origin, when a variable is on a relative scale (e.g. percentages), or when the distribution is right-skewed.

Log-transforming the dependent variable is also common in regression and improves performance.
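
A small sketch of the effect on a right-skewed variable (using scipy.stats.skew; np.log1p is the numerically safer form of np.log(x + 1)):

import numpy as np
from scipy.stats import skew

rng = np.random.default_rng(1127)
x_skewed = rng.lognormal(mean=0.0, sigma=1.0, size=10000)  # right-skewed

print(skew(x_skewed))            # strongly positive skewness
print(skew(np.log1p(x_skewed)))  # much closer to 0 after the log transform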

 

Feature selection is driven by statistics of each feature's relationship with the target; feature importance can also be measured with entropy-based quantities such as mutual information, analogous to the feature importances of tree algorithms.
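
For instance, the univariate scores behind SelectKBest can be inspected directly (a sketch reusing x_train, y_train, and the k dataset object loaded above):

from sklearn.feature_selection import mutual_info_regression

# Mutual information (an entropy-based dependence measure) between
# each feature and the target
mi = mutual_info_regression(x_train, y_train, random_state=1127)
for name, score in sorted(zip(k.feature_names, mi), key=lambda t: -t[1]):
    print('{:12s} {:.3f}'.format(name, score))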

RFE (Recursive Feature Elimination) selects features iteratively: it repeatedly fits the model and removes the least important features until the requested number remains.
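
The elimination order can be inspected through the ranking_ attribute (a sketch reusing the rfe fitted above; rank 1 means the feature was kept):

# ranking_[i] == 1 for selected features; higher ranks were
# eliminated earlier in the recursive process
for name, rank in zip(k.feature_names, rfe.ranking_):
    print('{:12s} rank {}'.format(name, rank))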
