Feature Engineering
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
x, y = fetch_california_housing(return_X_y=True)
k = fetch_california_housing()
print(k.feature_names)  # the 8 input feature names
print(x.shape)
print(y.shape)
np.random.seed(1127)
shuffle_ind = np.random.permutation(x.shape[0])  # shuffle before splitting
x = x[shuffle_ind, :]
y = y[shuffle_ind]
# 80/20 train/test split
n_train = int(x.shape[0] * 0.8)
x_train = x[:n_train, :]
y_train = y[:n_train]
x_test = x[n_train:, :]
y_test = y[n_train:]
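As an aside, the manual shuffle-and-slice above is equivalent in spirit to scikit-learn's built-in splitter; a minimal sketch (not used below, and the random_state value here is an arbitrary choice):
from sklearn.model_selection import train_test_split
# 80/20 shuffled split in one call
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1127)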
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
def feature_engineering(x_train, x_test, method):
    if method == 'binning':
        x_train_binned_list = []
        x_test_binned_list = []
        for f_tr, f_te in zip(x_train.T, x_test.T):
            f_tr = np.expand_dims(f_tr, axis=-1)
            f_te = np.expand_dims(f_te, axis=-1)
            # 10 bins, edges spread evenly between the 10th and 90th percentiles
            bins = np.linspace(np.quantile(f_tr, 0.1), np.quantile(f_tr, 0.9), 11)
            which_bin = np.digitize(f_tr, bins=bins)
            # dense one-hot matrix; the keyword was `sparse=False` before scikit-learn 1.2,
            # and handle_unknown guards against bin indices that appear only in the test set
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoder.fit(which_bin)
            x_train_binned = np.array(encoder.transform(which_bin))
            x_train_binned_list.append(x_train_binned)
            which_bin = np.digitize(f_te, bins=bins)
            x_test_binned = np.array(encoder.transform(which_bin))
            x_test_binned_list.append(x_test_binned)
        transformed_x_train = np.concatenate(x_train_binned_list, axis=1)
        transformed_x_test = np.concatenate(x_test_binned_list, axis=1)
        return transformed_x_train, transformed_x_test
    elif method == 'polynomialization':
        # degree-6 expansion: 8 features -> 3002 polynomial features
        poly = PolynomialFeatures(degree=6, include_bias=False)
        poly.fit(x_train)
        poly_x_train = poly.transform(x_train)
        poly_x_test = poly.transform(x_test)
        return poly_x_train, poly_x_test
    elif method == 'nonlinear_transformation':
        x_train_t = x_train.T.copy()
        x_test_t = x_test.T.copy()
        for i, (f_tr, f_te) in enumerate(zip(x_train_t, x_test_t)):
            # log(x + 1) is only defined for x > -1
            if (f_tr.min() > -1) and (f_te.min() > -1):
                x_train_t[i] = np.log(f_tr + 1)
                x_test_t[i] = np.log(f_te + 1)
        return x_train_t.T, x_test_t.T
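For reference, the manual binning branch above is close to what sklearn's KBinsDiscretizer offers out of the box; a minimal sketch (not used below; note it does not clip the edges at the 10th/90th percentiles the way the manual version does):
from sklearn.preprocessing import KBinsDiscretizer
# 10 equal-width bins per feature, one-hot encoded as a dense array
kbd = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='uniform')
kbd.fit(x_train)
x_train_kbd = kbd.transform(x_train)
x_test_kbd = kbd.transform(x_test)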
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
scaler = StandardScaler()
ridge = Ridge()
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()  # nonlinear baseline for comparison
rf.fit(x_train,y_train)
tr_pred=rf.predict(x_train)
te_pred=rf.predict(x_test)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
ridge.fit(x_train,y_train)
tr_pred=ridge.predict(x_train)
te_pred=ridge.predict(x_test)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
# scaled (StandardScaler)
scaler.fit(x_train)
x_train_scaled=scaler.transform(x_train)
x_test_scaled=scaler.transform(x_test)
ridge.fit(x_train_scaled,y_train)
tr_pred=ridge.predict(x_train_scaled)
te_pred=ridge.predict(x_test_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
# scaled (MinMaxScaler)
from sklearn.preprocessing import MinMaxScaler
m_scaler = MinMaxScaler()
m_scaler.fit(x_train)
x_train_scaled=m_scaler.transform(x_train)
x_test_scaled=m_scaler.transform(x_test)
ridge.fit(x_train_scaled,y_train)
tr_pred=ridge.predict(x_train_scaled)
te_pred=ridge.predict(x_test_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
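The scale-then-fit pattern repeated above can also be wrapped in a Pipeline, so the scaler is fit on training data only and applied automatically at predict time; a minimal sketch:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), Ridge())
pipe.fit(x_train, y_train)
print('pipeline test r2_score: {:.4f}'.format(r2_score(y_test, pipe.predict(x_test))))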
#binning
x_train_binning,x_test_binning = feature_engineering(x_train,x_test,'binning')
ridge.fit(x_train_binning,y_train)
tr_pred=ridge.predict(x_train_binning)
te_pred=ridge.predict(x_test_binning)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
#binning + log target
x_train_binning,x_test_binning = feature_engineering(x_train,x_test,'binning')
y_train_log = np.log(y_train+1)
y_test_log = np.log(y_test+1)
ridge.fit(x_train_binning,y_train_log)
tr_pred=ridge.predict(x_train_binning)
te_pred=ridge.predict(x_test_binning)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
#polynomial + (scale + log target)
x_train_poly,x_test_poly = feature_engineering(x_train,x_test,'polynomialization')
ridge.fit(x_train_poly,y_train)
tr_pred=ridge.predict(x_train_poly)
te_pred=ridge.predict(x_test_poly)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
scaler = StandardScaler()
scaler.fit(x_train_poly)
x_train_poly_scaled=scaler.transform(x_train_poly)
x_test_poly_scaled=scaler.transform(x_test_poly)
ridge.fit(x_train_poly_scaled,y_train_log)
tr_pred=ridge.predict(x_train_poly_scaled)
te_pred=ridge.predict(x_test_poly_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
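One caveat worth making explicit: the scores above are computed on the log scale, so they are not directly comparable to the earlier original-scale scores. To compare, invert the transform before scoring; a minimal sketch:
# expm1 inverts log1p (i.e., log(y + 1)), so this scores on the original target scale
te_pred_orig = np.expm1(te_pred)
print('test r2_score (original scale): {:.4f}'.format(r2_score(y_test, te_pred_orig)))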
#nonlinear transformation
x_train_nonlinear,x_test_nonlinear = feature_engineering(x_train,x_test,'nonlinear_transformation')
ridge.fit(x_train_nonlinear,y_train)
tr_pred=ridge.predict(x_train_nonlinear)
te_pred=ridge.predict(x_test_nonlinear)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
y_train_log = np.log(y_train+1)
y_test_log = np.log(y_test+1)
ridge.fit(x_train_nonlinear,y_train_log)
tr_pred=ridge.predict(x_train_nonlinear)
te_pred=ridge.predict(x_test_nonlinear)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}\n'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
scaler.fit(x_train_nonlinear)
x_train_nonlinear_scaled = scaler.transform(x_train_nonlinear)
x_test_nonlinear_scaled = scaler.transform(x_test_nonlinear)
ridge.fit(x_train_nonlinear_scaled,y_train_log)
tr_pred=ridge.predict(x_train_nonlinear_scaled)
te_pred=ridge.predict(x_test_nonlinear_scaled)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
# feature selection (univariate: f_regression)
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
skb = SelectKBest(f_regression,k=6)
skb.fit(x_train,y_train)
x_train_skb = skb.transform(x_train)
x_test_skb = skb.transform(x_test)
ridge = Ridge()
ridge.fit(x_train_skb,y_train)
tr_pred=ridge.predict(x_train_skb)
te_pred=ridge.predict(x_test_skb)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
# feature selection (univariate: mutual information)
skb = SelectKBest(mutual_info_regression,k=6)
skb.fit(x_train,y_train)
x_train_skb = skb.transform(x_train)
x_test_skb = skb.transform(x_test)
ridge = Ridge()
ridge.fit(x_train_skb,y_train)
tr_pred=ridge.predict(x_train_skb)
te_pred=ridge.predict(x_test_skb)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
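To see which of the eight original features a selector kept, the boolean mask from get_support() can be matched against the dataset's feature names; a short sketch:
mask = skb.get_support()  # boolean mask over the 8 input features
print(np.array(k.feature_names)[mask])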
from sklearn.feature_selection import RFE
rfe = RFE(ridge,n_features_to_select=6,step=1)
rfe.fit(x_train,y_train)
x_train_rfe = rfe.transform(x_train)
x_test_rfe = rfe.transform(x_test)
ridge.fit(x_train_rfe,y_train)
tr_pred=ridge.predict(x_train_rfe)
te_pred=ridge.predict(x_test_rfe)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
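If the number of retained features should be chosen by cross-validation rather than fixed at 6, RFECV automates that search; a minimal sketch (cv=5 is an arbitrary choice here):
from sklearn.feature_selection import RFECV
rfecv = RFECV(Ridge(), step=1, cv=5, scoring='r2')
rfecv.fit(x_train, y_train)
print('optimal number of features:', rfecv.n_features_)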
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(ridge,max_features=8)
sfm.fit(x_train,y_train)
x_train_sfm = sfm.transform(x_train)
x_test_sfm = sfm.transform(x_test)
ridge.fit(x_train_sfm,y_train)
tr_pred=ridge.predict(x_train_sfm)
te_pred=ridge.predict(x_test_sfm)
print(x_train_sfm.shape)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train, tr_pred), r2_score(y_test, te_pred)))
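Note that SelectFromModel keeps features whose importance (here, the absolute ridge coefficient) exceeds a threshold (the mean of |coef_| by default), so max_features=8 alone does not force all eight features through. The fitted threshold and mask make this visible:
print(sfm.threshold_)     # default threshold: mean of |coef_|
print(sfm.get_support())  # which of the 8 features survived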
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(ridge,max_features=x_train_poly_scaled.shape[1])
sfm.fit(x_train_poly_scaled,y_train_log)
print(x_train_poly_scaled.shape)
x_train_poly_scaled_sfm = sfm.transform(x_train_poly_scaled)
x_test_poly_scaled_sfm = sfm.transform(x_test_poly_scaled)
ridge.fit(x_train_poly_scaled_sfm,y_train_log)
tr_pred=ridge.predict(x_train_poly_scaled_sfm)
te_pred=ridge.predict(x_test_poly_scaled_sfm)
print(x_train_poly_scaled_sfm.shape)
print('train set r2_score: {:.4f}\ntest set r2_score: {:.4f}'.format(r2_score(y_train_log, tr_pred), r2_score(y_test_log, te_pred)))
Description
This post uses scikit-learn.
When fitting linear regression with a linear model such as Ridge, performance is poor if the relationship between the features and the target is not linear.
Binning splits each feature into intervals and one-hot-encodes them, turning every feature into a categorical one; this improved performance considerably.
Polynomial features raise the input dimensionality, injecting nonlinear terms into the linear model's inputs so that the independent and dependent variables become closer to linearly related, which also improved performance.
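To make the dimensionality blow-up concrete: with 8 input features, degree-6 PolynomialFeatures (without bias) generates C(8+6, 6) - 1 = 3002 columns; a quick check:
from math import comb
print(comb(8 + 6, 6) - 1)  # 3002 polynomial features from 8 inputs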
Log transformation is widely used because it pulls a distribution closer to normal and improves regression accuracy.
It is a reasonable choice when the regression line passes through the origin, when a variable is on a relative scale (e.g., percentages), or when the distribution is right-skewed.
Log-transforming the dependent variable is also common in regression and tends to improve performance.
Feature selection is driven by each feature's distribution; a feature's importance can be measured via entropy, as with the feature importance of tree algorithms.
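For reference, the entropy-based score computed by mutual_info_regression is mutual information, I(X; Y) = H(Y) - H(Y | X): the reduction in uncertainty about the target Y once a feature X is known. f_regression, by contrast, only captures the linear relationship between each feature and the target.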
RFE selects features by repeatedly refitting the model and pruning the least important ones (judged by coefficient magnitude or feature importance) until the desired number remains.