Logistic Regression, Decision Tree, Random Forest, Gradient Boosting Tree, SVM, k-NN, Neural Network


Load the data

import pandas as pd
from sklearn import metrics
train = pd.read_csv('data/train.csv')
train.head()
age workclass education marital occupation relationship race sex capital_gain capital_loss hours_per_week income
0 39 State-gov 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 under50k
1 50 Self-emp-not-inc 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 under50k
2 38 Private 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 under50k
3 53 Private 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 under50k
4 28 Private 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 under50k
train.shape
(24999, 12)
train.columns
Index(['age', 'workclass', 'education', 'marital', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

Dependent variable

y = train['income']
y.head()
0    under50k
1    under50k
2    under50k
3    under50k
4    under50k
Name: income, dtype: object
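Before fitting anything, it is worth checking the class balance, since 'over50k' is the positive class for all of the precision/recall/F1 numbers below; a quick check:

# Show class proportions; with an imbalanced target, accuracy alone is misleading.
y.value_counts(normalize=True)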

Independent variables

# Continuous (numeric) variables
conti_var = train.columns[train.dtypes != 'object']
conti_var
Index(['age', 'education', 'capital_gain', 'capital_loss', 'hours_per_week'], dtype='object')
# Categorical variables, excluding the target (income)
cate_var = train.columns[train.dtypes == 'object'].difference(['income'])
cate_var
Index(['marital', 'occupation', 'race', 'relationship', 'sex', 'workclass'], dtype='object')
# Convert the categorical variables to dummy variables
dummy_var = pd.get_dummies(train[cate_var])
X = pd.concat([train[conti_var], dummy_var], axis=1)
X.head()
age education capital_gain capital_loss hours_per_week marital_Divorced marital_Married-AF-spouse marital_Married-civ-spouse marital_Married-spouse-absent marital_Never-married ... relationship_Wife sex_Female sex_Male workclass_Federal-gov workclass_Local-gov workclass_Private workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov workclass_Without-pay
0 39 13 2174 0 40 0 0 0 0 1 ... 0 0 1 0 0 0 0 0 1 0
1 50 13 0 0 13 0 0 1 0 0 ... 0 0 1 0 0 0 0 1 0 0
2 38 9 0 0 40 1 0 0 0 0 ... 0 0 1 0 0 1 0 0 0 0
3 53 7 0 0 40 0 0 1 0 0 ... 0 0 1 0 0 1 0 0 0 0
4 28 13 0 0 40 0 0 1 0 0 ... 1 1 0 0 0 1 0 0 0 0

5 rows × 46 columns

Train/test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape
(22499, 46)
X_test.shape
(2500, 46)
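train_test_split shuffles at random, so the class proportions in the two splits can drift slightly. A hedged alternative, if exact proportions matter, is to stratify on the target:

# Same 90/10 split, but with the class proportions preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y)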

Model evaluation function

# Classification/prediction results for people with an annual income over
# $50,000 ('over50k' is the positive class for precision/recall/F1)
def model_performance(y_test, y_pred):
    # With these string labels, confusion_matrix orders the classes
    # alphabetically, so row 0 / column 0 correspond to 'over50k'.
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(round(metrics.accuracy_score(y_test, y_pred), 3)))
    print('precision : {}'.format(round(metrics.precision_score(y_test, y_pred, pos_label='over50k'), 3)))
    print('recall : {}'.format(round(metrics.recall_score(y_test, y_pred, pos_label='over50k'), 3)))
    print('F1 : {}'.format(round(metrics.f1_score(y_test, y_pred, pos_label='over50k'), 3)))

1. Logistic Regression (Lasso / Ridge)

from sklearn.linear_model import LogisticRegression
def run_lr_model(penalties, Clist):
    for p, c in zip(penalties, Clist):
        print('---------- penalty : {}, C : {} ----------'.format(p, c))
        # Note: on scikit-learn >= 0.22 the default solver (lbfgs) does not
        # support penalty='l1'; pass solver='liblinear' to reproduce this.
        model = LogisticRegression(penalty=p, C=c)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')
  • penalty : l1 = Lasso / l2 = Ridge
  • Lasso (L1 regularization): drives some coefficients exactly to zero, i.e. automatic feature selection (see the sketch after this list).
  • Ridge (L2 regularization): shrinks coefficients but never makes them exactly zero.
  • C > 1: weights the error term more, fitting the training set as closely as possible / C < 1: weights the penalty more, shrinking the coefficients.
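To see the automatic feature selection directly, one can count how many of the 46 coefficients each Lasso model keeps; a minimal sketch (solver='liblinear' is passed explicitly because it supports the L1 penalty on all scikit-learn versions):

import numpy as np
from sklearn.linear_model import LogisticRegression

# Stronger regularization (smaller C) should zero out more coefficients.
for c in [0.001, 0.01, 0.1, 1, 100]:
    lasso = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    lasso.fit(X_train, y_train)
    print('C = {:>7} : {} nonzero coefficients'.format(c, (lasso.coef_ != 0).sum()))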
plist = ['l1'] * 5 + ['l2'] * 5
clist = [0.001, 0.01, 0.1, 1, 100] * 2
run_lr_model(plist, clist)
---------- penalty : l1, C : 0.001 ----------
confusion matrix
[[ 168  461]
 [  55 1816]]
accuracy : 0.794
precision : 0.753
recall : 0.267
F1 : 0.394


---------- penalty : l1, C : 0.01 ----------
confusion matrix
[[ 338  291]
 [ 107 1764]]
accuracy : 0.841
precision : 0.76
recall : 0.537
F1 : 0.629


---------- penalty : l1, C : 0.1 ----------
confusion matrix
[[ 374  255]
 [ 124 1747]]
accuracy : 0.848
precision : 0.751
recall : 0.595
F1 : 0.664


---------- penalty : l1, C : 1 ----------
confusion matrix
[[ 378  251]
 [ 129 1742]]
accuracy : 0.848
precision : 0.746
recall : 0.601
F1 : 0.665


---------- penalty : l1, C : 100 ----------
confusion matrix
[[ 379  250]
 [ 129 1742]]
accuracy : 0.848
precision : 0.746
recall : 0.603
F1 : 0.667


---------- penalty : l2, C : 0.001 ----------
confusion matrix
[[ 190  439]
 [  59 1812]]
accuracy : 0.801
precision : 0.763
recall : 0.302
F1 : 0.433


---------- penalty : l2, C : 0.01 ----------
confusion matrix
[[ 338  291]
 [  99 1772]]
accuracy : 0.844
precision : 0.773
recall : 0.537
F1 : 0.634


---------- penalty : l2, C : 0.1 ----------
confusion matrix
[[ 369  260]
 [ 121 1750]]
accuracy : 0.848
precision : 0.753
recall : 0.587
F1 : 0.66


---------- penalty : l2, C : 1 ----------
confusion matrix
[[ 374  255]
 [ 124 1747]]
accuracy : 0.848
precision : 0.751
recall : 0.595
F1 : 0.664


---------- penalty : l2, C : 100 ----------
confusion matrix
[[ 373  256]
 [ 124 1747]]
accuracy : 0.848
precision : 0.751
recall : 0.593
F1 : 0.663
# The Lasso model with C = 100 has the highest F1 score.

2. Decision Tree

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)
confusion matrix
[[ 410  219]
 [ 258 1613]]
accuracy : 0.809
precision : 0.614
recall : 0.652
F1 : 0.632
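The tree above is grown to full depth by default, so it almost certainly overfits the training set, which is consistent with it trailing the regularized models. A hedged sketch that caps the depth (max_depth=8 is illustrative, not tuned):

from sklearn.tree import DecisionTreeClassifier

# Limit tree depth to trade variance for bias; tune max_depth properly in practice.
pruned = DecisionTreeClassifier(max_depth=8, random_state=0)
pruned.fit(X_train, y_train)
model_performance(y_test, pruned.predict(X_test))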

3. Random Forest

from sklearn.ensemble import RandomForestClassifier
def run_rf_model(n_estimators, n_jobs):
    for ne, nj in zip(n_estimators, n_jobs):
        print('---------- n_estimators : {}, n_jobs : {} ----------'.format(ne, nj))
        model = RandomForestClassifier(n_estimators=ne, n_jobs=nj)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')
  • n_estimators = number of trees

  • n_jobs = number of jobs to run in parallel for both fit and predict. This only affects speed, not the fitted model, so the score differences across n_jobs values below are run-to-run randomness (no random_state is fixed). See the out-of-bag sketch after this list.
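Because n_jobs only parallelizes the work, a more useful check than looping over it is the out-of-bag estimate, which scores each tree on the bootstrap samples it never saw; a sketch:

from sklearn.ensemble import RandomForestClassifier

# oob_score=True gives a validation-style accuracy estimate without touching X_test.
rf = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1,
                            random_state=0)
rf.fit(X_train, y_train)
print('OOB accuracy : {:.3f}'.format(rf.oob_score_))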

n_estimators = [10, 10, 10, 100, 100, 100, 1000, 1000, 1000]
n_jobs = [1, 10, 100, 1, 10, 100, 1, 10, 100]
run_rf_model(n_estimators, n_jobs)
---------- n_estimators : 10, n_jobs : 1 ----------
confusion matrix
[[ 409  220]
 [ 195 1676]]
accuracy : 0.834
precision : 0.677
recall : 0.65
F1 : 0.663


---------- n_estimators : 10, n_jobs : 10 ----------
confusion matrix
[[ 420  209]
 [ 194 1677]]
accuracy : 0.839
precision : 0.684
recall : 0.668
F1 : 0.676


---------- n_estimators : 10, n_jobs : 100 ----------
confusion matrix
[[ 412  217]
 [ 189 1682]]
accuracy : 0.838
precision : 0.686
recall : 0.655
F1 : 0.67


---------- n_estimators : 100, n_jobs : 1 ----------
confusion matrix
[[ 397  232]
 [ 153 1718]]
accuracy : 0.846
precision : 0.722
recall : 0.631
F1 : 0.673


---------- n_estimators : 100, n_jobs : 10 ----------
confusion matrix
[[ 397  232]
 [ 156 1715]]
accuracy : 0.845
precision : 0.718
recall : 0.631
F1 : 0.672


---------- n_estimators : 100, n_jobs : 100 ----------
confusion matrix
[[ 398  231]
 [ 154 1717]]
accuracy : 0.846
precision : 0.721
recall : 0.633
F1 : 0.674


---------- n_estimators : 1000, n_jobs : 1 ----------
confusion matrix
[[ 392  237]
 [ 159 1712]]
accuracy : 0.842
precision : 0.711
recall : 0.623
F1 : 0.664


---------- n_estimators : 1000, n_jobs : 10 ----------
confusion matrix
[[ 398  231]
 [ 149 1722]]
accuracy : 0.848
precision : 0.728
recall : 0.633
F1 : 0.677


---------- n_estimators : 1000, n_jobs : 100 ----------
confusion matrix
[[ 394  235]
 [ 152 1719]]
accuracy : 0.845
precision : 0.722
recall : 0.626
F1 : 0.671
# The random forest with n_estimators : 1000, n_jobs : 100 has the highest F1 score here, higher than the Lasso logistic regression result (though, as noted above, the spread across n_jobs settings is just noise).

4. Gradient Boosting Tree

from sklearn.ensemble import GradientBoostingClassifier

(1) loss function : ‘deviance’ = logistic regression (default)

def run_gbt_model(n_estimators, l_rate):
    for ne, lr in zip(n_estimators, l_rate):
        print('---------- n_estimators : {}, learning_rate : {} ----------'.format(ne, lr))
        model = GradientBoostingClassifier(n_estimators=ne, learning_rate=lr)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')
  • n_estimators = number of boosting stages to perform (the sketch below shows how to pick this from a single fit, without refitting per value)
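Rather than refitting once per n_estimators value, a single fitted model can be scored at every intermediate stage with staged_predict; a sketch:

# Evaluate test-set F1 after each boosting stage of one fitted model.
gbt = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1)
gbt.fit(X_train, y_train)
f1_per_stage = [metrics.f1_score(y_test, y_stage, pos_label='over50k')
                for y_stage in gbt.staged_predict(X_test)]
best = max(range(len(f1_per_stage)), key=f1_per_stage.__getitem__)
print('best n_estimators : {}, F1 : {:.3f}'.format(best + 1, f1_per_stage[best]))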
n_estimators = [100, 100, 100, 1000, 1000, 1000]
l_rate = [0.1, 0.3, 0.5, 0.1, 0.3, 0.5]
run_gbt_model(n_estimators, l_rate)
---------- n_estimators : 100, learning_rate : 0.1 ----------
confusion matrix
[[ 379  250]
 [ 102 1769]]
accuracy : 0.859
precision : 0.788
recall : 0.603
F1 : 0.683


---------- n_estimators : 100, learning_rate : 0.3 ----------
confusion matrix
[[ 407  222]
 [ 111 1760]]
accuracy : 0.867
precision : 0.786
recall : 0.647
F1 : 0.71


---------- n_estimators : 100, learning_rate : 0.5 ----------
confusion matrix
[[ 414  215]
 [ 119 1752]]
accuracy : 0.866
precision : 0.777
recall : 0.658
F1 : 0.713


---------- n_estimators : 1000, learning_rate : 0.1 ----------
confusion matrix
[[ 420  209]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.668
F1 : 0.718


---------- n_estimators : 1000, learning_rate : 0.3 ----------
confusion matrix
[[ 423  206]
 [ 133 1738]]
accuracy : 0.864
precision : 0.761
recall : 0.672
F1 : 0.714


---------- n_estimators : 1000, learning_rate : 0.5 ----------
confusion matrix
[[ 415  214]
 [ 135 1736]]
accuracy : 0.86
precision : 0.755
recall : 0.66
F1 : 0.704

(2) loss function : ‘exponential’ = AdaBoost algorithm.

def run_gbtExp_model(n_estimators, l_rate):
    for ne, lr in zip(n_estimators, l_rate):
        print('---------- n_estimators : {}, learning_rate : {} ----------'.format(ne, lr))
        model = GradientBoostingClassifier(n_estimators=ne, learning_rate=lr, loss='exponential')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')
run_gbtExp_model(n_estimators, l_rate)
---------- n_estimators : 100, learning_rate : 0.1 ----------
confusion matrix
[[ 379  250]
 [ 102 1769]]
accuracy : 0.859
precision : 0.788
recall : 0.603
F1 : 0.683


---------- n_estimators : 100, learning_rate : 0.3 ----------
confusion matrix
[[ 407  222]
 [ 111 1760]]
accuracy : 0.867
precision : 0.786
recall : 0.647
F1 : 0.71


---------- n_estimators : 100, learning_rate : 0.5 ----------
confusion matrix
[[ 414  215]
 [ 119 1752]]
accuracy : 0.866
precision : 0.777
recall : 0.658
F1 : 0.713


---------- n_estimators : 1000, learning_rate : 0.1 ----------
confusion matrix
[[ 419  210]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.666
F1 : 0.717


---------- n_estimators : 1000, learning_rate : 0.3 ----------
confusion matrix
[[ 423  206]
 [ 132 1739]]
accuracy : 0.865
precision : 0.762
recall : 0.672
F1 : 0.715


---------- n_estimators : 1000, learning_rate : 0.5 ----------
confusion matrix
[[ 415  214]
 [ 135 1736]]
accuracy : 0.86
precision : 0.755
recall : 0.66
F1 : 0.704
  • Check feature importances for the tree with the highest F1 score and accuracy.
# feature importance
model = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(X_train, y_train)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
varDic = {'var':X_train.columns, 'importance':model.feature_importances_}
impVar = pd.DataFrame(varDic)
# Caution: [1:11] slices ranks 2-11, skipping the single most important
# feature; [:10] would give the actual top 10.
impVar.sort_values(by='importance', ascending=False)[1:11]
importance var
4 0.135903 hours_per_week
1 0.108185 education
3 0.104246 capital_loss
2 0.089625 capital_gain
7 0.031849 marital_Married-civ-spouse
43 0.021935 workclass_Self-emp-not-inc
42 0.020932 workclass_Self-emp-inc
15 0.019298 occupation_Exec-managerial
36 0.017739 relationship_Wife
23 0.017046 occupation_Sales
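The same ranking is easier to read as a bar chart; a sketch with matplotlib:

import matplotlib.pyplot as plt

# Horizontal bars for the ten largest importances, largest on top.
top10 = impVar.sort_values(by='importance', ascending=False).head(10)
plt.barh(range(len(top10)), top10['importance'][::-1])
plt.yticks(range(len(top10)), top10['var'][::-1])
plt.xlabel('importance')
plt.tight_layout()
plt.show()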

5. SVM

  • With kernel='linear', SVC takes an extremely long time to train on the 22,499 training rows.
  • Even dropping the categorical variables and classifying on the continuous variables alone is very slow. (A faster linear alternative is sketched after this list.)
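If a linear boundary is acceptable, LinearSVC (liblinear-based) trains far faster than SVC(kernel='linear') on data of this size; a hedged sketch (C and max_iter are illustrative, and scaling first would help convergence):

from sklearn.svm import LinearSVC

# A fast linear SVM; liblinear scales much better than the kernel solver.
lin_model = LinearSVC(C=1.0, max_iter=10000)
lin_model.fit(X_train, y_train)
model_performance(y_test, lin_model.predict(X_test))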
from sklearn.svm import SVC

RBF kernel

model = SVC(C=10)
model.fit(X_train, y_train)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)
confusion matrix
[[ 397  232]
 [ 119 1752]]
accuracy : 0.86
precision : 0.769
recall : 0.631
F1 : 0.693

Feature scaling

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = SVC(C=10)
model.fit(X_train_scaled, y_train)
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
y_pred = model.predict(X_test_scaled)
model_performance(y_test, y_pred)
confusion matrix
[[ 369  260]
 [ 122 1749]]
accuracy : 0.847
precision : 0.752
recall : 0.587
F1 : 0.659

6. k-NN

from sklearn.neighbors import KNeighborsClassifier
neighbors = range(1, 17, 2)  # number of nearest neighbors to try (odd values 1-15)
def run_knn_model(n_neighbors):
    for nn in n_neighbors:
        print('---------- knn : ' + str(nn) + ' ----------')
        model = KNeighborsClassifier(n_neighbors=nn)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')
run_knn_model(neighbors)
---------- knn : 1 ----------
confusion matrix
[[ 411  218]
 [ 245 1626]]
accuracy : 0.815
precision : 0.627
recall : 0.653
F1 : 0.64


---------- knn : 3 ----------
confusion matrix
[[ 414  215]
 [ 189 1682]]
accuracy : 0.838
precision : 0.687
recall : 0.658
F1 : 0.672


---------- knn : 5 ----------
confusion matrix
[[ 406  223]
 [ 160 1711]]
accuracy : 0.847
precision : 0.717
recall : 0.645
F1 : 0.679


---------- knn : 7 ----------
confusion matrix
[[ 392  237]
 [ 152 1719]]
accuracy : 0.844
precision : 0.721
recall : 0.623
F1 : 0.668


---------- knn : 9 ----------
confusion matrix
[[ 394  235]
 [ 155 1716]]
accuracy : 0.844
precision : 0.718
recall : 0.626
F1 : 0.669


---------- knn : 11 ----------
confusion matrix
[[ 400  229]
 [ 148 1723]]
accuracy : 0.849
precision : 0.73
recall : 0.636
F1 : 0.68


---------- knn : 13 ----------
confusion matrix
[[ 393  236]
 [ 151 1720]]
accuracy : 0.845
precision : 0.722
recall : 0.625
F1 : 0.67


---------- knn : 15 ----------
confusion matrix
[[ 383  246]
 [ 156 1715]]
accuracy : 0.839
precision : 0.711
recall : 0.609
F1 : 0.656
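k-NN is distance-based, so large-range columns such as capital_gain dominate the Euclidean distance when features are left unscaled. A sketch that scales inside a pipeline, reusing k=11 from the loop above:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# Scale, then classify; the pipeline applies the same scaling at predict time.
knn_scaled = Pipeline([('scale', MinMaxScaler()),
                       ('knn', KNeighborsClassifier(n_neighbors=11))])
knn_scaled.fit(X_train, y_train)
model_performance(y_test, knn_scaled.predict(X_test))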

7. Neural Network

from sklearn.neural_network import MLPClassifier

(1) adam : stochastic gradient-based optimizer

model = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(100,), max_iter=2000)
model.fit(X_train, y_train)
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)
confusion matrix
[[ 403  226]
 [ 171 1700]]
accuracy : 0.841
precision : 0.702
recall : 0.641
F1 : 0.67

(2) sgd : stochastic gradient descent

model = MLPClassifier(solver='sgd', activation='logistic', hidden_layer_sizes=(200,), max_iter=2000)
model.fit(X_train, y_train)
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)
confusion matrix
[[ 383  246]
 [ 215 1656]]
accuracy : 0.816
precision : 0.64
recall : 0.609
F1 : 0.624

Feature scaling + adam

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(100,), max_iter=2000)
model.fit(X_train_scaled, y_train)
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
y_pred = model.predict(X_test_scaled)
model_performance(y_test, y_pred)
confusion matrix
[[ 411  218]
 [ 159 1712]]
accuracy : 0.849
precision : 0.721
recall : 0.653
F1 : 0.686
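MLPClassifier also has built-in early stopping (the early_stopping and validation_fraction parameters visible in the printout above); a sketch that halts training when the internal validation score plateaus:

# Hold out 10% of the training data internally; stop when it stops improving.
model = MLPClassifier(solver='adam', activation='logistic',
                      hidden_layer_sizes=(100,), max_iter=2000,
                      early_stopping=True, validation_fraction=0.1)
model.fit(X_train_scaled, y_train)
model_performance(y_test, model.predict(X_test_scaled))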

Final prediction

newdata = pd.read_csv('data/newpeople.csv')
newdata.head()
age workclass education marital occupation relationship race sex capital_gain capital_loss hours_per_week
0 38 Private 9 Married-civ-spouse Other-service Husband White Male 0 0 40
1 34 Local-gov 11 Divorced Protective-serv Own-child Asian-Pac-Islander Male 0 0 40
2 51 Private 9 Married-civ-spouse Craft-repair Husband White Male 7298 0 50
3 48 Private 9 Married-civ-spouse Craft-repair Husband White Male 0 0 42
4 63 Private 14 Married-civ-spouse Prof-specialty Husband White Male 0 0 50
dummy_final = pd.get_dummies(newdata[cate_var])
X_final = pd.concat([newdata[conti_var], dummy_final], axis=1)
X_final.head()
age education capital_gain capital_loss hours_per_week marital_Divorced marital_Married-AF-spouse marital_Married-civ-spouse marital_Married-spouse-absent marital_Never-married ... relationship_Wife sex_Female sex_Male workclass_Federal-gov workclass_Local-gov workclass_Private workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov workclass_Without-pay
0 38 9 0 0 40 0 0 1 0 0 ... 0 0 1 0 0 1 0 0 0 0
1 34 11 0 0 40 1 0 0 0 0 ... 0 0 1 0 1 0 0 0 0 0
2 51 9 7298 0 50 0 0 1 0 0 ... 0 0 1 0 0 1 0 0 0 0
3 48 9 0 0 42 0 0 1 0 0 ... 0 0 1 0 0 1 0 0 0 0
4 63 14 0 0 50 0 0 1 0 0 ... 0 0 1 0 0 1 0 0 0 0

5 rows × 46 columns
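One caveat with calling get_dummies separately on new data: if any category level is missing from newpeople.csv, X_final ends up with fewer columns than the model expects, and the column order can differ from training. A defensive sketch:

# Align the prediction matrix with the training design matrix: create any
# missing dummy columns as all zeros and match the training column order.
X_final = X_final.reindex(columns=X.columns, fill_value=0)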

  • Final model selection: pick the model with the highest F1 score and accuracy.

  • Gradient Boosting Tree with n_estimators : 1000, learning_rate : 0.1 (default loss function).

# Fit and validate the selected model
selected_model = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1)
selected_model.fit(X_train, y_train)
y_pred = selected_model.predict(X_test)
model_performance(y_test, y_pred)
confusion matrix
[[ 420  209]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.668
F1 : 0.718
# Predict
y_final = selected_model.predict(X_final)
y_final
array(['under50k', 'under50k', 'over50k', ..., 'under50k', 'under50k',
       'over50k'], dtype=object)
# Save the predictions
import numpy as np
np.savetxt('final.csv', y_final, fmt='%s')