[딥러닝] Logistic Regression을 이용하여 서비스 이탈고객 예측 (Binary Classification)


import tensorflow as tf
import numpy as np
tf.set_random_seed(777)

1. 당뇨병 예측 모델링

# Load the diabetes CSV as a float32 matrix. The last column is the label;
# every column before it is a feature.
xy = np.loadtxt('data/diabetes.csv', dtype=np.float32, delimiter=',')
# Indexing the label with a list ([-1]) keeps it two-dimensional (rows x 1).
x_data, y_data = xy[:, :-1], xy[:, [-1]]
x_data.shape
(759, 8)
y_data.shape
(759, 1)
# Graph inputs: rows of 8 features and one label per row (batch size is open).
X = tf.placeholder(tf.float32, shape=[None, 8])
Y = tf.placeholder(tf.float32, shape=[None, 1])

# Trainable parameters, drawn from a normal distribution at initialisation.
W = tf.Variable(tf.random_normal([8, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Linear score before the sigmoid; named so the loss can be built from it.
logits = tf.matmul(X, W) + b
# Sigmoid function
hypothesis = tf.sigmoid(logits)

# Binary cross-entropy, mathematically the same as
#   -mean(Y * log(hypothesis) + (1 - Y) * log(1 - hypothesis))
# but computed from the logits: when the sigmoid saturates to exactly 0 or 1
# in float32, tf.log(hypothesis) / tf.log(1 - hypothesis) becomes -inf and the
# cost turns into NaN. The logits-based op avoids taking log(0).
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)
# Threshold the hypothesis at 0.5: 1.0 if hypothesis > 0.5, otherwise 0.0.
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
# Fraction of rows whose thresholded prediction equals the label.
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))
# Launch the graph: initialise variables, run the train op, then evaluate.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # The same full data set is fed on every step and for the final report.
    feed = {X: x_data, Y: y_data}

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict=feed)
        # Print the cost once every 1000 steps.
        if not step % 1000:
            print(step, cost_val)

    # Accuracy report (evaluated on the same data that was used for training)
    h, c, a = sess.run([hypothesis, predicted, accuracy], feed_dict=feed)
0 0.82794
1000 0.669853
2000 0.606246
3000 0.566555
4000 0.541229
5000 0.524479
6000 0.512969
7000 0.504778
8000 0.498771
9000 0.494249
10000 0.490767
# Show the first ten rows of h, the hypothesis values fetched from the session.
print("Hypothesis: \n", h[:10])
Hypothesis: 
 [[ 0.44348484]
 [ 0.91536468]
 [ 0.22591162]
 [ 0.93583125]
 [ 0.33763626]
 [ 0.70926887]
 [ 0.94409138]
 [ 0.63417912]
 [ 0.25953037]
 [ 0.46434346]]
# Show the first ten rows of c, the 0/1 predictions (hypothesis > 0.5).
print("Predicted: \n", c[:10])
Predicted: 
 [[ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]]
# a is the accuracy computed on the same data the model was trained on.
print("Accuracy: ", a)
Accuracy:  0.762846

2. 서비스 이탈자 예측 모델링

import pandas as pd
# Inspect the features: read the training CSV with pandas to see its columns.
df = pd.read_csv('data/churnTrain.csv')
df.columns
Index(['account_length', 'international_plan', 'voice_mail_plan',
       'number_vmail_messages', 'total_day_minutes', 'total_day_calls',
       'total_day_charge', 'total_eve_minutes', 'total_eve_calls',
       'total_eve_charge', 'total_night_minutes', 'total_night_calls',
       'total_night_charge', 'total_intl_minutes', 'total_intl_calls',
       'total_intl_charge', 'number_customer_service_calls', 'churn'],
      dtype='object')
# churn: whether the customer left (cancelled) the service
df.shape
(3333, 18)

Training Data

# Load the churn training CSV as float32, skipping the header row.
# The last column is the label; every column before it is a feature.
cData = np.loadtxt('data/churnTrain.csv', dtype=np.float32, delimiter=',', skiprows=1)
# Indexing the label with a list ([-1]) keeps it two-dimensional (rows x 1).
x_data, y_data = cData[:, :-1], cData[:, [-1]]
x_data.shape
(3333, 17)
y_data.shape
(3333, 1)
# Standardization: subtract each column's mean and divide by its standard
# deviation so that the features are on a comparable scale.
col_mean = x_data.mean(axis=0)
col_std = x_data.std(axis=0)
xn_data = (x_data - col_mean) / col_std
xn_data
array([[ 0.67649007, -0.32758394,  1.61706638, ..., -0.60119295,
        -0.08568114, -0.42793074],
       [ 0.14906523, -0.32758394,  1.61706638, ..., -0.60119295,
         1.24117649, -0.42793074],
       [ 0.90252924, -0.32758394, -0.61838871, ...,  0.21153317,
         0.69716471, -1.1882149 ],
       ..., 
       [-1.83505678, -0.32758394, -0.61838871, ...,  0.61789626,
         1.38713062,  0.33235341],
       [ 2.08295631,  3.05271697, -0.61838871, ...,  2.2433486 ,
        -1.87693894,  0.33235341],
       [-0.6797452 , -0.32758394,  1.61706638, ..., -0.1948299 ,
         1.24117649, -1.1882149 ]], dtype=float32)

Test Data

# Load the churn test CSV as float32, skipping the header row.
# The last column is the label; every column before it is a feature.
cData2 = np.loadtxt('data/churnTest.csv', delimiter=',', dtype=np.float32, skiprows=1)
x2_data = cData2[:, 0:-1]
y2_data = cData2[:, [-1]]
# Standardization. The test features are scaled with the TRAINING set's
# per-column mean and standard deviation (x_data holds the churn training
# features at this point). Using the test set's own statistics would place the
# test rows in a different feature space from the one the weights are fitted
# in, and would let test-set information leak into preprocessing.
xn2_data = (x2_data - np.mean(x_data, axis=0)) / np.std(x_data, axis=0)

Modeling

# Graph inputs: rows of 17 features and one label per row (batch size is open).
X = tf.placeholder(tf.float32, shape=[None, 17])
Y = tf.placeholder(tf.float32, shape=[None, 1])

# Trainable parameters, drawn from a normal distribution at initialisation.
W = tf.Variable(tf.random_normal([17, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Linear score before the sigmoid; named so the loss can be built from it.
logits = tf.matmul(X, W) + b
hypothesis = tf.sigmoid(logits)

# Binary cross-entropy, mathematically the same as
#   -mean(Y * log(hypothesis) + (1 - Y) * log(1 - hypothesis))
# but computed from the logits: when the sigmoid saturates to exactly 0 or 1
# in float32, tf.log(hypothesis) / tf.log(1 - hypothesis) becomes -inf and the
# cost turns into NaN. The logits-based op avoids taking log(0).
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)
# Threshold the hypothesis at 0.5: 1.0 if hypothesis > 0.5, otherwise 0.0.
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
# Fraction of rows whose thresholded prediction equals the label.
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

Launch graph

# Initialise variables, run the train op on the standardized training data,
# then evaluate on both the training and the test sets.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Feed dictionaries are built once and reused.
    train_feed = {X: xn_data, Y: y_data}
    test_feed = {X: xn2_data, Y: y2_data}

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict=train_feed)
        # Print the cost once every 1000 steps.
        if not step % 1000:
            print(step, cost_val)

    # Accuracy report: the same three tensors are fetched for each data set.
    report = [hypothesis, predicted, accuracy]
    # Training data
    h, c, a = sess.run(report, feed_dict=train_feed)
    # Test data
    h2, c2, a2 = sess.run(report, feed_dict=test_feed)
0 1.54038
1000 0.379552
2000 0.33152
3000 0.326789
4000 0.326013
5000 0.325778
6000 0.32565
7000 0.325551
8000 0.325466
9000 0.325389
10000 0.325317
# a is the accuracy fetched with the standardized training data as input.
print("Training Data Accuracy: ", a)
Training Data Accuracy:  0.862286
# a2 is the accuracy fetched with the standardized test data as input.
print("Test Data Accuracy: ", a2)
Test Data Accuracy:  0.871026