import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# import data
# Looks like the WDBC breast-cancer dataset (569 rows, M/B diagnosis) — TODO confirm.
# First CSV column is used as the row index (sample id).
df = pd.read_csv("./data.csv", index_col=0)
# Encode the target: 'M' (malignant) -> 1, 'B' (benign) -> 0.
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})
# Summary statistics (notebook display; output table follows).
df.describe()
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 0.372583 | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std | 0.483918 | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min | 0.000000 | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
25% | 0.000000 | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
50% | 0.000000 | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
75% | 1.000000 | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
max | 1.000000 | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 31 columns
# Split Data into training and test (70% and 30%)
# random_state=1 makes the split reproducible.
train, test = train_test_split(df, test_size = 0.3, random_state=1)
# Separate the features (X) from the target column (y).
train_x = train.drop(['diagnosis'], axis=1)
train_y = train.loc[:,['diagnosis']]
test_x = test.drop(['diagnosis'], axis=1)
test_y = test.loc[:,['diagnosis']]
# Convert DataFrames to plain NumPy arrays for the hand-rolled gradient code below.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)
# Notebook display: 398 training samples, 30 features; y keeps shape (398, 1).
train_x.shape, train_y.shape
((398, 30), (398, 1))
# Append a constant column of ones so the last coefficient acts as the intercept.
train_x = np.concatenate([train_x, np.ones((train_x.shape[0], 1))], axis=1)
test_x = np.concatenate([test_x, np.ones((test_x.shape[0], 1))], axis=1)
# Notebook display: feature matrices are now 31 columns wide (30 features + bias).
train_x.shape, test_x.shape
((398, 31), (171, 31))
Sigmoid, f and gradient functions
def sig(z):
    """Numerically stable logistic sigmoid, 1 / (1 + e^(-z)).

    `z` is clipped to [-500, 500] before exponentiation: np.exp overflows
    (RuntimeWarning, inf) around 709, while the sigmoid is already fully
    saturated at 0/1 in float64 well before |z| = 500, so the clipping
    changes no representable output value.
    """
    z = np.clip(z, -500, 500)
    return 1. / (1 + np.exp(-z))
def grad(x, y, alpha):
    """Gradient of the mean log-likelihood with respect to alpha.

    Each sample i contributes (y_i - sigmoid(x_i . alpha)) * x_i; the
    returned vector is the average of those contributions over all rows.
    """
    p = 1. / (1 + np.exp(-np.dot(x, alpha)))  # predicted probabilities, shape (n,)
    residual = y - p                          # prediction error per sample
    return (residual[:, np.newaxis] * x).mean(axis=0)
def f(x, y, alpha):
    """Mean log-likelihood of the logistic-regression model.

    For z = x . alpha this is mean_i[ y_i * z_i + log(sigmoid(-z_i)) ].
    log(sigmoid(-z)) = -log(1 + e^z) is evaluated as -np.logaddexp(0, z),
    which is mathematically identical but stays finite for large |z|,
    where np.log(sig(-z)) underflows to log(0) = -inf.
    """
    z = np.dot(x, alpha)
    return (y * z - np.logaddexp(0, z)).mean()
Check that the gradient is an ascent direction for f (f increases after a small step along g)
# Sanity check: g is the gradient of f, so a small step along +g must
# increase f (ascent direction). The two printed values confirm this.
alpha = np.zeros(31)  # 30 features + 1 intercept coefficient
x = train_x
y = train_y[:,0]  # flatten the (n, 1) target to shape (n,)
g = grad(x, y, alpha)
f(x, y, alpha), f(x, y, alpha + 1e-6*g)
(-0.6931471805599452, -0.6825210687368062)
Perform gradient ascent (maximize the log-likelihood)
# Gradient ASCENT on the mean log-likelihood f: alpha moves along +g, so f
# increases toward its maximum (equivalently, descent on the negative
# log-likelihood). Fixed step size 4e-6, 10000 iterations.
alpha = np.zeros(31)
for k in range(10000):
    g = grad(x, y, alpha)
    alpha = alpha + 4e-6 * g
    # Log progress every 1000 iterations. A plain if-statement replaces the
    # original side-effecting conditional expression
    # `print(...) if not(k % 1000) else None`, which is an anti-pattern.
    if k % 1000 == 0:
        print(f(x, y, alpha))
-0.6823035814678295 -0.3118002555207603 -0.2649502290723981 -0.24489842888584296 -0.23340153590821244 -0.22588835631609905 -0.2206139599565642 -0.21673050561269097 -0.21376574569535084 -0.2114344094751581
# Classify the test set with the fitted coefficients: probability > 0.5 -> 1 (M).
y_pred_test = (sig(test_x.dot(alpha)) > 0.5).astype('int')
# Report accuracy as a percentage. The original format spec ':2f' meant
# "minimum width 2, default 6 decimals" — a typo for ':.2f' (2 decimals).
f'Test_accuracy {100*(y_pred_test == test_y[:,0]).mean():.2f}%'
'Test_accuracy 92.982456%'