import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# import data
# Looks like the WDBC breast-cancer dataset (569 rows, M/B diagnosis) — TODO confirm.
# First CSV column is used as the row index (sample id).
df = pd.read_csv("./data.csv", index_col=0)
# Encode the target: 'M' (malignant) -> 1, 'B' (benign) -> 0.
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})
# Summary statistics (notebook display; output table follows).
df.describe()
diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 0.372583 | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std | 0.483918 | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min | 0.000000 | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
25% | 0.000000 | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
50% | 0.000000 | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
75% | 1.000000 | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
max | 1.000000 | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 31 columns
# Split Data into training and test (70% and 30%)
# random_state=1 makes the split reproducible.
train, test = train_test_split(df, test_size = 0.3, random_state=1)
# Separate the features (X) from the target column (y).
train_x = train.drop(['diagnosis'], axis=1)
train_y = train.loc[:,['diagnosis']]
test_x = test.drop(['diagnosis'], axis=1)
test_y = test.loc[:,['diagnosis']]
# Convert DataFrames to plain NumPy arrays for the hand-rolled gradient code below.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)
# Notebook display: 398 training samples, 30 features; y keeps shape (398, 1).
train_x.shape, train_y.shape
((398, 30), (398, 1))
# Append a constant column of ones so the last coefficient acts as the intercept.
train_x = np.concatenate([train_x, np.ones((train_x.shape[0], 1))], axis=1)
test_x = np.concatenate([test_x, np.ones((test_x.shape[0], 1))], axis=1)
# Notebook display: feature matrices are now 31 columns wide (30 features + bias).
train_x.shape, test_x.shape
((398, 31), (171, 31))
Sigmoid, f and gradient functions
def sig(z):
    """Numerically stable logistic sigmoid, 1 / (1 + e^(-z)).

    `z` is clipped to [-500, 500] before exponentiation: np.exp overflows
    (RuntimeWarning, inf) around 709, while the sigmoid is already fully
    saturated at 0/1 in float64 well before |z| = 500, so the clipping
    changes no representable output value.
    """
    z = np.clip(z, -500, 500)
    return 1. / (1 + np.exp(-z))
def grad(x, y, alpha):
    """Gradient of the mean log-likelihood with respect to alpha.

    Each sample i contributes (y_i - sigmoid(x_i . alpha)) * x_i; the
    returned vector is the average of those contributions over all rows.
    """
    p = 1. / (1 + np.exp(-np.dot(x, alpha)))  # predicted probabilities, shape (n,)
    residual = y - p                          # prediction error per sample
    return (residual[:, np.newaxis] * x).mean(axis=0)
def f(x, y, alpha):
    """Mean log-likelihood of the logistic-regression model.

    For z = x . alpha this is mean_i[ y_i * z_i + log(sigmoid(-z_i)) ].
    log(sigmoid(-z)) = -log(1 + e^z) is evaluated as -np.logaddexp(0, z),
    which is mathematically identical but stays finite for large |z|,
    where np.log(sig(-z)) underflows to log(0) = -inf.
    """
    z = np.dot(x, alpha)
    return (y * z - np.logaddexp(0, z)).mean()
Check that the gradient is an ascent direction for f (f increases after a small step along g)
# Sanity check: g is the gradient of f, so a small step along +g must
# increase f (ascent direction). The two printed values confirm this.
alpha = np.zeros(31)  # 30 features + 1 intercept coefficient
x = train_x
y = train_y[:,0]  # flatten the (n, 1) target to shape (n,)
g = grad(x, y, alpha)
f(x, y, alpha), f(x, y, alpha + 1e-6*g)
(-0.6931471805599452, -0.6825210687368062)
Perform gradient ascent (maximize the log-likelihood)
# Gradient ASCENT on the mean log-likelihood f: alpha moves along +g, so f
# increases toward its maximum (equivalently, descent on the negative
# log-likelihood). Fixed step size 4e-6, 10000 iterations.
alpha = np.zeros(31)
for k in range(10000):
    g = grad(x, y, alpha)
    alpha = alpha + 4e-6 * g
    # Log progress every 1000 iterations. A plain if-statement replaces the
    # original side-effecting conditional expression
    # `print(...) if not(k % 1000) else None`, which is an anti-pattern.
    if k % 1000 == 0:
        print(f(x, y, alpha))
-0.6823035814678295 -0.3118002555207603 -0.2649502290723981 -0.24489842888584296 -0.23340153590821244 -0.22588835631609905 -0.2206139599565642 -0.21673050561269097 -0.21376574569535084 -0.2114344094751581
# Classify the test set with the fitted coefficients: probability > 0.5 -> 1 (M).
y_pred_test = (sig(test_x.dot(alpha)) > 0.5).astype('int')
# Report accuracy as a percentage. The original format spec ':2f' meant
# "minimum width 2, default 6 decimals" — a typo for ':.2f' (2 decimals).
f'Test_accuracy {100*(y_pred_test == test_y[:,0]).mean():.2f}%'
'Test_accuracy 92.982456%'