import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
from sklearn.model_selection import train_test_split
# Split into training and test sets (default: 25 % held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.svm import LinearSVC
svm = LinearSVC(C=1).fit(X_train, y_train)
print('Score:', svm.score(X_test, y_test))
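LinearSVC is sensitive to feature scale and often emits convergence warnings on the raw breast-cancer features; an optional sketch of the usual remedy, standardizing inside a pipeline (scaled_svm is just an illustrative name, and the score will differ slightly from the one above):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svm = make_pipeline(StandardScaler(), LinearSVC(C=1)).fit(X_train, y_train)
print('Score (scaled):', scaled_svm.score(X_test, y_test))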
from sklearn.model_selection import validation_curve
C_range = np.logspace(-6, -2, 50)  # 50 log-spaced values of C between 1e-6 and 1e-2
train_scores, valid_scores = validation_curve(LinearSVC(), X_train, y_train,
                                              param_name="C", param_range=C_range, cv=10)
train_scores_mean = np.mean(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
plt.plot(np.log10(C_range), train_scores_mean, label="training scores")
plt.plot(np.log10(C_range), valid_scores_mean, label="validation scores")
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('score')
plt.show()
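The mean alone hides the fold-to-fold variability; an optional sketch of the same plot, shading one standard deviation across the 10 folds:
train_scores_std = np.std(train_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
plt.plot(np.log10(C_range), train_scores_mean, label="training scores")
plt.fill_between(np.log10(C_range), train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2)
plt.plot(np.log10(C_range), valid_scores_mean, label="validation scores")
plt.fill_between(np.log10(C_range), valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.2)
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('score')
plt.show()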
# Refit with the C that maximizes the mean validation score
C_best = C_range[np.argmax(valid_scores_mean)]
svm = LinearSVC(C=C_best).fit(X_train, y_train)
print('Score:', svm.score(X_test, y_test))
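The same selection can be delegated to GridSearchCV, which performs the cross-validated sweep and refits the best model automatically; a minimal sketch under the same settings (same estimator, same C_range, 10-fold CV):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(LinearSVC(), param_grid={'C': C_range}, cv=10)
grid.fit(X_train, y_train)
print('Best C:', grid.best_params_['C'])
print('Score:', grid.score(X_test, y_test))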
y_test_predict = svm.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix_test = confusion_matrix(y_test, y_test_predict)
print(confusion_matrix_test)
# Class 0 (malignant) is treated as the positive class here.
# Rows of the confusion matrix are true labels, columns are predicted labels.
tp = confusion_matrix_test[0,0]  # true 0, predicted 0
fp = confusion_matrix_test[1,0]  # true 1, predicted 0
fn = confusion_matrix_test[0,1]  # true 0, predicted 1
recall = tp/(tp+fn)
precision = tp/(tp+fp)
print('Recall:', recall)
print('Precision:', precision)
from sklearn.metrics import recall_score, precision_score
print('Recall:', recall_score(y_test, y_test_predict, pos_label=0))
print('Precision:', precision_score(y_test, y_test_predict, pos_label=0))
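As a cross-check, classification_report gathers the per-class precision and recall in one table (target_names comes from the dataset object loaded above):
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_predict, target_names=data.target_names))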
def false_positive_rate(y_true, y_predict, pos_label):
    # Fraction of true negatives wrongly predicted as positive: FP / (FP + TN)
    return np.sum(y_true[y_predict == pos_label] != pos_label) / np.sum(y_true != pos_label)

print('False positive rate:', false_positive_rate(y_test, y_test_predict, 0))
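A tiny worked example on hypothetical arrays, just to check the formula: with one of the two true negatives predicted positive, the rate should be 0.5.
y_true_demo = np.array([0, 0, 1, 1])   # class 0 is the positive class
y_pred_demo = np.array([0, 0, 0, 1])   # one true negative predicted as 0
print(false_positive_rate(y_true_demo, y_pred_demo, 0))  # 0.5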
def modified_predictor(X, tau):
    # Predict class 1 where the decision function reaches the threshold tau
    return (svm.decision_function(X) >= tau).astype('int')
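A quick sanity check (assuming, as holds for a binary LinearSVC, that predict returns class 1 exactly where the decision function is positive): with tau = 0 the modified predictor should agree with svm.predict, up to ties at exactly 0.
print(np.mean(modified_predictor(X_test, 0) == svm.predict(X_test)))  # expected: 1.0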
A very poor predictor has a recall (true positive rate) barely higher than its false positive rate, so its ROC curve lies only slightly above the diagonal (the curve with coordinates $(t,t)$ for $t\in [0,1]$). For a good predictor, by contrast, the ROC curve lies well above the diagonal.
# Sweep the threshold over the range of decision values observed on the training set
decision_function_train = svm.decision_function(X_train)
tau_range = np.linspace(np.min(decision_function_train), np.max(decision_function_train), 100)
recalls = []
fprs = []
for tau in tau_range:
    y_train_predict = modified_predictor(X_train, tau)
    recalls.append(recall_score(y_train, y_train_predict, pos_label=0))
    fprs.append(false_positive_rate(y_train, y_train_predict, 0))
plt.plot(fprs, recalls)
plt.xlabel('false positive rate')
plt.ylabel('recall (true positive rate)')
plt.show()
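The same curve can be obtained with sklearn.metrics.roc_curve; since class 0 is the positive class here while the decision function grows toward class 1, the scores are negated (a cross-check sketch):
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train, -decision_function_train, pos_label=0)
plt.plot(fpr, tpr)
plt.xlabel('false positive rate')
plt.ylabel('recall (true positive rate)')
plt.show()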
recalls_array = np.array(recalls)
fprs_array = np.array(fprs)
# Among thresholds reaching at least 95 % recall, keep the one with the lowest false positive rate
good_enough_recalls_index = (recalls_array >= .95)
tau_best = (tau_range[good_enough_recalls_index])[np.argmin(fprs_array[good_enough_recalls_index])]
y_test_predict = modified_predictor(X_test, tau_best)
from sklearn.metrics import accuracy_score
print('Accuracy score:', accuracy_score(y_test, y_test_predict))
print('Recall:', recall_score(y_test, y_test_predict, pos_label=0))
print('Precision:', precision_score(y_test, y_test_predict, pos_label=0))
print('False positive rate:', false_positive_rate(y_test, y_test_predict, pos_label=0))
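Finally, the area under the ROC curve summarizes the whole threshold sweep in a single number; a sketch with the same pos_label=0 convention as above (scores negated so that class 0 counts as positive):
from sklearn.metrics import roc_auc_score
print('AUC:', roc_auc_score(y_test == 0, -svm.decision_function(X_test)))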