import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv('wine_dataset.csv')
data.head()
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | style |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | red |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | red |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | red |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
data
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | style |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | red |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | red |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | red |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | red |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | red |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6492 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 | white |
6493 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 | white |
6494 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 | white |
6495 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 | white |
6496 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 | white |
6497 rows × 13 columns
The dataset contains 13 variables and 6,497 examples.
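Before modelling, a quick sanity check of the class balance and of missing values is useful (a minimal sketch; it assumes the target column is named style, as in the table above):
# Class balance of the target variable (the rebalancing steps later in this notebook suggest white wines dominate)
print(data['style'].value_counts())
# Check for missing values before fitting any model
print(data.isna().sum())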
data = data.drop(columns=['quality'])
g = sns.FacetGrid(data=data, hue='style')
g.map(sns.histplot, 'fixed_acidity').add_legend()
(Figure: histograms of fixed_acidity for red and white wines.)
for col in data.columns.drop('style'):
    g = sns.FacetGrid(data=data, hue='style')
    g.map(sns.histplot, col)
Based on these histograms, it seems reasonable to select volatile_acidity, chlorides, free_sulfur_dioxide and total_sulfur_dioxide as explanatory variables.
data_reduced = data[['volatile_acidity', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'style']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_reduced.drop(columns='style'), data_reduced['style'])
print(y_train.size)
print(y_test.size)
print(y_train.size/(y_train.size+y_test.size))
4872
1625
0.7498845621055872
The training and test samples therefore represent roughly 75% and 25% of the data.
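For reference, the same split can be made explicit and reproducible with the following options (the results in this notebook were obtained with the default call above; the seed value is arbitrary):
# Explicit 75/25 split with a fixed seed; stratify keeps the red/white proportions identical in both samples
X_train, X_test, y_train, y_test = train_test_split(
    data_reduced.drop(columns='style'), data_reduced['style'],
    test_size=0.25, random_state=0, stratify=data_reduced['style'])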
from sklearn.linear_model import LogisticRegression
f = LogisticRegression()
f.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
print('Training error:', 1-f.score(X_train, y_train))
print('Test error:', 1-f.score(X_test, y_test))
Training error: 0.04022988505747127
Test error: 0.03876923076923078
In what follows, we can work directly with the score rather than the error.
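For classifiers, score is simply the accuracy, so the error computed above is 1 minus the accuracy. As a small check (a sketch using the f fitted above):
from sklearn.metrics import accuracy_score
# A classifier's score() is its accuracy; the test error above is therefore 1 - accuracy
assert np.isclose(f.score(X_test, y_test), accuracy_score(y_test, f.predict(X_test)))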
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(X_train,y_train)
perceptron.score(X_test,y_test)
0.8012307692307692
from sklearn.neighbors import KNeighborsClassifier
knn = {}
score_train = []
score_test = []
k_range = range(1,20)
for k in k_range:
    knn[k] = KNeighborsClassifier(n_neighbors=k)
    knn[k].fit(X_train,y_train)
    score_train.append(knn[k].score(X_train,y_train))
    score_test.append(knn[k].score(X_test,y_test))
k_best = k_range[np.argmax(score_test)]  # convert the argmax index back to the corresponding value of k
print('Test score of the best kNN predictor:', score_test[k_best-1])
plt.plot(list(k_range),score_train,label='training score')
plt.plot(list(k_range),score_test,label='test score')
plt.legend()
plt.show()
Test score of the best kNN predictor: 0.936
Logistic regression gives the best predictor. The best kNN predictor is slightly worse, and the perceptron is clearly worse.
pd.crosstab(y_test,f.predict(X_test))
style \ col_0 | red | white |
---|---|---|
red | 358 | 36 |
white | 30 | 1201 |
In the confusion matrix, rows correspond to the true labels and columns to the predicted labels: each cell gives the number of examples with a given true label that received a given predicted label. Proportionally, a red wine is more often predicted as white than the other way around.
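To read these counts as proportions directly, the confusion matrix can be normalised by row (a short sketch; normalize='index' divides each row by its total, so the off-diagonal entries are the per-class misclassification rates):
pd.crosstab(y_test, f.predict(X_test), normalize='index')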
from sklearn.utils import resample,shuffle
test_label_counts = y_test.value_counts()
X_test_white = X_test[y_test=='white']
y_test_white = y_test[y_test=='white']
X_test_less_white, y_test_less_white = resample(X_test_white,y_test_white,n_samples=test_label_counts['red'],replace=False)
X_test_ = pd.concat([X_test_less_white,X_test[y_test=='red']])
y_test_ = pd.concat([y_test_less_white,y_test[y_test=='red']])
X_test_, y_test_ = shuffle(X_test_,y_test_)
print(y_test_.value_counts())
white    394
red      394
Name: style, dtype: int64
print('Logistic regression:', f.score(X_test_,y_test_))
print('Perceptron:', perceptron.score(X_test_,y_test_))
print('Best kNN:', knn[k_best].score(X_test_,y_test_))
Logistic regression: 0.9441624365482234
Perceptron: 0.5901015228426396
Best kNN: 0.8972081218274112
When the test sample contains as many red wines as white wines, each predictor's score is lower: slightly for logistic regression and kNN, markedly for the perceptron.
train_label_counts = y_train.value_counts()
X_train_white = X_train[y_train=='white']
y_train_white = y_train[y_train=='white']
X_train_less_white, y_train_less_white = resample(X_train_white,y_train_white,n_samples=train_label_counts['red'],replace=False)
X_train_ = pd.concat([X_train_less_white,X_train[y_train=='red']])
y_train_ = pd.concat([y_train_less_white,y_train[y_train=='red']])
X_train_, y_train_ = shuffle(X_train_,y_train_)
print(y_train_.value_counts())
white    1205
red      1205
Name: style, dtype: int64
f_ = LogisticRegression()
f_.fit(X_train_, y_train_)
print('Logistic regression:', f_.score(X_test_,y_test_))
perceptron_ = Perceptron()
perceptron_.fit(X_train_,y_train_)
print('Perceptron:', perceptron_.score(X_test_,y_test_))
knn_ = {}
score_train = []
score_test = []
k_range = range(1,20)
for k in k_range:
    knn_[k] = KNeighborsClassifier(n_neighbors=k)
    knn_[k].fit(X_train_,y_train_)
    score_test.append(knn_[k].score(X_test_,y_test_))
k_best = np.argmax(score_test)
print('Best kNN:', score_test[k_best])
Logistic regression: 0.9517766497461929
Perceptron: 0.7411167512690355
Best kNN: 0.899746192893401
Even though the training sample is now smaller, the fact that it contains as many red wines as white wines leads to a slight improvement in the scores of the resulting predictors.
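Instead of undersampling the majority class, many scikit-learn estimators can reweight the classes directly. A sketch of this alternative (f_weighted is an illustrative name; it is not evaluated elsewhere in this notebook):
# class_weight='balanced' weights each class inversely to its frequency in y_train,
# mimicking a balanced training sample without discarding any white wines
f_weighted = LogisticRegression(class_weight='balanced')
f_weighted.fit(X_train, y_train)
print('Logistic regression (class_weight=balanced):', f_weighted.score(X_test_, y_test_))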
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='style'), data['style'])
train_label_counts = y_train.value_counts()
X_train_white = X_train[y_train=='white']
y_train_white = y_train[y_train=='white']
X_train_less_white, y_train_less_white = resample(X_train_white,y_train_white,n_samples=train_label_counts['red'],replace=False)
X_train_ = pd.concat([X_train_less_white,X_train[y_train=='red']])
y_train_ = pd.concat([y_train_less_white,y_train[y_train=='red']])
X_train_, y_train_ = shuffle(X_train_,y_train_)
test_label_counts = y_test.value_counts()
X_test_white = X_test[y_test=='white']
y_test_white = y_test[y_test=='white']
X_test_less_white, y_test_less_white = resample(X_test_white,y_test_white,n_samples=test_label_counts['red'],replace=False)
X_test_ = pd.concat([X_test_less_white,X_test[y_test=='red']])
y_test_ = pd.concat([y_test_less_white,y_test[y_test=='red']])
X_test_, y_test_ = shuffle(X_test_,y_test_)
f_ = LogisticRegression()
f_.fit(X_train_, y_train_)
print('Logistic regression:', f_.score(X_test_,y_test_))
perceptron_ = Perceptron()
perceptron_.fit(X_train_,y_train_)
print('Perceptron:', perceptron_.score(X_test_,y_test_))
knn_ = {}
score_train = []
score_test = []
k_range = range(1,20)
for k in k_range:
    knn_[k] = KNeighborsClassifier(n_neighbors=k)
    knn_[k].fit(X_train_,y_train_)
    score_test.append(knn_[k].score(X_test_,y_test_))
k_best = np.argmax(score_test)
print('Best kNN:', score_test[k_best])
Logistic regression: 0.9798387096774194
Perceptron: 0.9086021505376344
Best kNN: 0.9274193548387096
We observe a slight improvement for logistic regression and kNN, and a clear improvement for the perceptron.