import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


data = pd.read_csv('wine_dataset.csv')
data.head()


data


data = data.drop(columns=['quality'])


g = sns.FacetGrid(data=data, hue='style')
g.map(sns.histplot, 'fixed_acidity').add_legend()

<seaborn.axisgrid.FacetGrid at 0x7f60e3ff2280>


for col in data.columns.drop('style'):
   g = sns.FacetGrid(data=data, hue='style')
   g.map(sns.histplot, col)


data_reduced = data[['volatile_acidity', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'style']]


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_reduced.drop(columns='style'), data_reduced['style'])


print(y_train.size)
print(y_test.size)
print(y_train.size/(y_train.size+y_test.size))

4872
1625
0.7498845621055872


from sklearn.linear_model import LogisticRegression

f = LogisticRegression()
f.fit(X_train, y_train)

/usr/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


print('Erreur d\'apprentissage:', 1-f.score(X_train, y_train))
print('Erreur de test:', 1-f.score(X_test, y_test))

Erreur d'apprentissage: 0.04022988505747127
Erreur de test: 0.03876923076923078


from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(X_train,y_train)
perceptron.score(X_test,y_test)

/usr/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:144: FutureWarning: max_iter and tol parameters have been added in Perceptron in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  FutureWarning)

0.8012307692307692


from sklearn.neighbors import KNeighborsClassifier

knn = {}
score_train = []
score_test = []

k_range = range(1,20)

for k in k_range:
   knn[k] =  KNeighborsClassifier(n_neighbors=k)
   knn[k].fit(X_train,y_train)
   score_train.append(knn[k].score(X_train,y_train))
   score_test.append(knn[k].score(X_test,y_test))

k_best = np.argmax(score_test)

print('Score de test du meilleur prédicteur kNN:',score_test[k_best])

plt.plot(list(k_range),score_train,label='erreur d\'apprentissage')
plt.plot(list(k_range),score_test,label='erreur de test')
plt.legend()
plt.show()

Score de test du meilleur prédicteur kNN: 0.936


pd.crosstab(y_test,f.predict(X_test))


from sklearn.utils import resample,shuffle
test_label_counts = y_test.value_counts()
X_test_white = X_test[y_test=='white']
y_test_white = y_test[y_test=='white']
X_test_less_white, y_test_less_white = resample(X_test_white,y_test_white,n_samples=test_label_counts['red'],replace=False)
X_test_ = pd.concat([X_test_less_white,X_test[y_test=='red']])
y_test_ = pd.concat([y_test_less_white,y_test[y_test=='red']])
X_test_, y_test_ = shuffle(X_test_,y_test_)
print(y_test_.value_counts())

white    394
red      394
Name: style, dtype: int64


print('Régression logistique:', f.score(X_test_,y_test_))
print('Peceptron:', perceptron.score(X_test_,y_test_))
print('Meilleur kNN:',knn[k_best].score(X_test_,y_test_))

Régression logistique: 0.9441624365482234
Peceptron: 0.5901015228426396
Meilleur kNN: 0.8972081218274112


train_label_counts = y_train.value_counts()
X_train_white = X_train[y_train=='white']
y_train_white = y_train[y_train=='white']
X_train_less_white, y_train_less_white = resample(X_train_white,y_train_white,n_samples=train_label_counts['red'],replace=False)
X_train_ = pd.concat([X_train_less_white,X_train[y_train=='red']])
y_train_ = pd.concat([y_train_less_white,y_train[y_train=='red']])
X_train_, y_train_ = shuffle(X_train_,y_train_)
print(y_train_.value_counts())

white    1205
red      1205
Name: style, dtype: int64


f_ = LogisticRegression()
f_.fit(X_train_, y_train_)
print('Régression logistique:', f_.score(X_test_,y_test_))

perceptron_ = Perceptron()
perceptron_.fit(X_train_,y_train_)
print('Perceptron:', perceptron_.score(X_test_,y_test_))

knn_ = {}
score_train = []
score_test = []
k_range = range(1,20)
for k in k_range:
   knn_[k] =  KNeighborsClassifier(n_neighbors=k)
   knn_[k].fit(X_train_,y_train_)
   score_test.append(knn_[k].score(X_test_,y_test_))
k_best = np.argmax(score_test)
print('Meilleur kNN:', score_test[k_best])

Régression logistique: 0.9517766497461929
Perceptron: 0.7411167512690355
Meilleur kNN: 0.899746192893401

/usr/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/usr/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:144: FutureWarning: max_iter and tol parameters have been added in Perceptron in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  FutureWarning)


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='style'), data['style'])

train_label_counts = y_train.value_counts()
X_train_white = X_train[y_train=='white']
y_train_white = y_train[y_train=='white']
X_train_less_white, y_train_less_white = resample(X_train_white,y_train_white,n_samples=train_label_counts['red'],replace=False)
X_train_ = pd.concat([X_train_less_white,X_train[y_train=='red']])
y_train_ = pd.concat([y_train_less_white,y_train[y_train=='red']])
X_train_, y_train_ = shuffle(X_train_,y_train_)

test_label_counts = y_test.value_counts()
X_test_white = X_test[y_test=='white']
y_test_white = y_test[y_test=='white']
X_test_less_white, y_test_less_white = resample(X_test_white,y_test_white,n_samples=test_label_counts['red'],replace=False)
X_test_ = pd.concat([X_test_less_white,X_test[y_test=='red']])
y_test_ = pd.concat([y_test_less_white,y_test[y_test=='red']])
X_test_, y_test_ = shuffle(X_test_,y_test_)

f_ = LogisticRegression()
f_.fit(X_train_, y_train_)
print('Régression logistique:',f_.score(X_test_,y_test_))

perceptron_ = Perceptron()
perceptron_.fit(X_train_,y_train_)
print('Perceptron', perceptron_.score(X_test_,y_test_))

knn_ = {}
score_train = []
score_test = []
k_range = range(1,20)
for k in k_range:
   knn_[k] =  KNeighborsClassifier(n_neighbors=k)
   knn_[k].fit(X_train_,y_train_)
   score_test.append(knn_[k].score(X_test_,y_test_))
k_best = np.argmax(score_test)
print('Meilleur kNN', score_test[k_best])

Régression logistique: 0.9798387096774194
Perceptron 0.9086021505376344
Meilleur kNN 0.9274193548387096

/usr/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/usr/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py:144: FutureWarning: max_iter and tol parameters have been added in Perceptron in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  FutureWarning)

	fixed_acidity	volatile_acidity	citric_acid	residual_sugar	chlorides	free_sulfur_dioxide	total_sulfur_dioxide	density	pH	sulphates	alcohol	quality	style
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5	red
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.9968	3.20	0.68	9.8	5	red
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.9970	3.26	0.65	9.8	5	red
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.9980	3.16	0.58	9.8	6	red
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5	red

	fixed_acidity	volatile_acidity	citric_acid	residual_sugar	chlorides	free_sulfur_dioxide	total_sulfur_dioxide	density	pH	sulphates	alcohol	quality	style
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.99780	3.51	0.56	9.4	5	red
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.99680	3.20	0.68	9.8	5	red
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.99700	3.26	0.65	9.8	5	red
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.99800	3.16	0.58	9.8	6	red
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.99780	3.51	0.56	9.4	5	red
...	...	...	...	...	...	...	...	...	...	...	...	...	...
6492	6.2	0.21	0.29	1.6	0.039	24.0	92.0	0.99114	3.27	0.50	11.2	6	white
6493	6.6	0.32	0.36	8.0	0.047	57.0	168.0	0.99490	3.15	0.46	9.6	5	white
6494	6.5	0.24	0.19	1.2	0.041	30.0	111.0	0.99254	2.99	0.46	9.4	6	white
6495	5.5	0.29	0.30	1.1	0.022	20.0	110.0	0.98869	3.34	0.38	12.8	7	white
6496	6.0	0.21	0.38	0.8	0.020	22.0	98.0	0.98941	3.26	0.32	11.8	6	white

Question 1¶

Question 2¶

Question 3¶

Question 4¶

Question 5¶

Question 6¶

Question 7¶

Question 8¶

Question 9¶

Question 10¶

Question 11¶