data science
simple_validation
April 1, 2022
[1]: # classification algorithms:
# KNeighborsClassifier # RandomForestClassifier # ExtraTreesClassifier # SVC # GaussianNB
# choose the one that provides the best accuracy (there are other measures)
# train all the classifiers on the same training set.
# Calculate the accuracy using the validation set.
# At the end we use the test set to determine the performance of our analysis.
[2]: import numpy as np from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB
# [3]:
# Load the comma-separated table into a 2-D array (no header row is skipped).
X = np.genfromtxt(fname='iris.csv', delimiter=',')

# Seed the global NumPy generator so the in-place row shuffle is repeatable.
np.random.seed(12)
np.random.shuffle(X)

X.shape
[3]: (150, 5)
# [4]:
split1 = 80
split2 = 100

# Cut the shuffled rows into three consecutive groups:
#   rows [0, split1)       -> training set
#   rows [split1, split2)  -> validation set
#   rows [split2, end)     -> test set
train_rows, valid_rows, test_rows = np.split(X, [split1, split2])

# In Python, -1 indicates the last index: the last column is used as the
# label vector, every column before it as the feature matrix.
X_train, y_train = train_rows[:, :-1], train_rows[:, -1]
X_valid, y_valid = valid_rows[:, :-1], valid_rows[:, -1]
X_test, y_test = test_rows[:, :-1], test_rows[:, -1]
# [5]:
# Sanity check: show the shape of each of the six arrays produced by the split.
tuple(
    part.shape
    for part in (X_train, y_train, X_valid, y_valid, X_test, y_test)
)
[5]: ((80, 4), (80,), (20, 4), (20,), (50, 4), (50,))
# [6]-[10]:
# One candidate per algorithm, each created with its default settings.
model1 = KNeighborsClassifier()
model2 = RandomForestClassifier()
model3 = ExtraTreesClassifier()
model4 = SVC()
model5 = GaussianNB()

# Train every candidate on the same training set, in the original order.
for clf in (model1, model2, model3, model4, model5):
    clf.fit(X_train, y_train)
# [11]:
# Predict the validation labels with each of the five fitted models.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5 = (
    clf.predict(X_valid)
    for clf in (model1, model2, model3, model4, model5)
)
# [12]:
# Validation accuracy of model1 (KNeighborsClassifier) in percent: the mean of
# the element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred1 == y_valid) * 100
[12]: 100.0
# [13]:
# Validation accuracy of model2 (RandomForestClassifier) in percent: the mean
# of the element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred2 == y_valid) * 100
[13]: 95.0
# [14]:
# Validation accuracy of model3 (ExtraTreesClassifier) in percent: the mean of
# the element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred3 == y_valid) * 100
[14]: 95.0
# [15]:
# Validation accuracy of model4 (SVC) in percent: the mean of the
# element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred4 == y_valid) * 100
[15]: 100.0
# [16]:
# Validation accuracy of model5 (GaussianNB) in percent: the mean of the
# element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred5 == y_valid) * 100
[16]: 95.0
[17]: # SVC and KNeighborsClassifier perform best on the validation set.
# We are going to choose SVC as our main classifier.
2
# [18]:
# Retrain SVC on training + validation sets.
# The first split2 rows are exactly those two sets taken together.
train_valid_rows = X[:split2]
X_train2, y_train2 = train_valid_rows[:, :-1], train_valid_rows[:, -1]

X_train2.shape, y_train2.shape
[18]: ((100, 4), (100,))
# [19]:
# Final classifier: a default SVC trained on the combined training +
# validation rows.
model = SVC()
model.fit(X_train2, y_train2)

# [20]:
# make predictions on the test set
y_pred = model.predict(X_test)
# [21]:
# Accuracy (in percent) of the retrained SVC on the test set: the mean of the
# element-wise matches is the fraction of samples predicted correctly.
np.mean(y_pred == y_test) * 100
[21]: 92.0
[23]: # When the validation set is small, the validation accuracy does not give us a
# good estimate of which model performs best on this data set.
# So when the data set is small (e.g. fewer than 1000 samples) it is best to use
# the cross-validation technique (explained later).
[ ]:
3