data science

Royal9
simple_validation.pdf

simple_validation

April 1, 2022

[1]: # classification algorithms:

# KNeighborsClassifier
# RandomForestClassifier
# ExtraTreesClassifier
# SVC
# GaussianNB

# Choose the one that provides the best accuracy (there are other measures).
# Train all the classifiers on the same training set.
# Calculate the accuracy using the validation set.

# At the end we use the test set to determine the performance of our analysis.

[2]: import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

[3]: X = np.genfromtxt('iris.csv', delimiter=',', skip_header=0)
np.random.seed(12)
np.random.shuffle(X)
X.shape

[3]: (150, 5)

[4]: split1 = 80
split2 = 100

# In Python, -1 indicates the last index:
# [:, :-1] keeps every column except the last, [:, -1] keeps only the last.

X_train = X[:split1, :-1]
y_train = X[:split1, -1]

# validation set
X_valid = X[split1:split2, :-1]

1

y_valid = X[split1:split2, -1]

X_test = X[split2:, :-1]
y_test = X[split2:, -1]

[5]: X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

[5]: ((80, 4), (80,), (20, 4), (20,), (50, 4), (50,))

[6]: model1 = KNeighborsClassifier().fit(X_train, y_train)

[7]: model2 = RandomForestClassifier().fit(X_train, y_train)

[8]: model3 = ExtraTreesClassifier().fit(X_train, y_train)

[9]: model4 = SVC().fit(X_train, y_train)

[10]: model5 = GaussianNB().fit(X_train, y_train)

[11]: y_pred1 = model1.predict(X_valid)
y_pred2 = model2.predict(X_valid)
y_pred3 = model3.predict(X_valid)
y_pred4 = model4.predict(X_valid)
y_pred5 = model5.predict(X_valid)

[12]: np.sum(y_pred1 == y_valid) / len(y_valid) * 100

[12]: 100.0

[13]: np.sum(y_pred2 == y_valid) / len(y_valid) * 100

[13]: 95.0

[14]: np.sum(y_pred3 == y_valid) / len(y_valid) * 100

[14]: 95.0

[15]: np.sum(y_pred4 == y_valid) / len(y_valid) * 100

[15]: 100.0

[16]: np.sum(y_pred5 == y_valid) / len(y_valid) * 100

[16]: 95.0

[17]: # SVC and KNeighborsClassifier perform best on the validation set.
# We are going to choose SVC as our main classifier.

2

[18]: # Retrain SVC on training + validation sets.
X_train2 = X[:split2, :-1]
y_train2 = X[:split2, -1]
X_train2.shape, y_train2.shape

[18]: ((100, 4), (100,))

[19]: model = SVC().fit(X_train2, y_train2)

[20]: # make predictions on the test set
y_pred = model.predict(X_test)

[21]: np.sum(y_pred == y_test) / len(y_test) * 100

[21]: 92.0

[23]: # When the validation set is small, the validation accuracy does not give us a
# good estimate of which model performs best on this data set.
# Here the validation set has only 20 samples, so the gap between 95% and 100%
# is a single misclassified sample.

# So when the data set is small (e.g. fewer than 1000 samples), it is best to use
# the cross-validation technique (explained later).

[ ]:

3