data science

profileRoyal9
cross_validation.pdf

cross_validation

April 1, 2022

[1]: import numpy as np from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import cross_val_score

# In [2]:
# Parse the comma-separated file into a NumPy array, skipping its one header row.
X = np.genfromtxt(
    'titanic.csv',
    skip_header=1,
    delimiter=',',
)
# Seed NumPy's global generator first so the in-place row shuffle is repeatable.
np.random.seed(12)
np.random.shuffle(X)
X.shape

[2]: (896, 7)

# In [3]:
# The first 700 rows of X form the training set; the remaining rows are the test set.
# Column 0 is the target, every other column is a feature.
split = 700
train_rows, test_rows = X[:split], X[split:]
X_train, y_train = train_rows[:, 1:], train_rows[:, 0]  # training features / target
X_test, y_test = test_rows[:, 1:], test_rows[:, 0]      # test features / target

# In [4]:
def validate(model, X, y, cv=5, scoring='accuracy'):
    """Return the mean cross-validated score of ``model`` on ``(X, y)``.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values aligned with the rows of ``X``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value this notebook originally hard-coded.
        scoring: Metric forwarded to ``cross_val_score``. Defaults to
            ``'accuracy'``, the metric this notebook originally hard-coded.

    Returns:
        The arithmetic mean of the per-fold scores.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return np.mean(fold_scores)

# In [5]: cross-validated score of a default k-nearest-neighbours classifier.
knn = KNeighborsClassifier()
validate(knn, X_train, y_train)

[5]: 0.7071428571428571

# In [6]: cross-validated score of a default random forest.
random_forest = RandomForestClassifier()
validate(random_forest, X_train, y_train)

[6]: 0.8428571428571427

# In [7]: cross-validated score of a default extra-trees ensemble.
extra_trees = ExtraTreesClassifier()
validate(extra_trees, X_train, y_train)

[7]: 0.8314285714285713

1

# In [8]: cross-validated score of a default support-vector classifier.
svc = SVC()
validate(svc, X_train, y_train)

[8]: 0.6814285714285715

# In [9]: cross-validated score of Gaussian naive Bayes.
naive_bayes = GaussianNB()
validate(naive_bayes, X_train, y_train)

[9]: 0.7914285714285714

# In [10]: fit a random forest (the top cross-validation scorer above) on the
# whole training split.
model = RandomForestClassifier()
model.fit(X_train, y_train)

# In [11]: predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# In [12]: test accuracy in percent; the mean of the boolean match array is the
# fraction of correct predictions.
np.mean(y_pred == y_test) * 100

[12]: 77.55102040816327

[ ]:

2