data science
cross_validation
April 1, 2022
# In [1]:
# Numerical arrays plus the scikit-learn classifiers compared in this
# notebook and the cross-validation helper used to score them.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
# In [2]:
# Load the comma-separated dataset into a 2-D array, skipping the header row.
X = np.genfromtxt('titanic.csv', delimiter=',', skip_header=1)
# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(12)
# Shuffle the rows in place so the later positional split is not tied to
# the order of the rows in the file.
np.random.shuffle(X)
X.shape
# Out[2]: (896, 7)
# In [3]:
# Divide X into a training set (700 samples) and a test set.
# Column 0 holds the target; the remaining columns are the features.
split = 700
X_train = X[:split, 1:]  # training set, features
y_train = X[:split, 0]   # training set, target
X_test = X[split:, 1:]   # test set, features
y_test = X[split:, 0]    # test set, target
# In [4]:
def validate(model, X, y, *, cv=5, scoring='accuracy'):
    """Return the mean cross-validated score of *model* on ``(X, y)``.

    Args:
        model: An unfitted scikit-learn estimator; it is passed straight
            to ``cross_val_score``.
        X: Feature matrix.
        y: Target vector aligned with the rows of ``X``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 5, the value originally hard-coded here.
        scoring: Scoring metric forwarded to ``cross_val_score``.
            Defaults to ``'accuracy'``, the value originally hard-coded.

    Returns:
        The arithmetic mean of the per-fold scores.
    """
    val = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return np.mean(val)
# Compare classifiers, each with its default constructor arguments, by
# their mean cross-validated accuracy on the training split only.
# The recorded outputs are kept as comments.

# In [5]:
validate(KNeighborsClassifier(), X_train, y_train)
# Out[5]: 0.7071428571428571

# In [6]:
validate(RandomForestClassifier(), X_train, y_train)
# Out[6]: 0.8428571428571427

# In [7]:
validate(ExtraTreesClassifier(), X_train, y_train)
# Out[7]: 0.8314285714285713
1
# In [8]:
validate(SVC(), X_train, y_train)
# Out[8]: 0.6814285714285715

# In [9]:
validate(GaussianNB(), X_train, y_train)
# Out[9]: 0.7914285714285714
# In [10]:
# Fit a random forest on the whole training split. No random_state is
# passed, so the fitted model (and the accuracy below) presumably varies
# between runs -- NOTE(review): confirm against scikit-learn defaults.
model = RandomForestClassifier().fit(X_train, y_train)

# In [11]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

# In [12]:
# Test accuracy in percent: the number of matching predictions divided by
# the number of test samples, times 100.
np.sum(y_pred == y_test) / len(y_test) * 100
# Out[12]: 77.55102040816327
[ ]:
2