Need help with Project Portfolio
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 1/13
In [71]: import pandas as pd import numpy as np from collections import Counter import matplotlib.pyplot as plt import numpy from statsmodels.graphics.mosaicplot import mosaic from sklearn.preprocessing import MinMaxScaler from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score, precision_score, recall_scor e, f1_score from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, tr ain_test_split import warnings %matplotlib inline df = pd.read_csv("data/adult.data", header=None, sep=", ") df.columns = ["age", "workclass", "fnlwgt", "education", "education-num" , "marital-status", "occupation", "relationship", "race", "sex", "capita l-gain", "capital-loss", "hours-per-week", "native-country", "class"] df = df[df["workclass"] != '?'] df = df[df["education"] != '?'] df = df[df["marital-status"] != '?'] df = df[df["occupation"] != '?'] df = df[df["relationship"] != '?'] df = df[df["race"] != '?'] df = df[df["sex"] != '?'] df = df[df["native-country"] != '?'] below = df[df["class"] == "<=50K"] above = df[df["class"] == ">50K"]
<ipython-input-71-d873bf4dac12>:19: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'. df = pd.read_csv("data/adult.data", header=None, sep=", ")
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 2/13
# In [61]:
# Pie charts of the native-country distribution, one per income class.
column = "native-country"
above_50k = Counter(above[column])
below_50k = Counter(below[column])
print(column)

fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5, 10))
panels = zip(axes, (">50K", "<=50K"), (above_50k, below_50k))
for ax, title, counts in panels:
    # "%%" in autopct renders a literal percent sign after the rounded share.
    ax.pie(list(counts.values()), labels=list(counts.keys()), autopct="%1.0f%%")
    ax.set_title(title)
plt.show()
native-country
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 3/13
# In [62]:
# Pie charts of the race distribution, one per income class.
column = "race"
above_50k = Counter(above[column])
below_50k = Counter(below[column])
print(column)

fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5, 10))
panels = zip(axes, (">50K", "<=50K"), (above_50k, below_50k))
for ax, title, counts in panels:
    # "%%" in autopct renders a literal percent sign after the rounded share.
    ax.pie(list(counts.values()), labels=list(counts.keys()), autopct="%1.0f%%")
    ax.set_title(title)
plt.show()
race
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 4/13
# In [63]:
# Pie charts of the education distribution, one per income class.
column = "education"
above_50k = Counter(above[column])
below_50k = Counter(below[column])
print(column)

fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5, 10))
panels = zip(axes, (">50K", "<=50K"), (above_50k, below_50k))
for ax, title, counts in panels:
    # "%%" in autopct renders a literal percent sign after the rounded share.
    ax.pie(list(counts.values()), labels=list(counts.keys()), autopct="%1.0f%%")
    ax.set_title(title)
plt.show()
education
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 5/13
# In [64]:
# Pie charts of the workclass distribution, one per income class.
column = "workclass"
above_50k = Counter(above[column])
below_50k = Counter(below[column])
print(column)

fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5, 10))
panels = zip(axes, (">50K", "<=50K"), (above_50k, below_50k))
for ax, title, counts in panels:
    # "%%" in autopct renders a literal percent sign after the rounded share.
    ax.pie(list(counts.values()), labels=list(counts.keys()), autopct="%1.0f%%")
    ax.set_title(title)
plt.show()
workclass
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 6/13
# In [65]:
# 3x2 grid of scatter plots: one row per (x, y) column pair, with the <=50K
# rows in the left column and the >50K rows in the right column.
feature_pairs = [
    ("capital-gain", "age"),
    ("age", "hours-per-week"),
    ("hours-per-week", "capital-gain"),
]
income_groups = [("<=50K", below), (">50K", above)]

fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(8, 8))
fig.subplots_adjust(hspace=.5)  # extra vertical room so titles and x-labels do not collide

for row, (x_col, y_col) in enumerate(feature_pairs):
    for col, (title, group) in enumerate(income_groups):
        ax = axes[row, col]
        ax.scatter(group[x_col], group[y_col])
        ax.set_title(title)
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)

plt.show()
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 7/13
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 8/13
# In [50]:
# Mosaic plot of df over the occupation and class columns, with the axis
# labels switched off.
fig, axes = plt.subplots(figsize=(15, 10))  # a single Axes (1x1 is the default grid)
fig.subplots_adjust(hspace=0.5)
mosaic(data=df, index=["occupation", "class"], ax=axes, axes_label=False)
plt.show()
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 9/13
# In [51]:
# Mosaic plot of df over the marital-status and class columns, with the axis
# labels switched off.
fig, axes = plt.subplots(figsize=(15, 10))  # a single Axes (1x1 is the default grid)
fig.subplots_adjust(hspace=0.5)
mosaic(data=df, index=["marital-status", "class"], ax=axes, axes_label=False)
plt.show()
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 10/13
# In [54]:
# Mosaic plot of df over the education-num and class columns, with the axis
# labels switched off.
fig, axes = plt.subplots(figsize=(15, 12))  # a single Axes (1x1 is the default grid)
fig.subplots_adjust(hspace=0.5)
mosaic(data=df, index=["education-num", "class"], ax=axes, axes_label=False)
plt.show()
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 11/13
# In [90]:
# Build the modelling frame: drop the unused columns and replace every
# categorical column with a hand-assigned integer code.

DROPPED_COLUMNS = ["capital-loss", "native-country", "fnlwgt", "education"]
train = df.drop(columns=DROPPED_COLUMNS)

# Lookup tables for the encoders below. Any value that is not listed falls
# through to the encoder's default code.
_OCCUPATION_CODES = {
    "Exec-managerial": 1, "Prof-specialty": 1, "Protective-serv": 1,
    "Sales": 2, "Transport-moving": 2, "Tech-support": 2, "Craft-repair": 2,
}
_RELATIONSHIP_CODES = {
    "Own-child": 6, "Other-relative": 5, "Unmarried": 4,
    "Not-in-family": 3, "Husband": 2,
}
_RACE_CODES = {"Other": 5, "Amer-Indian-Eskimo": 4, "Black": 3, "White": 2}
_WORKCLASS_CODES = {
    "Without-pay": 7, "Private": 6, "State-gov": 5,
    "Self-emp-not-inc": 4, "Local-gov": 3, "Federal-gov": 2,
}
_MARITAL_STATUS_CODES = {
    "Never-married": 7, "Separated": 6, "Married-spouse-absent": 5,
    "Widowed": 4, "Divorced": 3, "Married-civ-spouse": 2,
}


def get_occupation(x):
    """Return the occupation group code: 1 or 2 for the listed occupations, 3 otherwise."""
    return _OCCUPATION_CODES.get(x, 3)


def get_relationship(x):
    """Return the relationship code (2-6 for listed values, 1 for anything else)."""
    return _RELATIONSHIP_CODES.get(x, 1)


def get_race(x):
    """Return the race code (2-5 for listed values, 1 for anything else)."""
    return _RACE_CODES.get(x, 1)


def get_sex(x):
    """Return 2 for "Male" and 1 for any other value."""
    return 2 if x == "Male" else 1


def get_class(x):
    """Return the binary target: 1 for the ">50K" label, 0 for anything else.

    A trailing period is ignored so that ">50K." is also treated as positive.
    NOTE(review): data/adult.test is reported to write its labels with a
    trailing "." - confirm against the file before encoding it with this.
    """
    if isinstance(x, str):
        x = x.strip().rstrip(".")
    return 1 if x == ">50K" else 0


def get_workclass(x):
    """Return the workclass code (2-7 for listed values, 1 for anything else)."""
    return _WORKCLASS_CODES.get(x, 1)


def get_marital_status(x):
    """Return the marital-status code (2-7 for listed values, 1 for anything else)."""
    return _MARITAL_STATUS_CODES.get(x, 1)


# Column -> encoder. Each column is encoded independently of the others.
_ENCODERS = {
    "workclass": get_workclass,
    "marital-status": get_marital_status,
    "occupation": get_occupation,
    "relationship": get_relationship,
    "race": get_race,
    "sex": get_sex,
    "class": get_class,
}
for column_name, encoder in _ENCODERS.items():
    train[column_name] = train[column_name].apply(encoder)

# Show the first rows of the encoded frame (this is what Out[90] displays).
train.head()
Out[90]:
age workclass education-num marital-status occupation relationship race sex capital-gain hours-per-week cla
0 39 5 13 7 3 3 2 2 2174 40
1 50 4 13 2 1 2 2 2 0 13
2 38 6 9 3 3 3 2 2 0 40
3 53 6 7 2 3 2 3 2 0 40
4 28 6 13 2 1 1 3 1 0 40
2/26/22, 9:04 PM CSE578Project
localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 13/13
# In [96]:
# Read data/adult.test, split the encoded frame into train/validation parts,
# and min-max scale the features.

# engine="python" is what pandas falls back to for the two-character ", "
# separator anyway; naming it explicitly avoids the ParserWarning.
# NOTE(review): `test` is not referenced by any later cell visible here - the
# evaluation below uses the 20% split of `train`, not this file.
test = pd.read_csv("data/adult.test", header=None, sep=", ", engine="python")

# The last column of `train` is the encoded class label; the rest are features.
feature = train.iloc[:, :-1]
labels = train.iloc[:, -1]
feature_matrix1 = feature.values
labels1 = labels.values

train_data, test_data, train_labels, test_labels = train_test_split(
    feature_matrix1, labels1, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse it for the held-out
# split. Fitting a second scaler on the held-out rows (as was done before)
# maps the two splits with different min/max values, so the same raw value
# would be scaled differently in each. Metrics recorded from earlier runs
# were computed with that separate scaler and may shift after this change.
scaler = MinMaxScaler().fit(train_data)
transformed_train_data = scaler.transform(train_data)
transformed_test_data = scaler.transform(test_data)
In [97]: t
# In [114]:
# Fit a default LogisticRegression on the scaled training split and score
# its predictions on the scaled held-out split.
mod = LogisticRegression()
mod.fit(transformed_train_data, train_labels)

test_predict = mod.predict(transformed_test_data)

# Every metric takes (true labels, predicted labels).
acc, f1, prec, rec = (
    metric(test_labels, test_predict)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
# In [115]:
# One tab-separated result row: accuracy, F1, precision, recall, model name.
model_name = "Logistic Regression"
print("%.4f\t%.4f\t%.4f\t%.4f\t%s" % (acc, f1, prec, rec, model_name))
In [ ]:
<ipython-input-96-90f00b23459c>:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'. test=pd.read_csv("data/adult.test", header=None, sep=", ")
0.8409 0.6404 0.7500 0.5588 Logistic Regression