Need help with Project Portfolio

profilepeddnadepradeep
CSE578Project.pdf

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 1/13

In [71]: import pandas as pd import numpy as np from collections import Counter import matplotlib.pyplot as plt import numpy from statsmodels.graphics.mosaicplot import mosaic from sklearn.preprocessing import MinMaxScaler from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score, precision_score, recall_scor e, f1_score from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, tr ain_test_split import warnings %matplotlib inline df = pd.read_csv("data/adult.data", header=None, sep=", ") df.columns = ["age", "workclass", "fnlwgt", "education", "education-num" , "marital-status", "occupation", "relationship", "race", "sex", "capita l-gain", "capital-loss", "hours-per-week", "native-country", "class"] df = df[df["workclass"] != '?'] df = df[df["education"] != '?'] df = df[df["marital-status"] != '?'] df = df[df["occupation"] != '?'] df = df[df["relationship"] != '?'] df = df[df["race"] != '?'] df = df[df["sex"] != '?'] df = df[df["native-country"] != '?'] below = df[df["class"] == "<=50K"] above = df[df["class"] == ">50K"]

<ipython-input-71-d873bf4dac12>:19: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separator s (separators > 1 char and different from '\s+' are interpreted as rege x); you can avoid this warning by specifying engine='python'. df = pd.read_csv("data/adult.data", header=None, sep=", ")

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 2/13

In [61]: above_50k = Counter(above['native-country']) below_50k = Counter(below['native-country']) print('native-country') fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5,10)) axes[0].pie(above_50k.values(), labels=above_50k.keys(), autopct='%1.0f% %') axes[0].set_title(">50K") axes[1].pie(below_50k.values(), labels=below_50k.keys(), autopct='%1.0f% %') axes[1].set_title("<=50K") plt.show()

native-country

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 3/13

In [62]: above_50k = Counter(above['race']) below_50k = Counter(below['race']) print('race') fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5,10)) axes[0].pie(above_50k.values(), labels=above_50k.keys(), autopct='%1.0f% %') axes[0].set_title(">50K") axes[1].pie(below_50k.values(), labels=below_50k.keys(), autopct='%1.0f% %') axes[1].set_title("<=50K") plt.show()

race

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 4/13

In [63]: above_50k = Counter(above['education']) below_50k = Counter(below['education']) print('education') fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5,10)) axes[0].pie(above_50k.values(), labels=above_50k.keys(), autopct='%1.0f% %') axes[0].set_title(">50K") axes[1].pie(below_50k.values(), labels=below_50k.keys(), autopct='%1.0f% %') axes[1].set_title("<=50K") plt.show()

education

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 5/13

In [64]: above_50k = Counter(above['workclass']) below_50k = Counter(below['workclass']) print('workclass') fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(5,10)) axes[0].pie(above_50k.values(), labels=above_50k.keys(), autopct='%1.0f% %') axes[0].set_title(">50K") axes[1].pie(below_50k.values(), labels=below_50k.keys(), autopct='%1.0f% %') axes[1].set_title("<=50K") plt.show()

workclass

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 6/13

In [65]: fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(8,8)) fig.subplots_adjust(hspace=.5) x = below['capital-gain'] y = below['age'] axes[0, 0].scatter(x,y) axes[0, 0].set_title("<=50K") axes[0, 0].set_xlabel('capital-gain') axes[0, 0].set_ylabel('age') x = above['capital-gain'] y = above['age'] axes[0, 1].scatter(x,y) axes[0, 1].set_title(">50K") axes[0, 1].set_xlabel('capital-gain') axes[0, 1].set_ylabel('age') x = below['age'] y = below['hours-per-week'] axes[1, 0].scatter(x,y) axes[1, 0].set_title("<=50K") axes[1, 0].set_xlabel('age') axes[1, 0].set_ylabel('hours-per-week') x = above['age'] y = above['hours-per-week'] axes[1, 1].scatter(x,y) axes[1, 1].set_title(">50K") axes[1, 1].set_xlabel('age') axes[1, 1].set_ylabel('hours-per-week') x = below['hours-per-week'] y = below['capital-gain'] axes[2, 0].scatter(x,y) axes[2, 0].set_title("<=50K") axes[2, 0].set_xlabel('hours-per-week') axes[2, 0].set_ylabel('capital-gain') x = above['hours-per-week'] y = above['capital-gain'] axes[2, 1].scatter(x,y) axes[2, 1].set_title(">50K") axes[2, 1].set_xlabel('hours-per-week') axes[2, 1].set_ylabel('capital-gain') plt.show()

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 7/13

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 8/13

In [50]: fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(15,10)) fig.subplots_adjust(hspace=.5) mosaic(df, ['occupation', 'class'], ax=axes, axes_label=False) plt.show()

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 9/13

In [51]: fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(15,10)) fig.subplots_adjust(hspace=.5) mosaic(df, ['marital-status', 'class'], ax=axes, axes_label=False) plt.show()

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 10/13

In [54]: fig, axes = plt.subplots(ncols=1, nrows=1, figsize=(15,12)) fig.subplots_adjust(hspace=.5) mosaic(df, ['education-num', 'class'], ax=axes, axes_label=False) plt.show()

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 11/13

In [90]: train = df train = train.drop("capital-loss", axis=1) train = train.drop("native-country", axis=1) train = train.drop("fnlwgt", axis=1) train = train.drop("education",axis=1) def get_occupation(x): if x in ["Exec-managerial", "Prof-specialty", "Protective-serv"]: return 1 elif x in ["Sales", "Transport-moving", "Tech-support", "Craft-repai r"]: return 2 else: return 3 def get_relationship(x): if x == "Own-child": return 6 elif x == "Other-relative": return 5 elif x == "Unmarried": return 4 elif x == "Not-in-family": return 3 elif x == "Husband": return 2 else: return 1 def get_race(x): if x == "Other": return 5 elif x == "Amer-Indian-Eskimo": return 4 elif x == "Black": return 3 elif x == "White": return 2 else: return 1 def get_sex(x): if x == "Male": return 2 else: return 1 def get_class(x): if x == ">50K": return 1 else: return 0 def get_workclass(x): if x == "Without-pay":

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 12/13

return 7 elif x == "Private": return 6 elif x == "State-gov": return 5 elif x == "Self-emp-not-inc": return 4 elif x == "Local-gov": return 3 elif x == "Federal-gov": return 2 else: return 1 def get_marital_status(x): if x == "Never-married": return 7 elif x == "Separated": return 6 elif x == "Married-spouse-absent": return 5 elif x == "Widowed": return 4 elif x == "Divorced": return 3 elif x == "Married-civ-spouse": return 2 else: return 1 train['workclass'] = train['workclass'].apply(get_workclass) train['marital-status'] = train['marital-status'].apply(get_marital_stat us) train['occupation'] = train['occupation'].apply(get_occupation) train['relationship'] = train['relationship'].apply(get_relationship) train['race'] = train['race'].apply(get_race) train['sex'] = train['sex'].apply(get_sex) train['class'] = train['class'].apply(get_class)

Out[90]:

age workclass education-

num marital- status occupation relationship race sex

capital- gain

hours- per-

week cla

0 39 5 13 7 3 3 2 2 2174 40

1 50 4 13 2 1 2 2 2 0 13

2 38 6 9 3 3 3 2 2 0 40

3 53 6 7 2 3 2 3 2 0 40

4 28 6 13 2 1 1 3 1 0 40

2/26/22, 9:04 PM CSE578Project

localhost:8888/nbconvert/html/CSE578Project.ipynb?download=false 13/13

In [96]: test=pd.read_csv("data/adult.test", header=None, sep=", ") feature = train.iloc[:, :-1] labels = train.iloc[:, -1] feature_matrix1 = feature.values labels1 = labels.values train_data, test_data, train_labels, test_labels = train_test_split(feat ure_matrix1, labels1, test_size=0.2, random_state=42) transformed_train_data = MinMaxScaler().fit_transform(train_data) transformed_test_data = MinMaxScaler().fit_transform(test_data)

In [97]: t

In [114]: mod=LogisticRegression().fit(transformed_train_data,train_labels) test_predict=mod.predict(transformed_test_data) acc=accuracy_score(test_labels, test_predict) f1=f1_score(test_labels, test_predict) prec=precision_score(test_labels,test_predict) rec=recall_score(test_labels, test_predict)

In [115]: print("%.4f\t%.4f\t%.4f\t%.4f\t%s" % (acc, f1, prec, rec, 'Logistic Regr ession'))

In [ ]:

<ipython-input-96-90f00b23459c>:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separator s (separators > 1 char and different from '\s+' are interpreted as rege x); you can avoid this warning by specifying engine='python'. test=pd.read_csv("data/adult.test", header=None, sep=", ")

0.8409 0.6404 0.7500 0.5588 Logistic Regression