# PYTHON PROGRAMMING
#
# profileHussain2018
# python_code.docx

# -*- coding: utf-8 -*-

"""

Created on Tue Jan 16 22:58:46 2018

@author: Paul Lee

"""

# Using Linear Regression to predict

# family home sale prices in Ames, Iowa

# Packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn import linear_model, metrics

# Configure how pandas renders DataFrames in the console output
display_options = {
    'display.notebook_repr_html': False,
    'display.max_columns': 40,
    'display.max_rows': 10,
    'display.width': 120,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Read in the training and test data, then convert every variable name to
# lower case so the rest of the script can use lower-case column names
train = pd.read_csv('C:/Users/Jahee Koo/Desktop/AMES_TRAIN.csv')
test = pd.read_csv('C:/Users/Jahee Koo/Desktop/AMES_TEST_SFAM.csv')
for frame in (train, test):
    frame.columns = list(map(str.lower, frame.columns))

# EDA: print a structural overview of the training data
print('\n----- Summary of Train Data -----\n')
print('Object type: ', type(train))
print('Number of observations & variables: ', train.shape)

# Variable names and information.  DataFrame.info() writes its report itself
# and returns None, so wrapping it in print() would add a stray "None" line.
train.info()
print(train.dtypes.value_counts())

# Descriptive statistics
print(train.describe())

# Show a portion of the beginning of the DataFrame
print(train.head(10))
print(train.shape)

# Missing-value counts (only for columns that have any) and zero counts per
# column, largest first.  A bare expression is evaluated and discarded when
# the file runs as a script, so the results are printed explicitly.
null_counts = train.isnull().sum()
print(null_counts[null_counts > 0].sort_values(ascending=False))
print((train == 0).sum().sort_values(ascending=False))

# Stacked bar chart with one bar per column of the training data:
# red = missing values, yellow = zeros, green = all remaining rows
t_null = train.isnull().sum()
t_zero = (train == 0).sum()
row_count, column_count = train.shape
t_other = row_count - (t_null + t_zero)
bar_positions = range(column_count)

plt.figure(figsize=(8, 8))
# (heights, colour, baseline) for each layer; a baseline of None is the
# plt.bar default and starts the bars at zero
bar_layers = [
    (t_other, 'g', t_null + t_zero),
    (t_zero, 'y', t_null),
    (t_null, 'r', None),
]
for heights, colour, baseline in bar_layers:
    plt.bar(bar_positions, heights, color=colour, width=1, bottom=baseline)
plt.show()

# Report, then drop, the columns that have more than 1000 missing values or
# more than 1900 zeros
print(t_null[t_null > 1000].sort_values(ascending=False))
print(t_zero[t_zero > 1900].sort_values(ascending=False))
drop_cols = (t_null > 1000) | (t_zero > 1900)

# Keep the columns NOT flagged above.  A boolean mask is inverted with `~`;
# unary minus on a boolean Series raises TypeError in current pandas/NumPy.
train = train.loc[:, ~drop_cols]

# Some quick plots of the data: histograms and per-column box plots, followed
# by sale price plotted against a handful of individual variables
overview_figsize = (18, 14)
train.hist(figsize=overview_figsize)
train.plot(
    kind='box',
    subplots=True,
    layout=(5, 9),
    sharex=False,
    sharey=False,
    figsize=overview_figsize,
)

# (plot type, variable) pairs, drawn in the order listed
saleprice_plots = [
    ('scatter', 'grlivarea'),
    ('box', 'yrsold'),
    ('scatter', 'subclass'),
    ('box', 'overallqual'),
    ('box', 'overallcond'),
    ('scatter', 'overallcond'),
    ('scatter', 'lotarea'),
]
for plot_type, variable_name in saleprice_plots:
    if plot_type == 'scatter':
        train.plot.scatter(x=variable_name, y='saleprice')
    else:
        train.boxplot(column='saleprice', by=variable_name)

# Replace NaN values in the train data: numeric columns get their median,
# then anything still missing gets the column's most frequent value.
# numeric_only=True is required because pandas >= 2.0 raises TypeError when
# asked for the median of a frame that contains string columns.
train = train.fillna(train.median(numeric_only=True))


def _fill_with_most_frequent(column):
    """Return *column* with NaN replaced by its most frequent value."""
    counts = column.value_counts()
    if counts.empty:
        # Every value is missing, so there is nothing to impute from
        return column
    return column.fillna(counts.index[0])


train = train.apply(_fill_with_most_frequent)

# Printed explicitly: a bare `train.head()` shows nothing when run as a script
print(train.head())

# Redraw the missing / zero / other stacked bar chart on the current train
# data: red = missing values, yellow = zeros, green = all remaining rows
t_null = train.isnull().sum()
t_zero = (train == 0).sum()
filled_rows = train.shape[0] - t_null - t_zero
positions = range(len(train.columns))
bar_style = {'width': .8}

plt.figure(figsize=(14, 14))
plt.bar(positions, filled_rows, color='g', bottom=t_null + t_zero, **bar_style)
plt.bar(positions, t_zero, color='y', bottom=t_null, **bar_style)
plt.bar(positions, t_null, color='r', **bar_style)
plt.show()

# List the distinct building types and house styles.  Printed explicitly:
# a bare expression produces no output when the file runs as a script.
print(train.bldgtype.unique())
print(train.housestyle.unique())

# Goal is typical family home, so drop observations too far from typical.
# For each variable in turn, remove the rows above the upper fence
# Q3 + 1.5 * IQR.  The filters are applied one after another, so each fence
# is computed on the rows that survived the previous filters.
for column in ('saleprice', 'grlivarea', 'lotarea', 'totalbsmtsf'):
    q1, q3 = np.percentile(train[column], [25, 75])
    iqr = q3 - q1
    drop_rows = train[column] > q3 + 1.5 * iqr
    # Keep the rows NOT flagged.  A boolean mask is inverted with `~`; unary
    # minus on a boolean Series raises TypeError in current pandas/NumPy.
    train = train.loc[~drop_rows, :]

# Replace 0 values of living area in the train data with the median of the
# living areas that are greater than 0
positive_living_area = train.loc[train['grlivarea'] > 0, 'grlivarea']
living_area_median = np.median(positive_living_area)
train = train.replace({'grlivarea': {0: living_area_median}})

# Discrete variables: one violin plot of sale price per categorical variable.
# PairGrid creates its own figure, so no plt.figure() call is needed first
# (it would only leave an empty extra figure behind).
g = sns.PairGrid(
    train,
    x_vars=[
        "bldgtype",
        "exterqual",
        "centralair",
        "kitchenqual",
        "salecondition",
    ],
    y_vars=["saleprice"],
    aspect=.75,
    # seaborn 0.9 renamed the `size` argument to `height`
    height=3.5,
)
g.map(sns.violinplot, palette="pastel")

# Print the 10 variables most positively and the 10 most negatively (or
# least) correlated with sale price.  Correlation is only defined for numeric
# columns, and pandas >= 2.0 raises on string columns instead of silently
# skipping them, so the numeric columns are selected explicitly.
corr_matrix = train.select_dtypes(include='number').corr()
saleprice_corr = corr_matrix["saleprice"]
print(saleprice_corr.sort_values(ascending=False).head(10))
print(saleprice_corr.sort_values(ascending=True).head(10))

# Pick 10 variables to focus on (the response, saleprice, is listed first)
pick_10 = [
    'saleprice',
    'grlivarea',
    'overallqual',
    'garagecars',
    'yearbuilt',
    'totalbsmtsf',
    'salecondition',
    'bldgtype',
    'kitchenqual',
    'exterqual',
    'centralair',
]

# Correlation heat map of the picked variables.  Only numeric columns can be
# correlated; pandas >= 2.0 raises on string columns instead of skipping them.
corr = train[pick_10].select_dtypes(include='number').corr()

# Mask the upper triangle (including the diagonal) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which NumPy has removed.
blank = np.zeros_like(corr, dtype=bool)
blank[np.triu_indices_from(blank)] = True

fig, ax = plt.subplots(figsize=(10, 10))
corr_map = sns.diverging_palette(255, 133, l=60, n=7,
                                 center="dark", as_cmap=True)
sns.heatmap(corr, mask=blank, cmap=corr_map, square=True,
            vmax=.3, linewidths=0.25, cbar_kws={"shrink": .5})

# Quick plots of sale price against each picked predictor: a strip plot and
# a box plot for text (object-dtype) columns, a scatter plot otherwise
for variable in pick_10[1:]:
    if train[variable].dtype.name == 'object':
        plt.figure()
        sns.stripplot(y="saleprice", x=variable, data=train, jitter=True)
        plt.show()

        # catplot is the current name of the former factorplot.  It creates
        # its own figure, so no plt.figure() call is needed beforehand.
        sns.catplot(y="saleprice", x=variable, data=train, kind="box")
        plt.show()
    else:
        fig, ax = plt.subplots()
        ax.set_ylabel('Sale Price')
        ax.set_xlabel(variable)
        ax.scatter(
            y=train['saleprice'],
            x=train[variable],
            facecolors='none',
            edgecolors='blue',
        )
        plt.show()

# Swarm plots of sale price by building type, faceted by exterior quality
# (columns) and kitchen quality (rows), coloured by overall quality.
# catplot is the current name of the former factorplot; it creates its own
# figure, so no plt.figure() call is needed beforehand.
sns.catplot(x="bldgtype", y="saleprice", col="exterqual", row="kitchenqual",
            hue="overallqual", data=train, kind="swarm")

# Number of homes per overall quality score, split by exterior quality
plt.figure()
sns.countplot(y="overallqual", hue="exterqual", data=train, palette="Greens_d")

# Run simple models: five OLS fits of sale price, each formula extending the
# previous one with additional predictors
formulas = (
    'saleprice ~ grlivarea',
    'saleprice ~ grlivarea + overallqual',
    'saleprice ~ grlivarea + overallqual + garagecars',
    'saleprice ~ grlivarea + overallqual + garagecars + yearbuilt',
    'saleprice ~ grlivarea + overallqual + garagecars + yearbuilt + totalbsmtsf + kitchenqual + exterqual + centralair',
)
model1, model2, model3, model4, model5 = [
    smf.ols(formula=formula, data=train).fit() for formula in formulas
]

# Print the regression summary of every fitted model, in order
fitted_models = (model1, model2, model3, model4, model5)
for number, fitted in enumerate(fitted_models, start=1):
    print('\n\nmodel {}----------\n'.format(number), fitted.summary())

# Compare the fitted models: plot fitted against actual sale price for each
# one and collect its goodness-of-fit statistics into one table
out = [model1, model2, model3, model4, model5]
fit_statistics = ('rsquared', 'rsquared_adj', 'fvalue', 'aic')

out_df = pd.DataFrame()
out_df['labels'] = ['rsquared', 'rsquared_adj', 'fstatistic', 'aic']
for number, model in enumerate(out, start=1):
    label = 'model' + str(number)

    # DataFrame.plot.scatter creates its own figure, so no plt.figure() call
    # is needed beforehand (it would only leave an empty extra figure behind)
    train['pred'] = model.fittedvalues
    train.plot.scatter(x='saleprice', y='pred', title=label)
    plt.show()

    out_df[label] = [
        getattr(model, statistic).round(3) for statistic in fit_statistics
    ]

# Show the comparison table that was just built
print(out_df)

# Keep the fitted values of the largest model alongside the training data
train['predictions'] = model5.fittedvalues
print(train['predictions'])

# Clean test data
test.info()

# Numeric columns: replace NaN with the column median.  `test[3:]` selects
# ROWS 3 onward (not columns), so slicing that way would leave the first three
# rows un-imputed; the whole frame is filled instead.  numeric_only=True is
# required because pandas >= 2.0 raises TypeError on string columns.
test = test.fillna(test.median(numeric_only=True))

# Categorical predictors used by model5: replace NaN with the most frequent
# level, so that no row is left with a missing predictor value
for column in ('kitchenqual', 'exterqual', 'centralair'):
    level_counts = test[column].value_counts()
    if not level_counts.empty:
        test[column] = test[column].fillna(level_counts.index[0])

# Replace 0 values of living area with the median of the values above 0
m = np.median(test.grlivarea[test.grlivarea > 0])
test = test.replace({'grlivarea': {0: m}})
print(test)

# Predict sale prices for the test data with the largest model
test_predictions = model5.predict(test)

# Replace any negative prediction with the lowest sale price in the train data
test_predictions[test_predictions < 0] = train['saleprice'].min()
print(test_predictions)

# Convert the predictions to a data frame then merge with the index for the
# test data
dat = {'p_saleprice': test_predictions}
df1 = test[['index']]
df2 = pd.DataFrame(data=dat)

# pd.concat's `join_axes` argument was removed in pandas 1.0; reindexing the
# result on df1's index is the documented replacement and keeps exactly the
# rows of the test data.
submission = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(submission)
submission.to_csv('C:/Users/Jahee Koo/Desktop/hw01_predictions.csv')