# PYTHON PROGRAMMING
# Import the packages used throughout this script.
# Some of the options in this program do not work in Rodeo; use Enthought Python 3.
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from scipy import stats

# Render matplotlib figures inline when running under IPython/Jupyter.
# `run_line_magic` replaces the deprecated `magic()` call. `get_ipython` only
# exists inside an IPython session, so the magic is skipped when this file is
# executed as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# Set some pandas display options for console output.
display_options = {
    'display.notebook_repr_html': False,
    'display.max_columns': 40,
    'display.max_rows': 10,
    'display.width': 120,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Read in the Ames train and test datasets; change these paths to wherever
# the SAS files are stored on your computer.
train_path = 'c:/sasuniversityedition/data/ames_train.sas7bdat'
test_path = 'c:/sasuniversityedition/data/ames_test_sfam.sas7bdat'
train = pd.read_sas(train_path)
test = pd.read_sas(test_path)
# A good first step: convert all variable (column) names to lower case in
# both datasets, so the rest of the script can use lower-case names only.
for frame in (train, test):
    frame.columns = [name.lower() for name in frame.columns]
# Define these two derived variables for later use, on both datasets:
#   qualityindex  = overall quality * overall condition
#   totalsqftcalc = both finished-basement areas + above-ground living area
for frame in (train, test):
    frame['qualityindex'] = frame.overallqual * frame.overallcond
    frame['totalsqftcalc'] = frame.bsmtfinsf1 + frame.bsmtfinsf2 + frame.grlivarea
print(train.describe())

# Take a look at some correlations with the saleprice.
predictor_columns = ['qualityindex', 'totalsqftcalc', 'yearbuilt', 'lotarea', 'lotfrontage']
X = train[['saleprice'] + predictor_columns].copy()
X1 = train[predictor_columns].copy()
corr = X.corr()
print(corr)

# Also, look at the KBest features.
Y = train[['saleprice']].copy()
# Print explicitly: a bare expression is not displayed when run as a script.
print(Y.head())

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Minimal data cleaning before feature selection: keep only the rows where
# the target and every candidate predictor are present, because the selector
# cannot be fitted on missing values.
# NOTE(review): chi2 expects non-negative features and is designed for
# class-like targets; sklearn's f_regression is the usual scorer for a
# continuous target such as saleprice -- confirm which one is intended.
complete_rows = X.dropna()
select_top_3 = SelectKBest(score_func=chi2, k=3)
fit = select_top_3.fit(complete_rows[predictor_columns], complete_rows['saleprice'])
features = fit.transform(complete_rows[predictor_columns])
print(features[0:5])
# Set the response variable and take a quick look at it.
y = train['saleprice']
plt.plot(y)

# Linear regression with categorical variables, which are wrapped in C().
# `y` is not a column of `train`; the formula picks it up from the
# surrounding namespace, while the predictors come from `data=train`.
model1 = smf.ols(
    formula='y ~ qualityindex+totalsqftcalc+C(lotconfig)+C(housestyle)+yearbuilt+C(roofstyle)+C(heating)',
    data=train,
).fit()
print(model1.summary())

# Smaller model using numeric predictors only.
model2 = smf.ols(formula='y ~ qualityindex+totalsqftcalc+yearbuilt', data=train).fit()
print(model2.summary())
# dir(model2) lists other model2 options.
# Take the in-sample fitted values of model2 and plot them.
predictions = model2.fittedvalues
# Print explicitly: a bare expression is not displayed when run as a script.
print(predictions.head())
plt.plot(predictions)
1
# Predict on the test data, convert the predictions to a data frame, then
# pair them with the `index` column of the test data to create your file.
test_predictions = model2.predict(test)
print(test_predictions)
d = {'p_saleprice': test_predictions}
df1 = test[['index']]
df2 = pd.DataFrame(data=d)
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated frame on df1's row index is the documented replacement.
your_file = pd.concat([df1, df2], axis=1).reindex(df1.index)

# Save your file as csv on your computer for submission to kaggle.
# index=False leaves out the data-frame row index, so there is no extra
# first column to delete by hand before submitting.
your_file.to_csv('c:/data/hw02_predictions.csv', index=False)
# sklearn can create a train/test split so you can see how well your models
# perform; you can use this split for the bonus.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows. Note that this rebinds both `train` and `test`
# to the two parts of the split.
split_parts = train_test_split(train, test_size = 0.3)
train, test = split_parts
for part in (train, test):
    print(part)
2