PythonforEDA.docx

Importing required libraries for EDA

import pandas as pd

import numpy as np

import seaborn as sns #visualization

import matplotlib.pyplot as plt #visualization

# IPython/Jupyter magic (not plain Python syntax): render matplotlib figures
# inline in the notebook output. Remove this line when running as a script.
%matplotlib inline

# Apply seaborn's default theme; color_codes=True remaps matplotlib's shorthand
# color codes (e.g. "b", "g", "r") to the seaborn palette.
sns.set(color_codes=True)

Loading data into data frame

# Load the raw data set into a DataFrame.
# Replace "data.csv" with the path to your car dataset file.
df = pd.read_csv("data.csv")

# Preview the first five rows of the frame
df.head(n=5)

# Preview the last five rows of the frame
df.tail(n=5)

Checking the types of data

# Show the dtype pandas assigned to each column when the file was loaded
df.dtypes

Dropping irrelevant columns

# Remove the columns that are not used in the rest of the analysis
df = df.drop(
    columns=[
        'Engine Fuel Type',
        'Market Category',
        'Vehicle Style',
        'Popularity',
        'Number of Doors',
        'Vehicle Size',
    ]
)
df.head()

Renaming the columns

# Give the remaining columns shorter names (old name -> new name)
df = df.rename(
    columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price",
    }
)
df.head()

Dropping duplicate Rows

# Size of the frame before de-duplication, as (rows, columns)
df.shape

# Rows that repeat an earlier row; the first occurrence is not flagged.
# The printed value is the shape tuple, so its first entry is the row count.
duplicate_rows_df = df.loc[df.duplicated(keep="first")]
print("number of duplicate rows: ", duplicate_rows_df.shape)

# Non-null count per column before and after removing the repeated rows
df.count()
df = df.drop_duplicates(keep="first")
df.head()
df.count()

Dropping the missing or null values

# Number of missing values in each column before cleaning
print(df.isna().sum())

# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")
df.count()

# Number of missing values in each column after cleaning
print(df.isna().sum())
Detecting Outliers
# Box plots of the numeric columns where outliers are expected.
# Each plot gets its own figure: seaborn draws on the current axes when no
# `ax` is given, so consecutive calls in one cell/script would otherwise be
# overlaid on the same axes despite having very different scales.
plt.figure()
sns.boxplot(x=df['Price'])
plt.figure()
sns.boxplot(x=df['HP'])
plt.figure()
sns.boxplot(x=df['Cylinders'])

# Interquartile range (Q3 - Q1) of every numeric column.
# numeric_only=True is required because the frame still holds string columns
# (e.g. Make, Model); without it pandas 2.x raises a TypeError, while older
# versions silently skipped those columns.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)
Histogram (bar chart of the number of cars per make)
# Bar chart of how many cars each make has, limited to the 40 most common makes
df['Make'].value_counts().nlargest(40).plot.bar(figsize=(10, 5))
plt.title("Number of cars by make")
plt.ylabel('Number of cars')
# Trailing semicolon keeps the notebook from echoing the returned Text object
plt.xlabel('Make');
Heat Maps
# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10,5))
# Restrict to numeric columns before correlating: the frame still contains
# string columns (e.g. Make, Model), and pandas 2.x raises a ValueError on
# them, whereas older versions dropped them silently. select_dtypes works on
# both old and new pandas versions.
c = df.select_dtypes(include="number").corr()
sns.heatmap(c, cmap="BrBG", annot=True)
# Echo the correlation matrix itself (shown when this is the last cell line)
c
Scatter Plot
# Scatter plot of price against horsepower
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df['HP'], y=df['Price'])
ax.set(xlabel='HP', ylabel='Price')
plt.show()