# Python
# Importing the libraries required for EDA
import pandas as pd
import numpy as np
import seaborn as sns #visualization
import matplotlib.pyplot as plt #visualization
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
# NOTE: this line is only valid inside IPython/Jupyter, not in a plain script.
%matplotlib inline
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand color codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
# Loading the data into a data frame
# Replace "data.csv" with the path to your car dataset file.
# (The original used a C-style `//` comment, which Python parses as the
# floor-division operator and rejects as a syntax error.)
df = pd.read_csv("data.csv")
# To display the top 5 rows
df.head(5)
# To display the bottom 5 rows
df.tail(5)
# Checking the types of data
# Show the dtype pandas assigned to each column. This is a bare expression,
# so its value is only displayed when evaluated interactively (e.g. as the
# last statement of a notebook cell).
df.dtypes
# Dropping irrelevant columns
# Columns that are not used in the rest of this analysis.
irrelevant_columns = [
    'Engine Fuel Type',
    'Market Category',
    'Vehicle Style',
    'Popularity',
    'Number of Doors',
    'Vehicle Size',
]
# `columns=` is the keyword form of dropping labels along axis=1.
df = df.drop(columns=irrelevant_columns)
df.head(5)
# Renaming the columns
# Map each original column label to the shorter name used from here on.
column_renames = {
    "Engine HP": "HP",
    "Engine Cylinders": "Cylinders",
    "Transmission Type": "Transmission",
    "Driven_Wheels": "Drive Mode",
    "highway MPG": "MPG-H",
    "city mpg": "MPG-C",
    "MSRP": "Price",
}
df = df.rename(columns=column_renames)
df.head(5)
# Dropping duplicate rows
# (rows, columns) before removing duplicates.
df.shape
# Rows that repeat an earlier row; duplicated() compares all columns and
# leaves the first occurrence unmarked by default.
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)
# Per-column non-null counts before de-duplication.
df.count()
df = df.drop_duplicates()
df.head(5)
# Per-column non-null counts after de-duplication.
df.count()
# Dropping the missing or null values
# Number of null entries per column before dropping anything.
print(df.isnull().sum())
df = df.dropna()  # Dropping the missing values.
# Per-column non-null counts after dropping (previously swallowed by the
# trailing comment on the line above).
df.count()
print(df.isnull().sum())  # After dropping the values

# Detecting Outliers
# Each box plot gets its own figure so that, when these statements run in a
# single cell or script, the plots do not pile up on the same axes.
plt.figure()
sns.boxplot(x=df['Price'])
plt.figure()
sns.boxplot(x=df['HP'])
plt.figure()
sns.boxplot(x=df['Cylinders'])

# Quantiles are only defined for numeric data; restrict to numeric columns
# explicitly, because recent pandas versions no longer skip text columns
# silently and raise instead.
numeric_df = df.select_dtypes(include="number")
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1  # interquartile range per numeric column
print(IQR)

# Histogram
# Start a fresh figure so the bar chart is not drawn onto the last box plot.
plt.figure(figsize=(10, 5))
df.Make.value_counts().nlargest(40).plot(kind='bar', figsize=(10, 5))
plt.title("Number of cars by make")
plt.ylabel('Number of cars')
plt.xlabel('Make');

# Heat Maps
plt.figure(figsize=(10, 5))
# Pairwise correlation of the numeric columns only (same reason as above).
c = numeric_df.corr()
sns.heatmap(c, cmap="BrBG", annot=True)
c

# Scatter Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['HP'], df['Price'])
ax.set_xlabel('HP')
ax.set_ylabel('Price')
plt.show()