import pandas as pd
import numpy as np
import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import wordcloud
from wordcloud import WordCloud
from tabulate import tabulate
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


%matplotlib inline

class color:
   BLUE = '\033[94m'
   BOLD = '\033[1m'
   END = '\033[0m'


train_data = pd.read_csv('train_data_2.csv')
test_data = pd.read_csv('test_data_2.csv')


train_data.columns

Index(['Unnamed: 0', 'Selling_Price', 'Name', 'Location', 'Year',
       'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Seller_Type',
       'Owner_Type', 'Mileage', 'Current_Mileage', 'Engine', 'Power', 'Seats',
       'New_Price'],
      dtype='object')


train_data.head()


print(color.BOLD + "There are {} rows and {} columns in the dataset.".format(train_data.shape[0],train_data.shape[1]),"\n"+ color.END)
print("The first column is unnamed which seems to be the index which can be deleted and reset the index.","\n")
train_data.drop('Unnamed: 0', axis=1, inplace=True)
print(color.BOLD +color.BLUE +"Let's look at the data types available in the dataset"+ color.END)
train_data.info()
print(color.BOLD +color.BLUE +"\n","Summary statistics of dataset"+ color.END)
train_data.describe()

There are 6017 rows and 16 columns in the dataset. 

The first column is unnamed which seems to be the index which can be deleted and reset the index. 

Let's look at the data types available in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6017 entries, 0 to 6016
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Selling_Price      6017 non-null   float64
 1   Name               6017 non-null   object 
 2   Location           6017 non-null   object 
 3   Year               6017 non-null   int64  
 4   Kilometers_Driven  6017 non-null   int64  
 5   Fuel_Type          6017 non-null   object 
 6   Transmission       6017 non-null   object 
 7   Seller_Type        6005 non-null   object 
 8   Owner_Type         6017 non-null   object 
 9   Mileage            6016 non-null   object 
 10  Current_Mileage    5985 non-null   object 
 11  Engine             5981 non-null   object 
 12  Power              5981 non-null   object 
 13  Seats              5975 non-null   float64
 14  New_Price          823 non-null    object 
dtypes: float64(2), int64(2), object(11)
memory usage: 705.2+ KB

 Summary statistics of dataset


train_data['Selling_Price'] = train_data['Selling_Price'].apply(lambda x: x*100000)


test_data.head()


print(color.BOLD +"There are {} rows and {} columns in the dataset.".format(test_data.shape[0],test_data.shape[1]),"\n"+ color.END)
print("The first column is unnamed which seems to be the index which can be deleted and reset the index.","\n")
test_data.drop('Unnamed: 0', axis=1, inplace=True)
print(color.BOLD +color.BLUE +"Let's look at the data types available in the dataset"+ color.END)
test_data.info()
print(color.BOLD +color.BLUE +"\n","Summary statistics of dataset"+ color.END)
test_data.describe()

There are 1234 rows and 15 columns in the dataset. 

The first column is unnamed which seems to be the index which can be deleted and reset the index. 

Let's look at the data types available in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               1234 non-null   object 
 1   Location           1234 non-null   object 
 2   Year               1234 non-null   int64  
 3   Kilometers_Driven  1234 non-null   int64  
 4   Fuel_Type          1234 non-null   object 
 5   Transmission       1234 non-null   object 
 6   Seller_Type        1234 non-null   object 
 7   Owner_Type         1234 non-null   object 
 8   Mileage            1234 non-null   object 
 9   Current_Mileage    1217 non-null   object 
 10  Engine             1224 non-null   object 
 11  Power              1224 non-null   object 
 12  Seats              1223 non-null   float64
 13  New_Price          182 non-null    object 
dtypes: float64(1), int64(2), object(11)
memory usage: 135.1+ KB

 Summary statistics of dataset


def seats_(df):
    df['Seats'].replace(0, np.nan, inplace=True)
    df['Seats'] = df['Seats'].astype('category')


seats_(train_data)
seats_(test_data)


current_year = datetime.datetime.now().year #get current year
def veh_age(df):
    df["vehicle_age"] = df['Year'].apply(lambda x: current_year-x)  # substract to get the year delta
    df.drop('Year', axis=1, inplace=True)


veh_age(train_data)
veh_age(test_data)


pattern = '\s\D+[/]*\D+'

def mileage_(df):
    df['Current_Mileage'] = df['Current_Mileage'].replace(to_replace = pattern, value = '', regex = True)
    df['Current_Mileage'] = df['Current_Mileage'].astype(float)
    df['Mileage'] = df['Mileage'].replace(to_replace = pattern, value = '', regex = True)
    df['Mileage'] = df['Mileage'].astype(float)


mileage_(train_data)
mileage_(test_data)


def checknull(df):
    # printing column name where null is present
    col_name = df.isnull().sum(axis=0).sort_values(ascending = False)
    print(col_name)


print(color.BOLD +"printing column name where null is present in train data"+ color.END, '\n')
checknull(train_data)
print(color.BOLD +"printing column name where null is present in test data"+ color.END, '\n')
checknull(test_data)

printing column name where null is present in train data 

New_Price            5194
Seats                  43
Engine                 36
Power                  36
Current_Mileage        32
Seller_Type            12
Mileage                 1
Selling_Price           0
Name                    0
Location                0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
vehicle_age             0
dtype: int64
printing column name where null is present in test data 

New_Price            1052
Current_Mileage        17
Seats                  11
Engine                 10
Power                  10
Name                    0
Location                0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Seller_Type             0
Owner_Type              0
Mileage                 0
vehicle_age             0
dtype: int64


print(color.BOLD +"There are many null values in New_Price. It is very difficult to impute those values. Hence we will drop that column from both test and train data.","\n"+ color.END)

There are many null values in New_Price. It is very difficult to impute those values. Hence we will drop that column from both test and train data.


train_data.drop('New_Price', axis=1, inplace=True)


test_data.drop('New_Price', axis=1, inplace=True)


print(color.BOLD +'Rows with NaN in Mileage in train data',"\n"+ color.END)
train_data[train_data['Mileage'].isna()]

Rows with NaN in Mileage in train data


print(color.BOLD +"There are {} row entries for electric vehicle in train data.".format(train_data['Fuel_Type'][train_data['Fuel_Type'] == 'Electric'].count()),"\n"+ color.END)

There are 2 row entries for electric vehicle in train data.


print(color.BOLD +"There are {} row entries for electric vehicle in test data.".format(test_data['Fuel_Type'][test_data['Fuel_Type'] == 'Electric'].count()),"\n"+ color.END)

There are 0 row entries for electric vehicle in test data.


# Get indexes where Fuel_Type column has value Electric
indexNames = train_data[train_data['Fuel_Type'] == 'Electric'].index
 
# Delete these row indexes from dataFrame
train_data.drop(indexNames , inplace=True, axis=0)


pattern = '\s\D+[/]*\D+'

def curmil_eng_pow(df):
    df['Current_Mileage'].replace(0, np.nan, inplace=True)
    df['Mileage'].replace(0, np.nan, inplace=True)
    print("There are {} null in Current_Mileage.Replace them with the mileage value".format(df['Current_Mileage'].isna().sum()),"\n")
    df.Current_Mileage.fillna(df.Mileage, inplace=True)
    
    df['Engine'] = df['Engine'].replace(to_replace = pattern, value = '', regex = True)
    df['Engine'] = df['Engine'].astype("float").astype("Int64")
    df['Power'] = pd.to_numeric(df['Power'].str.lower().str.split().str.get(0).str.replace('null',''), errors='coerce')


print(color.BOLD +"Treating train data"+ color.END, '\n')
curmil_eng_pow(train_data)
print(color.BOLD +"Treating test data"+ color.END, '\n')
curmil_eng_pow(test_data)

print("Remove units and change datatype for Engine and Power")

Treating train data 

There are 87 null in Current_Mileage.Replace them with the mileage value 

Treating test data 

There are 26 null in Current_Mileage.Replace them with the mileage value 

Remove units and change datatype for Engine and Power


def get_make_and_model(df):
    
    #Get the Make from the name
    make_list=list(df['Name'].str.lower())
    i=0

    for item in make_list:
        make_list[i] = item.split(' ')[0]

        i=i+1

    # replace formated Names into Make  
    df['Make']=make_list
    
    #Get the Model from the name
    model_list=list(df['Name'].str.lower())
    i=0

    for item in model_list:
        model_list[i] = item.split(' ')[0]+'_'+item.split(' ')[1]

        i=i+1

    # replace formated Names into Model  
    df['Model']=model_list
    
    #Drop name column
    df.drop('Name', axis=1, inplace=True)


get_make_and_model(train_data)
get_make_and_model(test_data)


train_data=  train_data.sort_values(['Make','Model'])
for column in ['Power','Mileage', 'Seats','Engine','Seller_Type']:
    train_data[column].fillna(method='ffill', inplace=True)


test_data=  test_data.sort_values(['Make','Model'])
for column in ['Power','Mileage', 'Seats','Engine','Seller_Type']:
    test_data[column].fillna(method='ffill', inplace=True)


numeric_traindf = train_data.select_dtypes(include=['int64','float']).drop(columns=['Selling_Price'])

row_nums = 2  # how many rows of plots
col_nums = 3  # how many plots per row

# Create the subplots
fig, axes = plt.subplots(nrows=row_nums, ncols=col_nums, figsize=(15, 10))

for i, column in enumerate(numeric_traindf.columns):
    sns.distplot(numeric_traindf[column],ax=axes[i//col_nums,i%col_nums])
    #sns.histplot(numeric_traindf[column],ax=axes[i//col_nums,i%col_nums], color="red", kde=True)


    
# "i//ncols gives the floor division which is the row when you are working left to right then top to bottom.
# i%ncols will give you the integer remainder which is the column when you are working left to right top to bottom."


numeric_traindf.agg(['skew', 'kurtosis']).transpose()


%matplotlib inline
# Step 0 - Read the dataset, calculate column correlations and make a seaborn heatmap
df_cor = train_data.select_dtypes(include=['int64','float']).drop(columns=['Kilometers_Driven', 'vehicle_age'])
corr = df_cor.corr()
print(corr)

fig, ax = plt.subplots(figsize=(10,10))
#ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap= sns.diverging_palette(20, 220, as_cmap=True), square=True, annot=True, fmt='.1f')
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap= 'Blues', square=True, annot=True, fmt='.2f')

ax.set_xticklabels(ax.get_xticklabels(), rotation=45,horizontalalignment='right');

                 Selling_Price   Mileage  Current_Mileage    Engine     Power
Selling_Price         1.000000 -0.336658        -0.311500  0.662560  0.773172
Mileage              -0.336658  1.000000         0.920549 -0.640797 -0.545029
Current_Mileage      -0.311500  0.920549         1.000000 -0.599200 -0.498446
Engine                0.662560 -0.640797        -0.599200  1.000000  0.860156
Power                 0.773172 -0.545029        -0.498446  0.860156  1.000000


sns.pairplot(df_cor)

<seaborn.axisgrid.PairGrid at 0x164fd9a47f0>


drop_column_list=['Mileage','Engine']
train_data.drop(drop_column_list, axis=1, inplace=True)
train_data.reset_index(drop=True, inplace=True)


test_data.drop(drop_column_list, axis=1, inplace=True)
test_data.reset_index(drop=True, inplace=True)


train_data.select_dtypes(include=['int64','float']).describe()


# Get indexes where Fuel_Type column has value Electric
train_data[train_data['Kilometers_Driven'] >= 300000]


index_=train_data.loc[train_data['Kilometers_Driven'] == train_data['Kilometers_Driven'].max()].index.item()
vehicle_age_=train_data['vehicle_age'][train_data['Kilometers_Driven'] == train_data['Kilometers_Driven'].max()].item()

print(color.BOLD +"Maximum in Kilometers_Driven is {}, this is an outlier".format(train_data['Kilometers_Driven'].max()), "\n"+ color.END)
print(color.BOLD +"Index of this row is {} and age of this vehicle is {}".format(index_,vehicle_age_), "\n"+ color.END)

Maximum in Kilometers_Driven is 6500000, this is an outlier 

Index of this row is 483 and age of this vehicle is 4


train_data['Kilometers_Driven'] = train_data['Kilometers_Driven'].replace(train_data['Kilometers_Driven'].max(),np.NaN)
replace_=train_data['Kilometers_Driven'].groupby(train_data['vehicle_age']).get_group(vehicle_age_).max()
print(color.BOLD +"Maximum Kilometers_Driven by vehicles with age of {} is {}".format(vehicle_age_,replace_), "\n"+ color.END)

Maximum Kilometers_Driven by vehicles with age of 4 is 148000.0


train_data['Kilometers_Driven'].fillna(replace_,inplace=True)


train_data['Kilometers_Driven'].mask(train_data['Kilometers_Driven'] > 300000, 300000, inplace=True)


# Set up the axes with gridspec
x=train_data['vehicle_age']
y=train_data['Kilometers_Driven']

# Set up the axes with gridspec
fig = plt.figure(figsize=(8, 8))
grid = plt.GridSpec(4, 4, hspace=0.2, wspace=0.2)
main_ax = fig.add_subplot(grid[:-1, 1:])
y_hist = fig.add_subplot(grid[:-1, 0], xticklabels=[], sharey=main_ax)
x_hist = fig.add_subplot(grid[-1, 1:], yticklabels=[], sharex=main_ax)

# scatter points on the main axes
main_ax.plot(x, y, 'ok', markersize=4, alpha=0.2)

# histogram on the attached axes
x_hist.hist(x, 40, histtype='stepfilled', orientation='vertical', color='skyblue')
#x_hist.invert_yaxis()

y_hist.hist(y, 60, histtype='stepfilled', orientation='horizontal', color='skyblue')
y_hist.invert_xaxis()

# Title 
plt.suptitle('Relation between Kilometers_Driven and vehicle_age', size = 15);


# Check if the data type of all columns is same in train _data and test_data
#train_data.info()
#test_data.info()
test_data['Kilometers_Driven'] = test_data['Kilometers_Driven'].astype(float)


numeric_traindf = train_data.select_dtypes(include=['int64','float']).drop(columns=['Selling_Price']).apply(lambda x: stats.boxcox(x)[0])
numeric_testdf = test_data.select_dtypes(include=['int64','float']).apply(lambda x: stats.boxcox(x)[0])


# Replace original data with box-cox ransformed data
train_data.loc[:, ['Kilometers_Driven', 'Current_Mileage', 'vehicle_age','Power']] = numeric_traindf[['Kilometers_Driven', 'Current_Mileage', 'vehicle_age','Power']]
test_data.loc[:, ['Kilometers_Driven', 'Current_Mileage', 'vehicle_age','Power']] = numeric_testdf[['Kilometers_Driven', 'Current_Mileage', 'vehicle_age','Power']]


print(train_data.isnull().values.any())
print(test_data.isnull().values.any())

print(color.BOLD +"There are no more null values in train_data and test data"+ color.END)

False
False
There are no more null values in train_data and test data


%matplotlib inline
cat_column_list={'Transmission': 'Manual transmission is most popular.',
                 'Fuel_Type':'Many vehicles that are sold are diesel vehicles, followed by petrol vehicles.',
                 'Seller_Type':'Most sellers are indivisual sellers.',
                 'Owner_Type':'Many vehicles that are sold are first hand vehicles.',
                 'Seats':'5 seater vehicles are owned by majority of the users.',
                 'Location':'Popular locaitions where the vehicle is available for purchase are Mumbai, Hyderabad and kochi'}

for column in cat_column_list.keys():
    #uniques = train_data[column].values
    #total_unique=len(list(np.unique(uniques)))
    count_uniques = pd.DataFrame(train_data[column].value_counts()).rename(columns={column:'Total_Count'}).sort_values('Total_Count',ascending=False)
    
    # parameters in format function.
    print(color.BOLD +"Number of unique values in {} is {}".format(column, count_uniques.shape[0]), "\n"+ color.END)
    #print("Unique value count in {}".format(column))
    #print(count_uniques)

    # Create Figure
    fig, ax = plt.subplots(figsize=(5,5))

    ax = sns.barplot(x=count_uniques.index.values.tolist()  , y="Total_Count", data=count_uniques, palette= 'viridis')
    # rotates labels and aligns them horizontally to left 
    plt.setp( ax.xaxis.get_majorticklabels(), rotation=90, ha="left" )

    plt.tight_layout()
    plt.show()
    print("{}".format(cat_column_list[column]))

    print("\n",'-------------------------------------------------------------------------------------------------')

Number of unique values in Transmission is 2

Manual transmission is most popular.

 -------------------------------------------------------------------------------------------------
Number of unique values in Fuel_Type is 4

Many vehicles that are sold are diesel vehicles, followed by petrol vehicles.

 -------------------------------------------------------------------------------------------------
Number of unique values in Seller_Type is 3

Most sellers are indivisual sellers.

 -------------------------------------------------------------------------------------------------
Number of unique values in Owner_Type is 4

Many vehicles that are sold are first hand vehicles.

 -------------------------------------------------------------------------------------------------
Number of unique values in Seats is 8

5 seater vehicles are owned by majority of the users.

 -------------------------------------------------------------------------------------------------
Number of unique values in Location is 11

Popular locaitions where the vehicle is available for purchase are Mumbai, Hyderabad and kochi

 -------------------------------------------------------------------------------------------------


%matplotlib inline
fig, ax = plt.subplots(figsize=(14, 9))
count_uniques = pd.DataFrame(train_data['Model'].value_counts()).reset_index().rename(columns={'index':'options','Model':'Total_Count'}).sort_values('Total_Count', ascending=False)

print(color.BOLD +"Number of unique values in Model is {}".format(count_uniques.shape[0]), "\n"+ color.END)

dictionary = pd.Series(count_uniques.Total_Count.values,index=count_uniques.options).to_dict()

wordcloud = WordCloud(max_font_size=800, background_color='white', colormap='viridis', width=500, height=300, max_words=15).generate_from_frequencies(dictionary)

plt.imshow(wordcloud, interpolation='bilinear') # image show
plt.axis('off'); # to off the axis of x and y

Number of unique values in Model is 216


%matplotlib inline
fig, ax = plt.subplots(figsize=(14, 9))
count_uniques = pd.DataFrame(train_data['Make'].value_counts()).reset_index().rename(columns={'index':'options','Make':'Total_Count'}).sort_values('Total_Count', ascending=False)

print(color.BOLD +"Number of unique values in Make is {}".format(count_uniques.shape[0]), "\n"+ color.END)

dictionary = pd.Series(count_uniques.Total_Count.values,index=count_uniques.options).to_dict()

wordcloud = WordCloud(max_font_size=800, background_color='white', colormap='viridis', width=500, height=300, max_words=15).generate_from_frequencies(dictionary)

plt.imshow(wordcloud, interpolation='bilinear') # image show
plt.axis('off'); # to off the axis of x and y

Number of unique values in Make is 29


print(color.BOLD +"Aggregate statistics for Selling_price"+ color.END)
train_data['Selling_Price'].describe()

Aggregate statistics for Selling_price

count    6.015000e+03
mean     9.434941e+05
std      1.092925e+06
min      4.400000e+04
25%      3.500000e+05
50%      5.630000e+05
75%      9.950000e+05
max      1.000000e+07
Name: Selling_Price, dtype: float64


#train_data[train_data.Selling_Price==train_data.Selling_Price.min()]

vehicle_ = train_data['Model'][train_data.Selling_Price==train_data.Selling_Price.min()]
print("One {} vehicle was sold at {} INR, lowest in the dataset".format(vehicle_.item(), train_data.Selling_Price.min()))
vehicle_ = train_data['Model'][train_data.Selling_Price==train_data.Selling_Price.max()]
print("One {} vehicle was sold at {} INR, highest in the dataset".format(vehicle_.item(), train_data.Selling_Price.max()))

One tata_nano vehicle was sold at 44000.0 INR, lowest in the dataset
One jaguar_f vehicle was sold at 10000000.0 INR, highest in the dataset


#train_data['Selling_Price'].loc[train_data['Make'] == 'nissan'].describe()


sns.set_style("ticks",{'axes.grid' : True})
ax = sns.catplot(data=train_data, x='Selling_Price', y='Make',height=10, color='skyblue')  

# rotates labels and aligns them horizontally to left 
plt.suptitle('Selling Price vs Make of Vehicle', size = 15);

ax.set(xticks=(2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07))
ax.set_xticklabels((2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07), rotation=90)

plt.tight_layout()
plt.show()

print('-------------------------------------------------------------------------------------------------')

-------------------------------------------------------------------------------------------------


sns.set_style("ticks",{'axes.grid' : True})

# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(20, 20))

# Plot the orbital period with horizontal boxes
sns.boxplot(data=train_data, x='Selling_Price', y='Make',
            whis=[0, 100], width=.6, color='skyblue')

# Add in points to show each observation
sns.stripplot(data=train_data, x='Selling_Price', y='Make',
              size=4, color=".3", linewidth=0)

# Tweak the visual presentation
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)


count_uniques = pd.DataFrame(train_data['Make'].value_counts()).rename_axis('Make').rename(columns={'Make':'Total_Count'})[24:30]
print(tabulate(count_uniques, headers = ["Make", "Total_Count "], tablefmt="pretty"),'\n')
print("There are very few vehicles in our dataset of {}".format(count_uniques.index.tolist()))
vehicle_ = train_data.loc[train_data['Make'] == 'bentley']
print("There is only one {} vehicle was sold at {} INR".format(vehicle_.Model.item(), vehicle_.Selling_Price.item()))
vehicle_ = train_data.loc[train_data['Make'] == 'ambassador']
print("There is only one {} vehicle was sold at {} INR".format(vehicle_.Model.item(), vehicle_.Selling_Price.item()))
vehicle_ = train_data.loc[train_data['Make'] == 'smart']
print("There is only one {} vehicle was sold at {} INR".format(vehicle_.Model.item(), vehicle_.Selling_Price.item()))

+------------+--------------+
|    Make    | Total_Count  |
+------------+--------------+
|   isuzu    |      3       |
|   force    |      3       |
|   smart    |      1       |
|  bentley   |      1       |
| ambassador |      1       |
+------------+--------------+ 

There are very few vehicles in our dataset of ['isuzu', 'force', 'smart', 'bentley', 'ambassador']
There is only one bentley_continental vehicle was sold at 5900000.0 INR
There is only one ambassador_classic vehicle was sold at 135000.0 INR
There is only one smart_fortwo vehicle was sold at 300000.0 INR


# Change the categorical variables into Binary variable

fuel_list= ['Diesel','Petrol']
Owner_Type= ['First','Second']
Seats= [2,4,5]
Seller_Type= ['Individual']
Make_top15_list= train_data.Make.value_counts().index[0:15].to_list()
Model_top15_list= train_data.Model.value_counts().index[0:15].to_list()
Metro_city_list= ['Chennai','Delhi','Mumbai','Kolkata','Ahmedabad','Bangalore','Hyderabad','Pune']
premiumvehicles = ['audi','bentely','bmw','jaguar','land-rover','mercedes-benz','mini-cooper','porsche']
def Binary_variable(df):
    df['Fuel_Type'] = df['Fuel_Type'].apply(lambda x: x if x in fuel_list else 'Gas_fuel')
    df['Owner_Type'] = df['Owner_Type'].apply(lambda x: x if x in Owner_Type else 'Third&above')
    df['Seats'] = df['Seats'].apply(lambda x: x if x in Seats else '6nabove')
    df['Seller_Type'] = df['Seller_Type'].apply(lambda x: x if x in Seller_Type else 'Dealer')
    df['Make_15_BIN'] = df['Make'].apply(lambda x: 1 if x in Make_top15_list else 0)#.astype('category')
    df['Model_15_BIN'] = df['Model'].apply(lambda x: 1 if x in Model_top15_list else 0)#.astype('category')
    df['Location'] = df['Location'].apply(lambda x: 1 if x in Metro_city_list else 0)
    df['Premium_vehicle'] = df['Make'].apply(lambda x: 1 if x in premiumvehicles else 0)


Binary_variable(train_data)
Binary_variable(test_data)


drop_column_list=['Make', 'Model']
train_data.drop(drop_column_list, axis=1, inplace=True)
test_data.drop(drop_column_list, axis=1, inplace=True)
#train_data.info()
#test_data.info()


# generate binary values using get_dummies
train_df = pd.get_dummies(train_data)
test_df = pd.get_dummies(test_data)
#train_df.shape
#test_df.shape


corrmat=train_df.corr()
top_corr_features=corrmat.index
plt.figure(figsize=(25,25))
#plot heat map
g=sns.heatmap(train_df[top_corr_features].corr(),annot=True,cmap="RdYlGn")


drop_column_list=['Fuel_Type_Diesel', 'Transmission_Automatic','Seller_Type_Dealer', 'Owner_Type_Second', 'Seats_6nabove']

train_df.drop(columns=[col for col in train_df if col in drop_column_list], inplace=True)
test_df.drop(columns=[col for col in test_df if col in drop_column_list], inplace=True)

#train_df.drop(drop_column_list, axis=1, inplace=True)
#test_df.drop(drop_column_list, axis=1, inplace=True)


#train_df.columns
#test_df.columns
#train_df[train_df.isin([np.nan, np.inf, -np.inf]).any(1)]
#test_df[test_df.isin([np.nan, np.inf, -np.inf]).any(1)]


#train_df.to_csv("train_df_check.csv")


X = train_df.iloc[:,1:]  #independent columns
y = train_df.iloc[:,0]    #target column Selling_Price

plt.figure(figsize=(7,7))

model = ExtraTreesRegressor()
model.fit(X,y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

#plot graph of feature importances for better visualization
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(15).plot(kind='barh')
plt.show()

[7.66327835e-03 3.03320778e-02 1.98728282e-02 1.22307857e-01
 1.59201015e-01 2.04968327e-02 8.46382554e-03 5.24686511e-01
 3.67609210e-04 1.11226483e-02 1.60757149e-02 6.11334277e-03
 5.98075124e-03 2.14072164e-04 1.84792352e-03 4.34670233e-03
 6.09070100e-02]


print("The following are the top 14 features(columns) in the order of decreasing importance that govern the selling pricec of the vehicle.",'\n')
print(feature_importances.nlargest(14).index.tolist())

The following are the top 14 features(columns) in the order of decreasing importance that govern the selling pricec of the vehicle. 

['Premium_vehicle', 'vehicle_age', 'Power', 'Seats_5.0', 'Kilometers_Driven', 'Make_15_BIN', 'Current_Mileage', 'Transmission_Manual', 'Fuel_Type_Petrol', 'Model_15_BIN', 'Location', 'Seller_Type_Individual', 'Owner_Type_First', 'Seats_4.0']


col_list = feature_importances.nlargest(14).index.tolist()
test_df = test_df[col_list]


col_list.append('Selling_Price')
train_df = train_df[col_list]


X=train_df.drop(columns='Selling_Price', axis=1)
y=train_df['Selling_Price']


# Test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


models=[]
models.append(('LINREG', LinearRegression() ))
models.append(('CART', DecisionTreeRegressor()))
models.append(("KNN",KNeighborsRegressor()))
models.append(("RF", RandomForestRegressor()))
models.append(("XGBOOST", XGBRegressor()))
names=[]
result=[]
for name,model in models:
    k_fold=model_selection.KFold(n_splits=10,shuffle=True,random_state=7)
    score=model_selection.cross_val_score(model,X_train,y_train,cv=k_fold,scoring="r2")
    result.append(score)
    names.append(name)
    print(name,score.mean(),score.std())

LINREG 0.7299316155114458 0.02947007558313618
CART 0.8199062181794978 0.03644084539749022
KNN 0.4719272182433219 0.06967162827378849
RF 0.8982717381371742 0.03298370594297641
XGBOOST 0.8957824703722868 0.04424320194826293


fig = plt.figure(figsize=(10,6))
plt.boxplot(result,labels=names)
plt.title('Algorithm Comparison',fontsize=25)
plt.show()


rfreg=RandomForestRegressor()


#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rfreg, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=123, n_jobs = 1)


rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.9s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=15, n_estimators=700; total time=   4.6s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=15, n_estimators=700; total time=   4.6s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=15, n_estimators=700; total time=   4.6s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=15, n_estimators=700; total time=   4.6s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=5, min_samples_split=15, n_estimators=700; total time=   4.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   0.8s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=700; total time=   3.2s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=700; total time=   3.3s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=700; total time=   3.2s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=700; total time=   3.3s
[CV] END max_depth=25, max_features=auto, min_samples_leaf=10, min_samples_split=100, n_estimators=700; total time=   3.2s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   4.8s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   4.8s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   4.8s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   4.9s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   4.8s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   6.9s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   7.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   6.9s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   6.9s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   6.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   3.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   3.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   3.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   3.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   3.1s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=1100; total time=   2.9s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=1100; total time=   3.0s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=1100; total time=   2.9s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=1100; total time=   3.0s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=15, n_estimators=1100; total time=   3.0s

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=123, scoring='neg_mean_squared_error',
                   verbose=2)


print("The best parameters finalized for random forest regressor")
rf_random.best_params_

The best parameters finalized for random forest regressor

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 25}


rf_random.best_score_

-133884233577.35501


## <span style='color:Maroon'> Make predictions on holdout sample </span>


y_predicted=rf_random.predict(X_test)


print("Test set accuracy : ",r2_score(y_test, y_predicted))

Test set accuracy :  0.9161536711496185


print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predicted)) 
print('Mean Absolute Percentage Error:',mean_absolute_percentage_error(y_test,y_predicted))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predicted))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted)))

Mean Absolute Error: 158949.95541972152
Mean Absolute Percentage Error: 0.21733872788258643
Mean Squared Error: 96702599464.27417
Root Mean Squared Error: 310970.41573801544


fig, ax = plt.subplots(figsize=(10,7))

plt.scatter(y_test, y_predicted)
plt.xlabel('Actual Price')
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
ax.set(xticks=(2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07))
ax.set_xticklabels((2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07), rotation=90)
ax.set(yticks=(2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07))
ax.set_yticklabels((2e+05,5e+05,1e+06,2e+06,3e+06,4e+06,5e+06,1e+07))

plt.show()


sns.distplot(y_test-y_predicted)

<AxesSubplot:xlabel='Selling_Price', ylabel='Density'>


y_predicted=rf_random.predict(test_df)


final_result=pd.DataFrame(y_predicted, index =list(test_df.index)).rename(columns={0:'Predicted_Selling_Price'})
final_result

	Selling_Price	Year	Kilometers_Driven	Seats
count	6017.000000	6017.000000	6.017000e+03	5975.000000
mean	9.436084	2013.357986	5.875267e+04	5.279331
std	10.927617	3.269807	9.128049e+04	0.807854
min	0.440000	1998.000000	1.710000e+02	0.000000
25%	3.500000	2011.000000	3.400000e+04	5.000000
50%	5.640000	2014.000000	5.300000e+04	5.000000
75%	9.950000	2016.000000	7.300000e+04	5.000000
max	100.000000	2019.000000	6.500000e+06	10.000000

	Year	Kilometers_Driven	Seats
count	1234.000000	1234.000000	1223.000000
mean	2013.400324	58507.288493	5.284546
std	3.179700	35598.702098	0.825622
min	1996.000000	1000.000000	2.000000
25%	2011.000000	34000.000000	5.000000
50%	2014.000000	54572.500000	5.000000
75%	2016.000000	75000.000000	5.000000
max	2019.000000	350000.000000	10.000000

	skew	kurtosis
Kilometers_Driven	58.711910	4122.991359
Mileage	0.219672	-0.280579
Current_Mileage	0.072462	-0.553288
Engine	1.414046	3.010772
Power	1.849557	5.837669
vehicle_age	0.846093	0.894725

	Selling_Price	Kilometers_Driven	Current_Mileage	Power	vehicle_age
count	6.015000e+03	6.015000e+03	6015.000000	6015.000000	6015.000000
mean	9.434941e+05	5.875658e+04	16.149742	112.854595	7.642062
std	1.092925e+06	9.129540e+04	4.709114	53.548409	3.270032
min	4.400000e+04	1.710000e+02	5.000000	34.200000	2.000000
25%	3.500000e+05	3.400000e+04	12.600000	74.945000	5.000000
50%	5.630000e+05	5.300000e+04	16.200000	93.700000	7.000000
75%	9.950000e+05	7.300000e+04	19.600000	138.100000	10.000000
max	1.000000e+07	6.500000e+06	30.900000	552.000000	23.000000

	Predicted_Selling_Price
0	2642335.0
1	3217420.0
2	2813520.0
3	2325320.0
4	1798910.0
...	...
1229	1636715.0
1230	2066095.0
1231	2130345.0
1232	2630070.0
1233	2351570.0

Used Car Price Prediction ¶

A step-by-step approach to predict used car price using machine learning ¶

Potential Business: Pandemic-Driven Purchase¶

Data Description¶

Importing necessary libraries¶

Load Data¶

Data Overview ¶

Train Data¶

Selling Price¶

Test Data¶

Converting to appropriate data type ¶

Seats Column¶

Year column¶

Current_Mileage and Mileage¶

Feature Engineering ¶

Missing Value Treatment ¶

Check for null/ nan in Dataset¶

Removing suffixes from values in a column ¶

Splitting columns ¶

Impute missing values ¶

Checking Distribution of Numeric Variables ¶

Correlation between variables ¶

Relation between Mileage, Engine and Power¶

Outlier Treatment ¶

Relation between Kilometers_Driven and vehicle_age¶

Box-Cox Transformation for numeric variables variables ¶

Data Grouping¶

categorical variables into Binary variables and adding new columns ¶

Dummy Variable Creation ¶

Correlation matrix ¶

Feature Importance ¶

Model Building ¶

Choosing best fit model for our dataset : ¶

Random Forest Regressor ¶

Hyper parameter tuning¶

Comparision of Predictions with Actual Values ¶

Make Predictions on Test Data ¶

	Unnamed: 0	Selling_Price	Name	Location	Year	Kilometers_Driven	Fuel_Type	Transmission	Seller_Type	Owner_Type	Mileage	Current_Mileage	Engine	Power	Seats	New_Price
0	0	1.75	Maruti Wagon R LXI CNG	Mumbai	2010	72000	CNG	Manual	Individual	First	26.6 km/kg	24.8 km/kg	998 CC	58.16 bhp	5.0	NaN
1	1	12.50	Hyundai Creta 1.6 CRDi SX Option	Pune	2015	41000	Diesel	Manual	Individual	First	19.67 kmpl	15.5 kmpl	1582 CC	126.2 bhp	5.0	NaN
2	2	4.50	Honda Jazz V	Chennai	2011	46000	Petrol	Manual	Individual	First	18.2 kmpl	14.3 kmpl	1199 CC	88.7 bhp	5.0	8.61 Lakh
3	3	6.00	Maruti Ertiga VDI	Chennai	2012	87000	Diesel	Manual	Individual	First	20.77 kmpl	21.2 kmpl	1248 CC	88.76 bhp	7.0	NaN
4	4	17.74	Audi A4 New 2.0 TDI Multitronic	Coimbatore	2013	40670	Diesel	Automatic	Dealer	Second	15.2 kmpl	12.9 kmpl	1968 CC	140.8 bhp	5.0	NaN

	Unnamed: 0	Name	Location	Year	Kilometers_Driven	Fuel_Type	Transmission	Seller_Type	Owner_Type	Mileage	Current_Mileage	Engine	Power	Seats	New_Price
0	0	Maruti Alto K10 LXI CNG	Delhi	2014	40929	CNG	Manual	Dealer	First	32.26 km/kg	31 km/kg	998 CC	58.2 bhp	4.0	NaN
1	1	Maruti Alto 800 2016-2019 LXI	Coimbatore	2013	54493	Petrol	Manual	Dealer	Second	24.7 kmpl	21.5 kmpl	796 CC	47.3 bhp	5.0	NaN
2	2	Toyota Innova Crysta Touring Sport 2.4 MT	Mumbai	2017	34000	Diesel	Manual	Dealer	First	13.68 kmpl	14.2 kmpl	2393 CC	147.8 bhp	7.0	25.27 Lakh
3	3	Toyota Etios Liva GD	Hyderabad	2012	139000	Diesel	Manual	Dealer	First	23.59 kmpl	23.8 kmpl	1364 CC	null bhp	5.0	NaN
4	4	Hyundai i20 Magna	Mumbai	2014	29000	Petrol	Manual	Dealer	First	18.5 kmpl	17.8 kmpl	1197 CC	82.85 bhp	5.0	NaN

	Selling_Price	Location	Kilometers_Driven	Fuel_Type	Transmission	Seller_Type	Owner_Type	Current_Mileage	Power	Seats	vehicle_age	Make	Model
483	6500000.0	Chennai	6500000	Diesel	Automatic	Individual	First	10.4	258.00	5.0	4	bmw	bmw_x5
1300	500000.0	Kolkata	480000	Petrol	Manual	Individual	First	12.8	117.30	5.0	6	honda	honda_city
1960	270000.0	Chennai	620000	Petrol	Manual	Individual	First	20.9	78.90	5.0	12	hyundai	hyundai_i10
2288	445000.0	Bangalore	445000	Petrol	Manual	Individual	First	17.6	82.90	5.0	8	hyundai	hyundai_i20
4950	750000.0	Kolkata	775000	Diesel	Automatic	Dealer	First	18.2	141.00	5.0	8	skoda	skoda_octavia
5151	100000.0	Jaipur	300000	Diesel	Manual	Individual	First	17.2	70.00	5.0	13	tata	tata_indigo
5714	1300000.0	Chennai	480000	Diesel	Automatic	Individual	First	11.5	138.03	5.0	6	volkswagen	volkswagen_jetta
5915	590000.0	Chennai	720000	Diesel	Manual	Individual	First	21.5	103.60	5.0	8	volkswagen	volkswagen_vento

Used Car Price Prediction ¶

A step-by-step approach to predict used car price using machine learning ¶

Potential Business: Pandemic-Driven Purchase¶

Please vote up and share your feedback in the comment box, if you like this notebook.¶

Data Description¶

Importing necessary libraries¶

Load Data¶

Data Overview ¶

Train Data¶

Selling Price¶

Test Data¶

Converting to appropriate data type ¶

Seats Column¶

Year column¶

Current_Mileage and Mileage¶

Feature Engineering ¶

Missing Value Treatment ¶

Check for null/ nan in Dataset¶

Removing suffixes from values in a column ¶

Splitting columns ¶

Impute missing values ¶

Checking Distribution of Numeric Variables ¶

Correlation between variables ¶

Relation between Mileage, Engine and Power¶

Outlier Treatment ¶

Relation between Kilometers_Driven and vehicle_age¶

Box-Cox Transformation for numeric variables variables ¶

Data Grouping¶

categorical variables into Binary variables and adding new columns ¶

Dummy Variable Creation ¶

Correlation matrix ¶

Feature Importance ¶

Model Building ¶

Choosing best fit model for our dataset : ¶

Random Forest Regressor ¶

Hyper parameter tuning¶

Comparision of Predictions with Actual Values ¶

Make Predictions on Test Data ¶