Predicting the probability that a given blight ticket will be paid¶
Blight violations in the city of Detroit have been a difficult problem to solve. Every year, the city issues millions of dollars in fines to residents, and every year many of these fines remain unpaid. Enforcing unpaid blight fines is a costly and tedious process, so the city wants to know: how can we increase blight ticket compliance?
Blighted property includes commercial or residential structures that have been declared uninhabitable and hazardous by the City and that exhibit objectively determinable signs of deterioration constituting a threat to human health, safety, and public welfare.
In this notebook, I will focus on the problem at hand: helping the city enforce fines by predicting whether a person will comply with a property maintenance fine.
The first step in answering this question is understanding when and why a resident might fail to comply with a blight ticket. This is where predictive modeling comes in: the task is to predict whether a given blight ticket will be paid on time.
The Dataset¶
I have two data files for use in training and validating my models: train.csv and test.csv. Each row in these two files corresponds to a single blight ticket, and includes information about when, why, and to whom each ticket was issued.
The target variable is compliance, which is True if the ticket was paid early, on time, or within one month of the hearing date; False if the ticket was paid after the hearing date or not at all; and Null if the violator was found not responsible.
Note: tickets where the violator was found not responsible are not considered during evaluation. They are included in the training set as an additional source of data for visualization and to enable unsupervised and semi-supervised approaches; however, they are not included in the test set.
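Since the Null tickets carry no label, a natural first step for supervised modeling is to set them aside. A minimal sketch (column names as in the data description below):

import pandas as pd

train = pd.read_csv('train.csv', encoding='ISO-8859-1')
# Tickets where the violator was found not responsible have a Null
# compliance label; exclude them before supervised training.
labeled = train[train['compliance'].notnull()].copy()
labeled['compliance'] = labeled['compliance'].astype(int)  # 0/1 target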
File descriptions
train.csv - the training set (all tickets issued 2004-2011)
test.csv - the test set (all tickets issued 2012-2016)
addresses.csv & latlons.csv - mapping from ticket id to addresses, and from addresses to lat/lon coordinates.
Note: misspelled addresses may be incorrectly geolocated.
Data fields
train.csv & test.csv
ticket_id - unique identifier for tickets
agency_name - Agency that issued the ticket
inspector_name - Name of inspector that issued the ticket
violator_name - Name of the person/organization that the ticket was issued to
violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred
mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator
ticket_issued_date - Date and time the ticket was issued
hearing_date - Date and time the violator's hearing was scheduled
violation_code, violation_description - Type of violation
disposition - Judgment and judgment type
fine_amount - Violation fine amount, excluding fees
admin_fee - $20 fee assigned to responsible judgments
state_fee - $10 fee assigned to responsible judgments
late_fee - 10% fee assigned to responsible judgments
discount_amount - discount applied, if any
clean_up_cost - DPW clean-up or graffiti removal cost
judgment_amount - Sum of all fines and fees
grafitti_status - Flag for graffiti violations
train.csv only (outcome fields that are absent from the test set; see the sketch after this list)
payment_amount - Amount paid, if any
payment_date - Date payment was made, if it was received
payment_status - Current payment status as of Feb 1 2017
balance_due - Fines and fees still owed
collection_status - Flag for payments in collections
compliance [target variable for prediction]
Null = Not responsible
0 = Responsible, non-compliant
1 = Responsible, compliant
compliance_detail - More information on why each ticket was marked compliant or non-compliant
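Because these fields describe the eventual outcome and do not exist in test.csv, they must be dropped before feature engineering; otherwise the model trains on information that is unavailable at prediction time. Continuing the labeled frame from the sketch above:

# outcome columns present only in train.csv; keeping them would leak the target
leaky_cols = ['payment_amount', 'payment_date', 'payment_status',
              'balance_due', 'collection_status', 'compliance_detail']
features = labeled.drop(columns=leaky_cols)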
Evaluation¶
The predictions are given as the probability that the corresponding blight ticket will be paid on time.
The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).
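AUC scores how well the predicted probabilities rank compliant tickets above non-compliant ones, independent of any decision threshold; 0.5 is random and 1.0 is perfect. A tiny self-contained example with scikit-learn:

from sklearn.metrics import roc_auc_score

# four tickets: two unpaid (0), two paid (1), with predicted probabilities
y_true = [0, 0, 1, 1]
y_prob = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(y_true, y_prob))  # 0.75: three of four (paid, unpaid) pairs are ranked correctly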
Import Libraries¶
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# Importing re package
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
%matplotlib inline
class color:
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    END = '\033[0m'
Load Train and Test Data¶
train_data = pd.read_csv('train.csv', encoding='ISO-8859-1')
test_data = pd.read_csv('test.csv', encoding='ISO-8859-1')
Data Overview¶
Let us look at the data.
def overview(df):
    print(color.BOLD + "There are {} rows and {} columns in the dataset.".format(df.shape[0], df.shape[1]), "\n" + color.END)
    print(color.BOLD + color.BLUE + "Let's look at the data types available in the dataset" + color.END)
    df.info()
overview(train_data)
print(color.BOLD +color.BLUE +"\n","Summary statistics of dataset"+ color.END)
train_data.describe()
There are 250306 rows and 34 columns in the dataset.

Let's look at the data types available in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250306 entries, 0 to 250305
Data columns (total 34 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   ticket_id                   250306 non-null  int64
 1   agency_name                 250306 non-null  object
 2   inspector_name              250306 non-null  object
 3   violator_name               250272 non-null  object
 4   violation_street_number     250306 non-null  float64
 5   violation_street_name       250306 non-null  object
 6   violation_zip_code          0 non-null       float64
 7   mailing_address_str_number  246704 non-null  float64
 8   mailing_address_str_name    250302 non-null  object
 9   city                        250306 non-null  object
 10  state                       250213 non-null  object
 11  zip_code                    250305 non-null  object
 12  non_us_str_code             3 non-null       object
 13  country                     250306 non-null  object
 14  ticket_issued_date          250306 non-null  object
 15  hearing_date                237815 non-null  object
 16  violation_code              250306 non-null  object
 17  violation_description       250306 non-null  object
 18  disposition                 250306 non-null  object
 19  fine_amount                 250305 non-null  float64
 20  admin_fee                   250306 non-null  float64
 21  state_fee                   250306 non-null  float64
 22  late_fee                    250306 non-null  float64
 23  discount_amount             250306 non-null  float64
 24  clean_up_cost               250306 non-null  float64
 25  judgment_amount             250306 non-null  float64
 26  payment_amount              250306 non-null  float64
 27  balance_due                 250306 non-null  float64
 28  payment_date                41113 non-null   object
 29  payment_status              250306 non-null  object
 30  collection_status           36897 non-null   object
 31  grafitti_status             1 non-null       object
 32  compliance_detail           250306 non-null  object
 33  compliance                  159880 non-null  float64
dtypes: float64(13), int64(1), object(20)
memory usage: 64.9+ MB

Summary statistics of dataset
| | ticket_id | violation_street_number | violation_zip_code | mailing_address_str_number | fine_amount | admin_fee | state_fee | late_fee | discount_amount | clean_up_cost | judgment_amount | payment_amount | balance_due | compliance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 250306.000000 | 2.503060e+05 | 0.0 | 2.467040e+05 | 250305.000000 | 250306.000000 | 250306.000000 | 250306.000000 | 250306.000000 | 250306.0 | 250306.000000 | 250306.000000 | 250306.000000 | 159880.000000 |
mean | 152665.543099 | 1.064986e+04 | NaN | 9.149788e+03 | 374.423435 | 12.774764 | 6.387382 | 21.494506 | 0.125167 | 0.0 | 268.685356 | 48.898986 | 222.449058 | 0.072536 |
std | 77189.882881 | 3.188733e+04 | NaN | 3.602034e+04 | 707.195807 | 9.607344 | 4.803672 | 56.464263 | 3.430178 | 0.0 | 626.915212 | 222.422425 | 606.394010 | 0.259374 |
min | 18645.000000 | 0.000000e+00 | NaN | 1.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | -7750.000000 | 0.000000 |
25% | 86549.250000 | 4.739000e+03 | NaN | 5.440000e+02 | 200.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 152597.500000 | 1.024400e+04 | NaN | 2.456000e+03 | 250.000000 | 20.000000 | 10.000000 | 10.000000 | 0.000000 | 0.0 | 140.000000 | 0.000000 | 25.000000 | 0.000000 |
75% | 219888.750000 | 1.576000e+04 | NaN | 1.292725e+04 | 250.000000 | 20.000000 | 10.000000 | 25.000000 | 0.000000 | 0.0 | 305.000000 | 0.000000 | 305.000000 | 0.000000 |
max | 366178.000000 | 1.415411e+07 | NaN | 5.111345e+06 | 10000.000000 | 20.000000 | 10.000000 | 1000.000000 | 350.000000 | 0.0 | 11030.000000 | 11075.000000 | 11030.000000 | 1.000000 |
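The compliance mean of roughly 0.073 already tells the key story: only about 7% of responsible violators paid on time, so the classes are heavily imbalanced, which is one reason AUC is a more informative metric here than accuracy. A quick check of the class balance (a sketch using train_data as loaded above):

# share of each class among labeled tickets (Null rows are dropped by default)
print(train_data['compliance'].value_counts(normalize=True))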
overview(test_data)
print(color.BOLD +color.BLUE +"\n","Summary statistics of dataset"+ color.END)
test_data.describe()
There are 61001 rows and 27 columns in the dataset.

Let's look at the data types available in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61001 entries, 0 to 61000
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ticket_id                   61001 non-null  int64
 1   agency_name                 61001 non-null  object
 2   inspector_name              61001 non-null  object
 3   violator_name               60973 non-null  object
 4   violation_street_number     61001 non-null  float64
 5   violation_street_name       61001 non-null  object
 6   violation_zip_code          24024 non-null  object
 7   mailing_address_str_number  59987 non-null  object
 8   mailing_address_str_name    60998 non-null  object
 9   city                        61000 non-null  object
 10  state                       60670 non-null  object
 11  zip_code                    60998 non-null  object
 12  non_us_str_code             0 non-null      float64
 13  country                     61001 non-null  object
 14  ticket_issued_date          61001 non-null  object
 15  hearing_date                58804 non-null  object
 16  violation_code              61001 non-null  object
 17  violation_description       61001 non-null  object
 18  disposition                 61001 non-null  object
 19  fine_amount                 61001 non-null  float64
 20  admin_fee                   61001 non-null  float64
 21  state_fee                   61001 non-null  float64
 22  late_fee                    61001 non-null  float64
 23  discount_amount             61001 non-null  float64
 24  clean_up_cost               61001 non-null  float64
 25  judgment_amount             61001 non-null  float64
 26  grafitti_status             2221 non-null   object
dtypes: float64(9), int64(1), object(17)
memory usage: 12.6+ MB

Summary statistics of dataset
| | ticket_id | violation_street_number | non_us_str_code | fine_amount | admin_fee | state_fee | late_fee | discount_amount | clean_up_cost | judgment_amount |
|---|---|---|---|---|---|---|---|---|---|---|
count | 61001.000000 | 6.100100e+04 | 0.0 | 61001.000000 | 61001.0 | 61001.0 | 61001.000000 | 61001.000000 | 61001.000000 | 61001.000000 |
mean | 331724.532811 | 1.256638e+04 | NaN | 272.714185 | 20.0 | 10.0 | 25.116219 | 0.239340 | 20.649711 | 347.895541 |
std | 25434.932141 | 1.414373e+05 | NaN | 360.101855 | 0.0 | 0.0 | 36.310155 | 3.245894 | 242.375180 | 460.058043 |
min | 284932.000000 | -1.512600e+04 | NaN | 0.000000 | 20.0 | 10.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 310111.000000 | 6.008000e+03 | NaN | 50.000000 | 20.0 | 10.0 | 5.000000 | 0.000000 | 0.000000 | 85.000000 |
50% | 332251.000000 | 1.213400e+04 | NaN | 200.000000 | 20.0 | 10.0 | 10.000000 | 0.000000 | 0.000000 | 250.000000 |
75% | 353031.000000 | 1.716500e+04 | NaN | 250.000000 | 20.0 | 10.0 | 25.000000 | 0.000000 | 0.000000 | 305.000000 |
max | 376698.000000 | 2.010611e+07 | NaN | 10000.000000 | 20.0 | 10.0 | 1000.000000 | 250.000000 | 15309.000000 | 15558.800000 |
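The training set has 34 columns while the test set has only 27, so it is worth confirming exactly which columns are train-only before building features. A small sketch with the two frames loaded above:

# columns present in train.csv but absent from test.csv (the outcome fields)
print(sorted(set(train_data.columns) - set(test_data.columns)))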
Load Address Data and Create a Map of Property Locations¶
from geopy.geocoders import Nominatim
from folium.plugins import HeatMap
import folium
from tqdm import tqdm
# load the address datasets
addresses = pd.read_csv('addresses.csv')
lat_lons = pd.read_csv('latlons.csv')
# join them: ticket_id -> address -> lat/lon
id_address = addresses.set_index('address').join(lat_lons.set_index('address')).reset_index().set_index('ticket_id')
print(color.BOLD + "There are {} rows and {} columns in the dataset.".format(id_address.shape[0],id_address.shape[1]),"\n"+ color.END)
There are 311307 rows and 3 columns in the dataset.
print(color.BOLD +color.BLUE +"\n","Summary statistics of dataset"+ color.END)
id_address.describe()
Summary statistics of dataset
| | lat | lon |
|---|---|---|
count | 311299.000000 | 311299.000000 |
mean | 42.389822 | -83.112668 |
std | 0.038155 | 0.095320 |
min | 41.785926 | -88.081348 |
25% | 42.364200 | -83.188845 |
50% | 42.393794 | -83.126422 |
75% | 42.419709 | -83.036160 |
max | 45.809387 | -82.433593 |
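The lat/lon count of 311299 against 311307 rows shows that eight tickets failed to geocode, consistent with the note above about misspelled addresses. One hedged way to handle them is mean imputation (dropping those rows would be just as reasonable):

# eight tickets have no coordinates; fall back to the citywide mean location
id_address['lat'] = id_address['lat'].fillna(id_address['lat'].mean())
id_address['lon'] = id_address['lon'].fillna(id_address['lon'].mean())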
address_locations = pd.DataFrame(id_address['address'].value_counts().reset_index())
address_locations.columns = ['address', 'count']
# merge the coordinates back in, then keep one row per address so the
# heatmap weight ('count') is not multiplied by the number of duplicate rows
address_locations = address_locations.merge(id_address, on='address', how='left').dropna().drop_duplicates(subset='address')
address_locations['count'].max()
203
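So the single most-ticketed address accounts for 203 tickets. Before plotting, it can help to inspect which addresses dominate (a sketch using address_locations as built above):

# five most-ticketed addresses with their coordinates
print(address_locations.nlargest(5, 'count')[['address', 'count', 'lat', 'lon']])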
def defaultmap(default_location=[42.3314, -83.0458], default_zoom_start=9):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map
basemap = defaultmap()
HeatMap(address_locations[['lat', 'lon', 'count']].values.tolist(), zoom=20, radius=15).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x286dbb2e8f0>
basemap
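The map renders inline; to keep a shareable copy, a folium map can also be written to a standalone HTML file (the filename here is arbitrary):

# save the interactive heatmap outside the notebook
basemap.save('blight_ticket_heatmap.html')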