CUSTOMER SEGMENTATION¶
Understanding customers is critical to the success and survival of a business. Knowing our customers helps a business provide products and services that lead to high customer satisfaction and support the growth of the business.
Customer segmentation helps a business strategize effectively. Customer segmentation is the process of dividing customers into groups, called clusters, based on similar characteristics such as demographics or behaviors. These customer segments/groups help businesses cater to each segment of customers uniquely.
In this project, I will be performing unsupervised clustering of customer records from a retail chain.
If you like this notebook, please vote it up and share your feedback in the comment box.¶
TABLE OF CONTENTS
- Load Required Libraries
- Load Data and Overview
- EDA & Data Cleaning
- Data Preprocessing
- Dimensionality Reduction with PCA
- Clustering
- Extracting Cluster Characteristics
- Customer Profiling
- Conclusion
Load Required Libraries
import numpy as np
import pandas as pd
import datetime as dt
from datetime import date
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import unique
from numpy import where
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import os
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
np.random.seed(0)
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
%matplotlib inline
sns.set_style("white")
class color:
    BLUE = '\033[94m'
    BOLD = '\033[1m'
    END = '\033[0m'
Load Data and Overview
data = pd.read_csv("marketing_campaign.csv", sep="\t")
data.head()
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
Data Overview ¶
Let us look at the data.
print(color.BOLD + "There are {} entries and {} variables in the dataset.".format(data.shape[0],data.shape[1]),"\n"+ color.END)
There are 2240 entries and 29 variables in the dataset.
Customer information
- ID: Customer's unique identifier.
- Year_Birth: Customer's birth year.
- Education: Customer's education level.
- Marital_Status: Customer's marital status.
- Income: Customer's yearly household income.
- Kidhome: Number of children in customer's household.
- Teenhome: Number of teenagers in customer's household.
- Dt_Customer: Date of customer's enrollment with the company.
- Recency: Number of days since customer's last purchase.
- Complain: 1 if customer complained in the last 2 years, 0 otherwise.
Purchases
- MntWines: Amount spent on wine in last 2 years.
- MntFruits: Amount spent on fruits in last 2 years.
- MntMeatProducts: Amount spent on meat in last 2 years.
- MntFishProducts: Amount spent on fish in last 2 years.
- MntSweetProducts: Amount spent on sweets in last 2 years.
- MntGoldProds: Amount spent on gold in last 2 years.
Promotion
- NumDealsPurchases: Number of purchases made with a discount.
- AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise.
- AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise.
- AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise.
- AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise.
- AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise.
- Response: 1 if customer accepted the offer in the last campaign, 0 otherwise.
Purchase Source
- NumWebPurchases: Number of purchases made through the company’s web site.
- NumCatalogPurchases: Number of purchases made using a catalogue.
- NumStorePurchases: Number of purchases made directly in stores.
- NumWebVisitsMonth: Number of visits to company’s web site in the last month.
Exploratory Data Analysis & Data Cleaning
Exploratory data analysis is the first and most important phase in any data analysis.
In this section we aim to examine the data, uncover important patterns, and make sense of it.
In this EDA section we will cover the following steps:
- Data cleaning
- Univariate analysis
- Bi-variate/Multivariate analysis
print(color.BOLD +color.BLUE +"Let's look at the data types available in the dataset"+ color.END)
data.info()
Let's look at the data types available in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 2240 non-null int64
1 Year_Birth 2240 non-null int64
2 Education 2240 non-null object
3 Marital_Status 2240 non-null object
4 Income 2216 non-null float64
5 Kidhome 2240 non-null int64
6 Teenhome 2240 non-null int64
7 Dt_Customer 2240 non-null object
8 Recency 2240 non-null int64
9 MntWines 2240 non-null int64
10 MntFruits 2240 non-null int64
11 MntMeatProducts 2240 non-null int64
12 MntFishProducts 2240 non-null int64
13 MntSweetProducts 2240 non-null int64
14 MntGoldProds 2240 non-null int64
15 NumDealsPurchases 2240 non-null int64
16 NumWebPurchases 2240 non-null int64
17 NumCatalogPurchases 2240 non-null int64
18 NumStorePurchases 2240 non-null int64
19 NumWebVisitsMonth 2240 non-null int64
20 AcceptedCmp3 2240 non-null int64
21 AcceptedCmp4 2240 non-null int64
22 AcceptedCmp5 2240 non-null int64
23 AcceptedCmp1 2240 non-null int64
24 AcceptedCmp2 2240 non-null int64
25 Complain 2240 non-null int64
26 Z_CostContact 2240 non-null int64
27 Z_Revenue 2240 non-null int64
28 Response 2240 non-null int64
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB
Categorical Variables
Create Visualization for Categorical Variables ¶
%matplotlib inline
Cust_details_col=['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'Complain']
for column in Cust_details_col:
    count_uniques = pd.DataFrame(data[column].value_counts()).rename(columns={column:'Total_Count'}).sort_values('Total_Count',ascending=False)
    # Print the number of unique values in the column
    print(color.BOLD +"Number of unique values in {} is {}".format(column, count_uniques.shape[0]), "\n"+ color.END)
    # Create figure
    fig, ax = plt.subplots(figsize=(5,5))
    ax = sns.barplot(x=count_uniques.index.values.tolist(), y="Total_Count", data=count_uniques, palette='rocket')
    # Rotate labels and align them horizontally to the left
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="left")
    plt.tight_layout()
    plt.show()
    print("\n",'-------------------------------------------------------------------------------------------------')
Number of unique values in Education is 5
-------------------------------------------------------------------------------------------------
Number of unique values in Marital_Status is 8
-------------------------------------------------------------------------------------------------
Number of unique values in Kidhome is 3
-------------------------------------------------------------------------------------------------
Number of unique values in Teenhome is 3
-------------------------------------------------------------------------------------------------
Number of unique values in Complain is 2
-------------------------------------------------------------------------------------------------
Creating New Variables
I will create new variables to extract relevant information from existing variables:
- Create "Children" by adding kids and teenagers.
- Encode "Kidhome" and "Teenhome" into binary variables.
- Create "Family_Size" to indicate the number of people in the household.
- Create "Relation_status" from "Marital_Status".
- Simplify "Education" by combining its levels.
Children¶
Create column "Children" by adding kids and teenagers.
column_names = ['Kidhome', 'Teenhome']
data['Children']= data[column_names].sum(axis=1)
Family_Size¶
If Marital_Status is 'Married'/'Together', count the members in the house as 2, then add the number of children from the column created above. This gives us the family size.
Marital_Status = ['Married','Together']
data['Family_Size'] = data['Marital_Status'].apply(lambda x: 2 if x in Marital_Status else 1) + data['Children']
Teenhome and Kidhome¶
Encode these variables: if the value in either Teenhome or Kidhome is not 0, set it to 1, else 0.
data["Teenhome"] = np.where(data.Teenhome> 0, 1, 0)
data["Kidhome"] = np.where(data.Kidhome> 0, 1, 0)
Relation_status¶
Create a binary variable: 1 if Marital_Status is 'Married' or 'Together', else 0.
Marital_Status = ['Married','Together']
data['Relation_status'] = data['Marital_Status'].apply(lambda x: 1 if x in Marital_Status else 0)
Education¶
Combine the levels of education into 3 groups: "Basic" and "2n Cycle" become "Undergraduate", "Graduation" becomes "Graduate", and "Master" and "PhD" become "Postgraduate".
#Segmenting education levels in three groups
data["Education"] = data["Education"].replace({"Basic":"Undergraduate", "2n Cycle":"Undergraduate",
"Graduation":"Graduate",
"Master":"Postgraduate", "PhD":"Postgraduate"})
- 50.3% of customers in the dataset are Graduates.
- 38.2% of customers in the dataset are Postgraduates.
- 11.4% of customers in the dataset are Undergraduates.
- 64.4% of customers are either married or in a relationship.
- 42.2% of customers have either 1 or 2 kids at home.
- 48.3% of customers have either 1 or 2 teens at home.
- 1.4% of customers have 5 members in the family.
- 39.6% of customers have 3 members in the family.
- 24.6% of 3-member families have a teenager at home.
- 20.8% of 3-member families have a kid at home.
- 14.36% of customers with kids are single parents (Single, Divorced, or Widowed).
- 16.1% of customers with teenagers are single parents (Single, Divorced, or Widowed).
- 99% of customers have not made any complaints in the last 2 years.
Numeric Variables
Create Visualization for Numeric Variables ¶
Income¶
fig, axes = plt.subplots(figsize=(10,5))
sns.boxplot(data=data.Income, color= 'Maroon', orient="h");
data.Income.describe()
count      2216.000000
mean      52247.251354
std       25173.076661
min        1730.000000
25%       35303.000000
50%       51381.500000
75%       68522.000000
max      666666.000000
Name: Income, dtype: float64
print(color.BOLD +'There are {} entries where income is > 1,00,000'.format((data.Income >100000).sum())+ color.END)
print(color.BOLD +'There are {}% entries where income is <= 1,00,000'.format(round(((data.Income <=100000).sum()/data.shape[0])*100,2))+ color.END)
print(color.BOLD +"99% value in income is {}".format(round(data.Income.quantile(0.99),2))+ color.END)
There are 13 entries where income is > 1,00,000
There are 98.35% entries where income is <= 1,00,000
99% value in income is 94458.8
- There are 13 entries where income is more than 1,00,000.
- The mean income of customers in the dataset is 52,247.
- 98.35% of entries have income less than or equal to 1,00,000.
Looks like there is an outlier in the Income variable (max income value 666666); we will replace this value with NaN and impute it later.
print(color.BOLD +'Entries with max income value 666666',"\n"+ color.END)
data[data['Income']==666666]
Entries with max income value 666666
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Children | Family_Size | Relation_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2233 | 9432 | 1977 | Graduate | Together | 666666.0 | 1 | 0 | 02-06-2013 | 23 | 9 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 1 | 3 | 1 |
1 rows × 32 columns
data['Income'] = data['Income'].replace(666666, np.nan)
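As a side note, such extreme values can also be flagged programmatically rather than by eye. Below is a minimal sketch of Tukey's IQR rule on synthetic incomes (illustrative values only, not part of this notebook's pipeline):

```python
import pandas as pd

# Hypothetical income values, with one extreme entry mimicking the 666666 case
income = pd.Series([35000.0, 48000.0, 51000.0, 68000.0, 72000.0, 666666.0])

# Tukey's rule: values beyond Q3 + 1.5 * IQR are flagged as outliers
q1, q3 = income.quantile(0.25), income.quantile(0.75)
upper_fence = q3 + 1.5 * (q3 - q1)

outliers = income[income > upper_fence]
print(outliers.tolist())  # [666666.0]
```

With real data one would still inspect the flagged rows before replacing them with NaN, as done above.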
Recency¶
fig, axes = plt.subplots(figsize=(10,5))
sns.boxplot(data=data.Recency, color= 'Maroon', orient="h");
data.Recency.describe()
count    2240.000000
mean       49.109375
std        28.962453
min         0.000000
25%        24.000000
50%        49.000000
75%        74.000000
max        99.000000
Name: Recency, dtype: float64
The data is distributed evenly and there are no outliers.
Variables under Purchase Source and NumDealsPurchases¶
# Create subset of numeric variables from the dataset
purchase_sourse_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
for column in purchase_sourse_cols:
    count_uniques = pd.DataFrame(data[column].value_counts()).rename(columns={column:'Total_Count'}).sort_values('Total_Count',ascending=False)
    # Print the number of unique values in the column
    print(color.BOLD +"Number of unique values in {} is {}".format(column, count_uniques.shape[0]), "\n"+ color.END)
    # Create figure
    fig, ax = plt.subplots(figsize=(5,5))
    ax = sns.barplot(x=count_uniques.index.values.tolist(), y="Total_Count", data=count_uniques, palette='rocket')
    # Rotate labels and align them horizontally to the left
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="left")
    plt.tight_layout()
    plt.show()
    print("\n",'-------------------------------------------------------------------------------------------------')
Number of unique values in NumDealsPurchases is 15
-------------------------------------------------------------------------------------------------
Number of unique values in NumWebPurchases is 15
-------------------------------------------------------------------------------------------------
Number of unique values in NumCatalogPurchases is 14
-------------------------------------------------------------------------------------------------
Number of unique values in NumStorePurchases is 14
-------------------------------------------------------------------------------------------------
Number of unique values in NumWebVisitsMonth is 16
-------------------------------------------------------------------------------------------------
- Most customers used a discount once or at most twice.
- Most purchases are from the catalogue, followed by store purchases.
- Most customers visited the website 6-8 times in a month and purchased at most 3 times.
Variables under Promotion¶
# Create subset of numeric variables from the dataset
Promotion = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
for column in Promotion:
    count_uniques = pd.DataFrame(data[column].value_counts()).rename(columns={column:'Total_Count'}).sort_values('Total_Count',ascending=False)
    # Print the number of unique values in the column
    print(color.BOLD +"Number of unique values in {} is {}".format(column, count_uniques.shape[0]), "\n"+ color.END)
    # Create figure
    fig, ax = plt.subplots(figsize=(5,5))
    ax = sns.barplot(x=count_uniques.index.values.tolist(), y="Total_Count", data=count_uniques, palette='rocket')
    # Rotate labels and align them horizontally to the left
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="left")
    plt.tight_layout()
    plt.show()
    print("\n",'-------------------------------------------------------------------------------------------------')
Number of unique values in AcceptedCmp1 is 2
-------------------------------------------------------------------------------------------------
Number of unique values in AcceptedCmp2 is 2
-------------------------------------------------------------------------------------------------
Number of unique values in AcceptedCmp3 is 2
-------------------------------------------------------------------------------------------------
Number of unique values in AcceptedCmp4 is 2
-------------------------------------------------------------------------------------------------
Number of unique values in AcceptedCmp5 is 2
-------------------------------------------------------------------------------------------------
Number of unique values in Response is 2
-------------------------------------------------------------------------------------------------
All variables under Promotion take only the values 0 or 1.
- About 15% of customers responded to campaigns, and most of them accepted the deals in the 3rd, 4th, and 5th campaigns.
- Very few customers responded to the 2nd campaign.
- The response to the last campaign is relatively better than to the other campaigns.
Variables under Purchases¶
# Create subset of numeric variables from the dataset
numeric_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
numeric_df = data[numeric_columns]
fig, axes = plt.subplots(figsize=(15, 6))
sns.boxplot(data=numeric_df, color= 'Maroon');
print(color.BOLD +color.BLUE +"\n","Summary statistics of these variables"+ color.END)
numeric_df.describe()
Summary statistics of these variables
MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | |
---|---|---|---|---|---|---|
count | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 |
mean | 303.935714 | 26.302232 | 166.950000 | 37.525446 | 27.062946 | 44.021875 |
std | 336.597393 | 39.773434 | 225.715373 | 54.628979 | 41.280498 | 52.167439 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 23.750000 | 1.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 |
50% | 173.500000 | 8.000000 | 67.000000 | 12.000000 | 8.000000 | 24.000000 |
75% | 504.250000 | 33.000000 | 232.000000 | 50.000000 | 33.000000 | 56.000000 |
max | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 263.000000 | 362.000000 |
print(color.BOLD +color.BLUE +"\n","Total spending in each of Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds categories is"+ color.END)
numeric_df.sum(axis = 0, skipna = True)
Total spending in each of Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds categories is
MntWines            680816
MntFruits            58917
MntMeatProducts     373968
MntFishProducts      84057
MntSweetProducts     60621
MntGoldProds         98609
dtype: int64
- Wines, followed by meat products, are the best-selling categories.
- Total spending on fruits is about 12 times lower than spending on wines.
- Total spending on fruits and sweet products is almost the same.
Creating New Variables
I will create new variables to extract relevant information from existing variables:
- Extract the customer's "Age" from the Year_Birth variable and drop Year_Birth.
- Calculate the number of days a customer has been registered (Days_Registered) with the retail store to date.
- Create "Spending" by adding the total amount spent across categories over the span of two years.
Age¶
Extract the customer's "Age" from the Year_Birth variable; Year_Birth will be dropped later.
current_year = dt.datetime.now().year  # get the current year
data["Age"] = data['Year_Birth'].apply(lambda x: current_year - x)  # subtract to get the age
#drop Year_Birth
fig, axes = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data['Age'], color= 'Maroon', orient="h" );
data.Age.describe()
count    2240.000000
mean       53.194196
std        11.984069
min        26.000000
25%        45.000000
50%        52.000000
75%        63.000000
max       129.000000
Name: Age, dtype: float64
There are entries where the age is above 100; the year of birth looks incorrect for these rows, so we will correct the corresponding age values.
print(color.BOLD +'Print entries with Age above 100',"\n"+ color.END)
data[data['Age']>=100]
Print entries with Age above 100
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Children | Family_Size | Relation_status | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
192 | 7829 | 1900 | Undergraduate | Divorced | 36640.0 | 1 | 0 | 26-09-2013 | 99 | 15 | ... | 0 | 0 | 1 | 3 | 11 | 0 | 1 | 2 | 0 | 122 |
239 | 11004 | 1893 | Undergraduate | Single | 60182.0 | 0 | 1 | 17-05-2014 | 23 | 8 | ... | 0 | 0 | 0 | 3 | 11 | 0 | 1 | 2 | 0 | 129 |
339 | 1150 | 1899 | Postgraduate | Together | 83532.0 | 0 | 0 | 26-09-2013 | 36 | 755 | ... | 0 | 0 | 0 | 3 | 11 | 0 | 0 | 2 | 1 | 123 |
3 rows × 33 columns
Replace the age by removing the hundreds place
data['Age'].replace(to_replace =[122,129,123], value =[22,29,23], inplace=True)
Days_Registered¶
Calculate number of days a customer is registered (Days_Registered) with the retail store till date.
current_day = pd.to_datetime('today').normalize()  # today's date at midnight
#convert variable to date time
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y')
data['Days_Registered'] = (current_day-data['Dt_Customer']).dt.days
#drop Dt_Customer
fig, axes = plt.subplots(figsize=(10, 5))
ax = sns.histplot(data=data['Days_Registered'], color= 'Maroon', shrink=.55)
ax.set_xticks([2800,2850,2900,2950,3000,3050,3100,3150,3200,3250,3300,3350,3400,3450]);
data.Days_Registered.describe()
count    2240.000000
mean     3154.582143
std       202.122512
min      2801.000000
25%      2981.750000
50%      3156.500000
75%      3330.000000
max      3500.000000
Name: Days_Registered, dtype: float64
Distribution of the data seems fair.
Spending¶
Create "Spending" adding total amount spent in various categories over the span of two years.
column_names = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
data['Spending'] = data[column_names].sum(axis=1)
fig, axes = plt.subplots(figsize=(10, 5))
sns.boxplot(data=data['Spending'], color= 'Maroon', orient="h" );
data.Spending.describe()
count    2240.000000
mean      605.798214
std       602.249288
min         5.000000
25%        68.750000
50%       396.000000
75%      1045.500000
max      2525.000000
Name: Spending, dtype: float64
Though there seem to be some outliers, we will also calculate the percentile values between 75% and 100%.
print(color.BOLD +"80% value in Spending is {}".format(round(data.Spending.quantile(0.8),2)), "\n"+ color.END)
print(color.BOLD +"90% value in Spending is {}".format(round(data.Spending.quantile(0.9),2)), "\n"+ color.END)
print(color.BOLD +"99% value in Spending is {}".format(round(data.Spending.quantile(0.99),2)), "\n"+ color.END)
80% value in Spending is 1174.0
90% value in Spending is 1536.2
99% value in Spending is 2126.0
Based on the percentile values, we will not treat the entry above 2500 as an outlier.
NumTotalPurchases¶
Create "NumTotalPurchases" adding purchases from various sources.
column_names = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
data['NumTotalPurchases'] = data[column_names].sum(axis=1)
count_uniques = pd.DataFrame(data['NumTotalPurchases'].value_counts()).rename(columns={'NumTotalPurchases':'Total_Count'}).sort_values('Total_Count',ascending=False)
# parameters in format function.
print(color.BOLD +"Number of unique values in NumTotalPurchases is {}".format(count_uniques.shape[0]), "\n"+ color.END)
# Create Figure
fig, ax = plt.subplots(figsize=(10,5))
ax = sns.barplot(x=count_uniques.index.values.tolist() , y="Total_Count", data=count_uniques, palette= 'rocket')
# rotates labels and aligns them horizontally to left
plt.setp( ax.xaxis.get_majorticklabels(), rotation=90, ha="left" )
plt.tight_layout()
plt.show()
Number of unique values in NumTotalPurchases is 33
NumPromoAccepted¶
Create "NumPromoAccepted" adding all promotions accepted.
column_names = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5' ]
data['NumPromoAccepted'] = data[column_names].sum(axis=1)
count_uniques = pd.DataFrame(data['NumPromoAccepted'].value_counts()).rename(columns={'NumPromoAccepted':'Total_Count'}).sort_values('Total_Count',ascending=False)
# parameters in format function.
print(color.BOLD +"Number of unique values in NumPromoAccepted is {}".format(count_uniques.shape[0]), "\n"+ color.END)
# Create Figure
fig, ax = plt.subplots(figsize=(10,5))
ax = sns.barplot(x=count_uniques.index.values.tolist() , y="Total_Count", data=count_uniques, palette= 'rocket')
# rotates labels and aligns them horizontally to left
plt.setp( ax.xaxis.get_majorticklabels(), rotation=90, ha="left" )
plt.tight_layout()
plt.show()
Number of unique values in NumPromoAccepted is 5
Multivariate Analysis¶
numeric_variables = data[['Income', 'Spending', 'NumTotalPurchases', 'Age']]
corr= numeric_variables.corr()
print(color.BOLD+color.BLUE +"Correlation between Income, Spending, NumTotalPurchases, Age is", "\n"+ color.END)
print(corr,"\n")
print(color.BOLD+color.BLUE +"Pair plot between Income, Spending, NumTotalPurchases, Age", "\n"+ color.END)
sns.pairplot(numeric_variables);
Correlation between Income, Spending, NumTotalPurchases, Age is

                     Income  Spending  NumTotalPurchases       Age
Income             1.000000  0.792650           0.741975  0.197848
Spending           0.792650  1.000000           0.820687  0.112838
NumTotalPurchases  0.741975  0.820687           1.000000  0.174618
Age                0.197848  0.112838           0.174618  1.000000

Pair plot between Income, Spending, NumTotalPurchases, Age
- Income, Spending, and NumTotalPurchases are strongly correlated with one another.
- Age is the least correlated with Income, Spending, and NumTotalPurchases.
print(color.BOLD+color.BLUE +"Correlation between Days_Registered, Spending, NumTotalPurchases, NumPromoAccepted", "\n"+ color.END)
print(data[["Days_Registered", "Spending", "NumTotalPurchases", "NumPromoAccepted"]].corr(),"\n")
print(color.BOLD+color.BLUE +"Pair plot between Days_Registered, Spending, NumTotalPurchases based on NumPromoAccepted", "\n"+ color.END)
sns.pairplot(data,
             x_vars=["Spending", "NumTotalPurchases"],
             y_vars=["Days_Registered"], hue="NumPromoAccepted", height=5, aspect=1/1, palette="icefire");
Correlation between Days_Registered, Spending, NumTotalPurchases, NumPromoAccepted

                   Days_Registered  Spending  NumTotalPurchases  NumPromoAccepted
Days_Registered           1.000000  0.158814           0.162911         -0.011366
Spending                  0.158814  1.000000           0.820687          0.459554
NumTotalPurchases         0.162911  0.820687           1.000000          0.307342
NumPromoAccepted         -0.011366  0.459554           0.307342          1.000000

Pair plot between Days_Registered, Spending, NumTotalPurchases based on NumPromoAccepted
- Days_Registered has low correlation with Spending and NumTotalPurchases.
- NumPromoAccepted has almost no correlation with Days_Registered.
print(color.BOLD+color.BLUE +"Correlation between Income, Spending, NumPromoAccepted", "\n"+ color.END)
print(data[["Income", "Spending", "NumPromoAccepted"]].corr(),"\n")
print(color.BOLD+color.BLUE +"Income vs Spending scatter plot with number of promotions applied", "\n"+ color.END)
sns.pairplot(data, x_vars=["Spending"], y_vars=["Income"],
             hue="NumPromoAccepted", palette="icefire", height=5, aspect=2.5/1);
Correlation between Income, Spending, NumPromoAccepted

                    Income  Spending  NumPromoAccepted
Income            1.000000  0.792650          0.366384
Spending          0.792650  1.000000          0.459554
NumPromoAccepted  0.366384  0.459554          1.000000

Income vs Spending scatter plot with number of promotions applied
print(color.BOLD+color.BLUE +"Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds with respect to NumPromoAccepted"+ color.END)
sns.catplot(x="NumPromoAccepted", y="Spending", kind="box", data=data, height=4, aspect=2.5/1, palette='rocket');
Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds with respect to NumPromoAccepted
- Customers using 1, 2, or 3 promotions spend more.
- Customers earning between 40,000 and 100,000 use more promotions.
Missing Value Treatment¶
# printing variable name and number of null values if any
print(color.BOLD+color.BLUE +"printing variable name and number of null values if any"+ color.END)
col_name = data.isnull().sum(axis=0).sort_values(ascending = False)
col_name
printing variable name and number of null values if any
Income                 25
ID                      0
Response                0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Children                0
NumWebVisitsMonth       0
Family_Size             0
Relation_status         0
Age                     0
Days_Registered         0
Spending                0
NumTotalPurchases       0
AcceptedCmp3            0
NumStorePurchases       0
Year_Birth              0
MntWines                0
Education               0
Marital_Status          0
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntFruits               0
NumCatalogPurchases     0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumPromoAccepted        0
dtype: int64
There are 25 missing values in the Income variable.
We will replace the missing incomes with the group mean of Income by Education and Age.
data['Income'] = data['Income'].fillna(data.groupby(['Education','Age'])['Income'].transform('mean'))
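The groupby-transform pattern used above fills each missing value with the mean of its (Education, Age) group. A minimal sketch on a toy frame (hypothetical values) shows the mechanics:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "Education": ["Graduate", "Graduate", "Postgraduate", "Postgraduate"],
    "Age": [40, 40, 55, 55],
    "Income": [50000.0, np.nan, 80000.0, 90000.0],
})

# Each NaN gets the mean Income of the rows sharing its Education/Age values
df["Income"] = df["Income"].fillna(
    df.groupby(["Education", "Age"])["Income"].transform("mean")
)
print(df["Income"].tolist())  # [50000.0, 50000.0, 80000.0, 90000.0]
```

One caveat: if every income in a group is missing, the group mean is itself NaN and the value stays unfilled, so it is worth re-checking `data['Income'].isna().sum()` afterwards.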
Drop Redundant Variables¶
data[['Z_CostContact', 'Z_Revenue']].describe()
Z_CostContact | Z_Revenue | |
---|---|---|
count | 2240.0 | 2240.0 |
mean | 3.0 | 11.0 |
std | 0.0 | 0.0 |
min | 3.0 | 11.0 |
25% | 3.0 | 11.0 |
50% | 3.0 | 11.0 |
75% | 3.0 | 11.0 |
max | 3.0 | 11.0 |
Z_CostContact and Z_Revenue are constant across all rows, so we will drop both variables.
#Drop redundant variables
to_drop = ['ID', 'Year_Birth', 'Marital_Status', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue']
data = data.drop(to_drop, axis=1)
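Constant columns like these can also be detected programmatically instead of by inspecting describe(). A small sketch on a toy frame (illustrative values, not the full dataset):

```python
import pandas as pd

# Toy frame mimicking the dataset's constant columns
df = pd.DataFrame({
    "Z_CostContact": [3, 3, 3],
    "Z_Revenue": [11, 11, 11],
    "Income": [58138.0, 46344.0, 71613.0],
})

# A column with a single unique value has zero variance and adds nothing to clustering
constant_cols = [c for c in df.columns if df[c].nunique() == 1]
print(constant_cols)  # ['Z_CostContact', 'Z_Revenue']
```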
Correlation heat map between variables¶
corr= data.corr()
fig, ax = plt.subplots(figsize=(14,14))
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, mask = np.abs(corr)<.7,cmap=sns.color_palette("rocket", as_cmap=True), square=True, annot=True, fmt='.2f')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,horizontalalignment='right');
Based on the correlation heat map above, we drop the variables that were used to derive Spending, NumTotalPurchases, and NumPromoAccepted, as the derived variables capture the essence of the individual columns.
#create copy of dataframe for later
data_copy = data.copy()
to_drop = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
data = data.drop(to_drop, axis=1)
Correlation of Various variables with Spending¶
fig, ax = plt.subplots(figsize=(16,8))
corr_spending = pd.DataFrame(data[data.columns[~data.columns.isin(['Education','Spending'])]].corrwith(data['Spending'])).rename(columns={0:'Correlation'}).sort_values('Correlation',ascending=False)
corr_spending
sns.barplot(x=corr_spending.index, y=corr_spending['Correlation'], ax=ax, palette='rocket')
ax.set_ylabel('Correlation with Spending')
for s in ["top","right"]:
    ax.spines[s].set_visible(False)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
plt.show()
print(color.BOLD+color.BLUE +"Correlation of Various variables with Spending"+ color.END)
corr_spending
Correlation of Various variables with Spending
Correlation | |
---|---|
NumTotalPurchases | 0.820687 |
Income | 0.787799 |
NumPromoAccepted | 0.459554 |
Response | 0.265298 |
Days_Registered | 0.158814 |
Age | 0.112838 |
Recency | 0.020433 |
Relation_status | -0.017744 |
Complain | -0.037058 |
NumDealsPurchases | -0.065112 |
Teenhome | -0.149480 |
Family_Size | -0.422907 |
Children | -0.498888 |
NumWebVisitsMonth | -0.500218 |
Kidhome | -0.573386 |
print(color.BOLD+color.BLUE +"Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to Family_Size"+ color.END)
sns.catplot(x="Family_Size", y="Spending", kind="box", data=data, height=4, aspect=2.5/1, palette='rocket');
Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to Family_Size
print(color.BOLD+color.BLUE +"Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to #Children"+ color.END)
sns.catplot(x="Children", y="Spending", kind="box", data=data, height=4, aspect=2.5/1, palette='rocket');
Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to #Children
print(color.BOLD+color.BLUE +"Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to NumWebVisitsMonth"+ color.END)
sns.catplot(x="NumWebVisitsMonth", y="Spending", kind="box", data=data, height=4, aspect=2.5/1, palette='rocket');
Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds wrt to NumWebVisitsMonth
- NumTotalPurchases and Income have a very high positive correlation with Spending.
- Recency and Relation_status have a low correlation with Spending.
- Larger families, i.e. families with children or teens, spend less on wines, fruits, meat products, fish products, sweet products and gold products.
- Total spending decreases as the number of web visits per month increases.
Data Preprocessing
This section covers the data pre-processing needed before the clustering operations.
The following steps are applied to preprocess the data:
- Encoding categorical features
- Scaling Variables using standard scaler
- Creating a subset data frame for dimensionality reduction
Encoding categorical Variables¶
Encode the categorical variable Education to binary variables.
# Encode factors of education into binary creating new variables PG_Education and Grad_Education
data["PG_Education"] = np.where(data.Education=='Postgraduate', 1, 0)
data["Grad_Education"] = np.where(data.Education=='Graduate', 1, 0)
#drop categorical variable education
data = data.drop(['Education'], axis=1)
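For reference, pandas can produce the same style of indicator columns in one call with `get_dummies`. A toy sketch ('Undergraduate' is a made-up third level for illustration, not necessarily a value in this dataset):

```python
import pandas as pd

# Toy frame with assumed education levels
toy = pd.DataFrame({"Education": ["Postgraduate", "Graduate", "Undergraduate"]})
# one indicator column per level; drop_first=True would keep k-1 columns instead
dummies = pd.get_dummies(toy["Education"], prefix="Edu")
print(sorted(dummies.columns))  # ['Edu_Graduate', 'Edu_Postgraduate', 'Edu_Undergraduate']
```

The manual `np.where` recoding above keeps only two of the three indicators, which is equivalent to dropping one redundant level.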
Scaling Variables using standard scaler¶
We will scale all the variables in the data using a standard scaler before building the models.
#Scaling Variables using standard scaler
scaler = StandardScaler()
scaler.fit(data)
scaled_data = pd.DataFrame(scaler.transform(data),columns= data.columns )
Dimensionality Reduction with PCA
Dimensionality reduction is a method of reducing the number of variables in a dataset. When working with large datasets, the larger the number of variables, the more difficult it is to build a model with them. Additionally, some of the variables can be redundant, adding noise to the model.
With this intention, we adopt dimensionality reduction techniques. Dimensionality reduction, or dimension reduction, is the transformation of data from a high-dimensional space into a low-dimensional space so that the low-dimensional representation retains some meaningful properties of the original data, ideally close to its intrinsic dimension (Wikipedia). Dimensionality reduction can be used for noise reduction, data visualization, cluster analysis, or as an intermediate step to facilitate other analyses.
In this section we will use a linear technique for dimensionality reduction called Principal Component Analysis (PCA).
# based on the number of variables, I will reduce the data to 4 principal components
pca = PCA(n_components=4)
pca.fit(scaled_data)
components = pd.DataFrame(pca.transform(scaled_data), columns=(['PC1','PC2','PC3','PC4']))
components.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
PC1 | 2240.0 | 1.982541e-17 | 2.158664 | -5.859272 | -1.454416 | 0.273159 | 1.798085 | 6.393371 |
PC2 | 2240.0 | 1.506731e-17 | 1.528503 | -3.830168 | -1.167739 | 0.090980 | 1.105131 | 5.501350 |
PC3 | 2240.0 | 4.559845e-18 | 1.335365 | -2.821604 | -1.252532 | 0.320393 | 1.163306 | 2.873853 |
PC4 | 2240.0 | 2.755732e-17 | 1.264388 | -2.901653 | -0.917062 | -0.169902 | 0.724398 | 4.970224 |
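The table above describes the component scores; it is also worth checking how much of the total variance the four components retain, via `explained_variance_ratio_`. A sketch on synthetic standardized data (the actual ratios depend on the notebook's variables):

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in: three latent factors, three noisy copies, two independent columns
rng = np.random.default_rng(0)
base = rng.normal(size=(500, 3))
X = np.hstack([base,
               base + 0.1 * rng.normal(size=(500, 3)),
               rng.normal(size=(500, 2))])
X = StandardScaler().fit_transform(X)

pca = PCA(n_components=4).fit(X)
print(pca.explained_variance_ratio_)        # per-component share of variance
print(pca.explained_variance_ratio_.sum())  # cumulative share kept by 4 PCs
```

If the cumulative share is low on the real data, it may be worth keeping more components before clustering.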
Clustering
Clustering is an unsupervised machine learning technique. Through clustering a dataset is divided into clusters or groups where members belonging to same cluster have similar properties. The most common clustering algorithms are K-Means clustering, Hierarchical clustering, Density-based clustering, Model-based clustering, etc.
Steps involved in the Clustering
- Elbow Method to determine the number of clusters to be formed
- Applying Clustering techniques
- Examining the clusters formed via scatter plot
Finding K¶
Finding the optimal number of clusters (K) is a very important part of this algorithm.
A commonly used method for finding the optimal K value is the Elbow Method.
fig, ax = plt.subplots(figsize=(15,6))
# Quick examination of elbow method to find numbers of clusters to make.
print(color.BOLD+color.BLUE +'Elbow method to determine optimal number of clusters:'+ color.END)
X = components.values
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 0)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss,color = "Maroon",marker = '*')
plt.title('The Elbow Method', fontsize=15)
plt.xlabel('Number of clusters', fontsize=10)
plt.ylabel('WCSS', fontsize=10)
plt.show()
Elbow method to determine optimal number of clusters:
Based on the elbow in the plot, we will use K=4 to create the clusters.
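Since WCSS always decreases as K grows, the silhouette score offers a complementary check on the choice of K. A sketch on synthetic blob data (the real PCA components would give different numbers):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Four well-separated synthetic blobs standing in for the component space
centers = [(-6, -6), (-6, 6), (6, -6), (6, 6)]
X_demo, _ = make_blobs(n_samples=600, centers=centers, cluster_std=1.0, random_state=0)

scores = {}
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X_demo)
    scores[k] = silhouette_score(X_demo, labels)  # higher is better, in [-1, 1]
best_k = max(scores, key=scores.get)
print(best_k)  # 4 for this synthetic data
```

Agreement between the elbow and the silhouette peak strengthens the case for K=4.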
K-Means Clustering¶
K-Means Clustering is a well-known clustering algorithm. It assigns data points to clusters with the intention of minimizing the variance within each cluster.
The technique is described here:
k-means clustering (Wikipedia). It is implemented via the KMeans class, and the main configuration to tune is the "n_clusters" hyperparameter, set to the estimated number of clusters in the data.
- Select the number of clusters, K, and initialize K centroids
- Assign each data point to its nearest centroid, using Euclidean or Manhattan distance
- Recalculate the centroid of each cluster as the mean of its assigned points
- Repeat the assignment and update steps until the cluster assignments no longer change
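The steps above can be sketched in a few lines of NumPy (a toy Lloyd's-iteration sketch, not the scikit-learn implementation used in this notebook):

```python
import numpy as np

def kmeans_sketch(X, k, n_iter=20, seed=0):
    rng = np.random.default_rng(seed)
    # step 1: pick k distinct data points as initial centroids
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # step 2: assign each point to its nearest centroid (Euclidean distance)
        d = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = d.argmin(axis=1)
        # step 3: recompute each centroid as the mean of its assigned points
        for j in range(k):
            if np.any(labels == j):
                centroids[j] = X[labels == j].mean(axis=0)
    return labels, centroids

# tiny demo: two well-separated groups of points
X_demo = np.vstack([np.zeros((5, 2)), np.ones((5, 2)) * 10])
labels, _ = kmeans_sketch(X_demo, k=2)
print(labels)  # first five points share one label, last five the other
```

scikit-learn's KMeans adds smarter initialization (k-means++) and convergence checks on top of this same loop.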
fig, ax = plt.subplots(figsize=(12,8))
# define the model
KM_model = KMeans(n_clusters=4)
# fit the model
KM_model.fit(X)
# assign a cluster to each example
yhat_KM = KM_model.predict(X)
# retrieve unique clusters
clusters = unique(yhat_KM)
plt.title('K-means Clustering', fontsize=15)
# create scatter plot for samples from each cluster
for cluster in clusters:
# get row indexes for samples with this cluster
row_ix = where(yhat_KM == cluster)
# create scatter of these samples
plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
#Adding the Clusters feature to the orignal dataframe.
data["Clusters_KM"]= yhat_KM
data_copy["Clusters_KM"]= yhat_KM
Though K-Means gives a decent segregation of the data points, clear demarcations between the clusters are not found.
Agglomerative Clustering¶
Agglomerative clustering involves merging data points (or groups of points) until a desired number of clusters is achieved.
It is one of the hierarchical clustering methods:
Hierarchical clustering (Wikipedia).
It is implemented via the AgglomerativeClustering class, and the main configuration to tune is "n_clusters", an estimate of the number of clusters in the data, e.g. 2.
fig, ax = plt.subplots(figsize=(12,8))
# define the model
model = AgglomerativeClustering(n_clusters=4)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
plt.title(' Agglomerative Clustering', fontsize=15)
# create scatter plot for samples from each cluster
for cluster in clusters:
# get row indexes for samples with this cluster
row_ix = where(yhat == cluster)
# create scatter of these samples
plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
The results of Agglomerative Clustering are very similar to K-Means Clustering. But clustering of middle two groups is better in K-means clustering
BIRCH¶
BIRCH, or Balanced Iterative Reducing and Clustering using Hierarchies¶
BIRCH Clustering involves constructing a tree structure from which cluster centroids are extracted.
BIRCH incrementally and dynamically clusters incoming multi-dimensional metric data points to try to produce the best quality clustering with the available resources (i. e., available memory and time constraints).Reference
It is implemented via the Birch class and the main configuration to tune is the “threshold” and “n_clusters” hyper parameters, the latter of which provides an estimate of the number of clusters.
fig, ax = plt.subplots(figsize=(12,8))
# define the model
model = Birch(threshold=0.01, n_clusters=4)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
plt.title('BIRCH Clustering', fontsize=15)
# create scatter plot for samples from each cluster
for cluster in clusters:
# get row indexes for samples with this cluster
row_ix = where(yhat == cluster)
# create scatter of these samples
plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
The results of BIRCH Clustering are very similar to Agglomerative Clustering.
Gaussian Mixture Model¶
As its name suggests, a Gaussian mixture model summarizes a multivariate probability density function with a mixture of Gaussian probability distributions.
Mixture model (Wikipedia).
It is implemented via the GaussianMixture class, and the main configuration to tune is the "n_components" hyperparameter, used to specify the estimated number of clusters in the data.
fig, ax = plt.subplots(figsize=(12,8))
# define the model
GMM_model = GaussianMixture(n_components=4, random_state=1)
#Randomstate 2432 for PCA5 and 1 for PCA4
# fit the model
GMM_model.fit(X)
# assign a cluster to each example
yhat_GMM = GMM_model.predict(X)
# retrieve unique clusters
clusters = unique(yhat_GMM)
plt.title('Gaussian Mixture Model', fontsize=15)
# create scatter plot for samples from each cluster
for cluster in clusters:
# get row indexes for samples with this cluster
row_ix = where(yhat_GMM == cluster)
# create scatter of these samples
plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
The results of Gaussian Mixture Model looks similar to K-Means Clustering.
Comparing Clustering models with 4 and 5 Principal Components¶
Not much difference is observed between 4 and 5 principal components. We will cluster with 4 principal components and pick the K-Means clustering model.
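Beyond visual inspection, internal validity metrics such as the silhouette score can quantify the comparison between the four algorithms. A sketch on synthetic blob data (scores on the actual PCA components would differ):

```python
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Synthetic 4-blob data standing in for the PCA components
X_demo, _ = make_blobs(n_samples=600, centers=4, cluster_std=1.2, random_state=0)

models = {
    "KMeans": KMeans(n_clusters=4, n_init=10, random_state=0),
    "Agglomerative": AgglomerativeClustering(n_clusters=4),
    "Birch": Birch(threshold=0.01, n_clusters=4),
    "GMM": GaussianMixture(n_components=4, random_state=0),
}
scores = {}
for name, model in models.items():
    labels = model.fit_predict(X_demo)   # all four expose fit_predict
    scores[name] = silhouette_score(X_demo, labels)
    print(f"{name}: silhouette = {scores[name]:.3f}")
```

The model with the highest score on the real components would be a defensible pick alongside the visual comparison.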
Extracting Cluster Characteristics
Unlike in supervised learning, unsupervised models do not have a target variable with which to evaluate performance. In this section we will understand the nature of our clusters using EDA.
data_copy = data_copy.sort_values(by=['Clusters_KM'])
data_copy["Clusters_KM"].replace({0: 'Cluster-0',1: 'Cluster-1',2: 'Cluster-2',3: 'Cluster-3'}, inplace=True)
Distribution of data points between clusters¶
color_list=["tan","maroon", "gold","grey"]
fig, ax1 = plt.subplots(figsize=(10,5))
sns.countplot(x=data_copy["Clusters_KM"], palette=color_list, ax=ax1)
ax1.set_title("Distribution of data points based on K-Means Clustering");
We can see a fairly even distribution of data points between the clusters.
Income vs Spending scatter plot based on K-Means Clusters¶
g0 = sns.pairplot(data_copy, x_vars=["Spending"], y_vars=["Income"],
hue= "Clusters_KM", palette=color_list, height=5, aspect=2.5/1);
g0.fig.suptitle("Income vs Spending scatter plot based on K-Means Clustering", y=1, fontsize=15);
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5))
sns.boxplot(x= "Clusters_KM", y='Income', data=data_copy, palette=color_list, ax=ax1)
ax1.set_title("Income Distribution based on K-Means Clustering")
sns.boxplot(x= "Clusters_KM", y='Spending', data=data_copy, palette=color_list, ax=ax2)
ax2.set_title("Spending Distribution based on K-Means Clustering");
Based on the Income vs Spending scatter plot above, we can characterize the 4 clusters as follows:
Cluster 0: Average Income, Low Spending
Cluster 1: Average Income, High Spending
Cluster 2: Low Income, Low Spending
Cluster 3: High Income, High Spending
Family Size, Age ¶
family = pd.DataFrame(data_copy.groupby(["Clusters_KM"]).agg({'Family_Size': ['min', 'max', 'mean'],'Children': ['min', 'max', 'mean']})).round(0)
family.index.names = ['']
family
Family_Size | Children | |||||
---|---|---|---|---|---|---|
min | max | mean | min | max | mean | |
Cluster-0 | 2 | 5 | 4.0 | 1 | 3 | 2.0 |
Cluster-1 | 2 | 5 | 3.0 | 0 | 3 | 1.0 |
Cluster-2 | 1 | 4 | 2.0 | 0 | 2 | 1.0 |
Cluster-3 | 1 | 3 | 2.0 | 0 | 1 | 0.0 |
print(color.BOLD+color.BLUE +"A look at family size and number of Children in the family"+ color.END)
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5))
labels = ['Cluster-0', 'Cluster-1', 'Cluster-2', 'Cluster-3']
family.iloc[:,2].plot(ax=ax1, kind='bar', title ="Family_Size", color=['tan','maroon'],legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
ax1.set_yticklabels(['0','','1','','2','','3','','4'])
family.iloc[:,5].plot(ax=ax2, kind='bar', title ="Number of Children", color=['tan','maroon'],legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
ax2.set_yticklabels(['0','','','','1','','','','2'])
fig.tight_layout(pad=2)
plt.show()
A look at family size and number of Children in the family
Based on family size and number of children, we can characterize the 4 clusters as follows:
Cluster 0: Average family size of 4, with 1 to 3 children
Cluster 1: Average family size of 3, with about 1 child
Cluster 2: Average family size of 2, with about 1 child
Cluster 3: Average family size of 2, mostly with no children
#family
#pd.DataFrame(data_copy.groupby("Clusters_KM")['Age'].mean()).join(data_copy.groupby("Clusters_GMM")['Age'].mean(), lsuffix='_KM', rsuffix='_GMM').round(0)
kid_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','Kidhome'])['Kidhome'].count().T.unstack())
teen_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','Teenhome'])['Teenhome'].count().T.unstack())
relation_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','Relation_status'])['Relation_status'].count().T.unstack())
kid_KM.index.names = ['']
teen_KM.index.names = ['']
relation_KM.index.names = ['']
members = pd.DataFrame(kid_KM).join(teen_KM,lsuffix='_Kids', rsuffix='_Teens')
members = pd.DataFrame(members).join(relation_KM, rsuffix='_relation')
members
0_Kids | 1_Kids | 0_Teens | 1_Teens | 0 | 1 | |
---|---|---|---|---|---|---|
Cluster-0 | 38 | 424 | 15 | 447 | 142 | 320 |
Cluster-1 | 587 | 44 | 40 | 591 | 196 | 435 |
Cluster-2 | 154 | 473 | 592 | 35 | 243 | 384 |
Cluster-3 | 514 | 6 | 511 | 9 | 215 | 305 |
print(color.BOLD+color.BLUE +"Who are in the family"+ color.END)
fig, ((ax1, ax2),(ax3, ax4)) = plt.subplots(2,2, figsize=(12,10))
labels = ['Cluster-0', 'Cluster-1', 'Cluster-2', 'Cluster-3']
members.iloc[:,[0,1]].plot(ax=ax1, kind='bar', title ="Is there a Kid at home", color=['tan','maroon'],legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax1.legend(["No Kids", "Kids"])
ax1.set_yticklabels(['0','','','','','1']);
members.iloc[:,[2,3]].plot(ax=ax2, kind='bar', title ="Is there a teen at home", color=['tan','maroon'],legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax2.legend(["No Teens", "Teens"])
ax2.set_yticklabels(['0','','','','','1']);
members.iloc[:,[4,5]].plot(ax=ax3, kind='bar', title ="Couples or Single", color=['tan','maroon'],legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax3.legend(["Single","Couples"])
ax3.set_yticklabels(['0','','','','','1']);
sns.boxplot(x= "Clusters_KM", y='Age', data=data_copy, palette=color_list, ax=ax4)
ax4.set_title("Age Distribution based on K-Means Clustering")
#ax4.axis('off')
fig.tight_layout(pad=2)
plt.show()
Who are in the family
Based on the members of the family, we can characterize the 4 clusters as follows:
Cluster 0: Customers with a partner and kids or teenagers, about 55 years of age
Cluster 1: Customers with a partner and teenagers but no kids, about 55 years of age
Cluster 2: Single parents/guardians with a kid, about 40 years of age
Cluster 3: Single people, about 50 years of age
Spending in various categories¶
def sum_by_cluster(list_cols):
return pd.DataFrame(data_copy.groupby(["Clusters_KM"])[list_cols].sum())
list_cols = [ 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
purchases_clusters = sum_by_cluster(list_cols).reset_index().drop('Clusters_KM', axis=1).T
purchases_clusters
0 | 1 | 2 | 3 | |
---|---|---|---|---|
MntWines | 55265 | 283817 | 22840 | 318894 |
MntFruits | 2576 | 19605 | 4526 | 32210 |
MntMeatProducts | 19404 | 99151 | 18094 | 237319 |
MntFishProducts | 4021 | 25947 | 7027 | 47062 |
MntSweetProducts | 3005 | 19867 | 4627 | 33122 |
MntGoldProds | 10284 | 37765 | 12164 | 38396 |
print(color.BOLD+color.BLUE +"Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds based on clusters"+ color.END)
labels = ['Wines', 'Fruits', 'Meat', 'Fish', 'Sweet', 'Gold']
color_list2=["maroon", "gold","grey","salmon","tan","olive"]
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4,1, figsize=(12,16))
purchases_clusters.iloc[:,0].plot(ax=ax1, kind='bar', title ="Cluster 0",legend=False, fontsize=12, color=color_list2).set_xticklabels(labels, rotation=0)
purchases_clusters.iloc[:,1].plot(ax=ax2, kind='bar', title ="Cluster 1",legend=False, fontsize=12, color=color_list2).set_xticklabels(labels, rotation=0)
purchases_clusters.iloc[:,2].plot(ax=ax3, kind='bar', title ="Cluster 2",legend=False, fontsize=12, color=color_list2).set_xticklabels(labels, rotation=0)
purchases_clusters.iloc[:,3].plot(ax=ax4, kind='bar', title ="Cluster 3",legend=False, fontsize=12, color=color_list2).set_xticklabels(labels, rotation=0)
fig.tight_layout(pad=2)
plt.show()
Total spending on Wines, Fruits, MeatProducts, FishProducts, SweetProducts, GoldProds based on clusters
Based on spending on wines, fruits, meat products, fish products, sweet products and gold products, we can characterize the 4 clusters as follows:
Cluster 0: Wines high, meat products average, fruits very low
Cluster 1: Wines very high, meat products high, fruits average
Cluster 2: Wines average, meat products average, gold and fish notable, fruits very low
Cluster 3: Wines very high, meat products very high, fruits average
Customer behavior: Promotions, Complains & Registration
deals_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM'])[['NumDealsPurchases','NumTotalPurchases','Days_Registered','Recency']].mean().round(0))
deals_KM.index.names = ['']
print(color.BOLD+color.BLUE +"Customer response towards promotions for all clusters"+ color.END)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(12,8))
labels = ['Cluster-0', 'Cluster-1', 'Cluster-2', 'Cluster-3']
deals_KM.iloc[:,0].plot(ax=ax1, kind='bar', title ="Number of purchases made with a discount.", color=color_list,legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
ax1.set_yticklabels(['0','','1','','2','','3','','4'])
deals_KM.iloc[:,1].plot(ax=ax3, kind='bar', title ="Purchases made though various sources", color=color_list,legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
ax3.set_yticklabels(['0','','5','','10','','15','','20'])
deals_KM.iloc[:,2].plot(ax=ax4, kind='bar', title ="Days since customer registered with store", color=color_list,legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
deals_KM.iloc[:,3].plot(ax=ax2, kind='bar', title ="Number of days since last visit", color=color_list,legend=False, fontsize=12).set_xticklabels(labels, rotation=0)
fig.tight_layout(pad=2)
plt.show()
Customer response towards promotions for all clusters
Based on customer behavior, we can characterize the 4 clusters as follows:
Cluster 0: On average 4 purchases with promotions and 8 purchases across sources
Cluster 1: On average 3 purchases with promotions and 17 purchases across sources
Cluster 2: On average 2 purchases with promotions and 6 purchases across sources
Cluster 3: On average 1 purchase with promotions and 19 purchases across sources
PromoAccepted_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','NumPromoAccepted'])['NumPromoAccepted'].sum().T.unstack())
Response_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','Response'])['Response'].count().T.unstack())
Complain_KM=pd.DataFrame(data_copy.groupby(['Clusters_KM','Complain'])['Complain'].count().T.unstack())
PromoAccepted_KM.index.names = ['']
Response_KM.index.names = ['']
Complain_KM.index.names = ['']
print(color.BOLD+color.BLUE +"Response to promotions and complaint status"+ color.END)
fig, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(15,10))
labels = ['Cluster-0', 'Cluster-1', 'Cluster-2', 'Cluster-3']
PromoAccepted_KM.plot(ax=ax1, kind='bar', title ="How many promotions did the customer accept", color=color_list2,legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax1.legend(["Never", "1 time", "2 times", "3 times", "4 times"]);
Response_KM.plot(ax=ax2, kind='bar', title ="Customer response to last promotion", color=['tan','maroon'],legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax2.legend(["No response", "Responded"]);
Complain_KM.plot(ax=ax3, kind='bar', title ="Did customer complain in last 2 years", color=['tan','maroon'],legend=True, fontsize=12).set_xticklabels(labels, rotation=0)
ax3.legend(["No complain", "Complained"]);
ax4.axis('off');
Response to promotions and complaint status
Based on the response to promotions and complaint status, we can characterize the 4 clusters as follows:
Cluster 0: Accepted an offer once or at most twice, moderate response to the last campaign, almost nil complaints in the last two years
Cluster 1: Accepted an offer up to three times, moderate response to the last campaign, almost nil complaints in the last two years
Cluster 2: Accepted an offer once, moderate response to the last campaign, a few complained in the last two years
Cluster 3: Accepted offers in almost all campaigns, high response to the last campaign
Customer Profiling
Now that we have formed the clusters and looked at their profiles, purchasing habits and behavior, let us see what the characteristics of each cluster are.
From these characteristics we will get an idea of who our star customers are and which customers need attention.
Cluster 0
This cluster consists of customers around 55 years of age.
They earn about 40000 (moderate income), from a household of 4 to 5 members.
These customers have a partner and 1 to 3 kids or teenagers in the family.
These are low-spending customers who spend mostly on wine, occasionally on meat, and very little on fruits.
These customers made about 10 purchases from various sources, about half of them with a discount.
They accepted an offer once or at most twice, showed a moderate response to the last campaign, and made almost no complaints in the last two years.
Cluster 1
This cluster consists of customers around 55 years of age.
They earn about 60000 (moderate income), from a household of 3 members.
These customers have a partner and at least 1 teenager but no kids in the family.
These are high-spending customers who spend mostly on wine and meat.
These customers made about 20 purchases from various sources, less than 20% of them with a discount.
They accepted an offer at most thrice, showed a moderate response to the last campaign, and made almost no complaints in the last two years.
Cluster 2
This cluster consists of customers around 40 years of age.
They have a low income and are possibly single parents/guardians with a kid.
These customers spend very little, mostly on wine, meat, gold and fish.
These customers made about 6 purchases from various sources, half of them with a discount.
They accepted an offer once, showed a moderate response to the last campaign, and a few complained in the last two years.
Cluster 3
This cluster consists of customers around 50 years of age.
They have a high income, around 100000, and are possibly single or couples without kids.
These customers spend heavily on wine and meat, and occasionally on fruits.
These customers made about 20 purchases from various sources and hardly used discounts.
They accepted offers in almost all campaigns and showed a high response to the last campaign.
Conclusion
In this project, I performed unsupervised clustering for customer segmentation. I used PCA for dimensionality reduction and built clusters using K-Means, Agglomerative, BIRCH and GMM clustering. Based on the plots generated from the clustering models, I selected K-Means clustering and finalized 4 clusters for the given dataset. I then derived the characteristics of each cluster, such as demographics, purchasing habits and behavior. These results can further be used to conduct targeted marketing campaigns that generate more revenue.