Visual evaluation of 9 common missing values imputation methods

Valery Liamtsau
4 min read · Dec 23, 2023


In this article we will compare imputation methods used to handle missing values in the continuous quantitative variable ‘Age’ of the well-known Titanic df.

# import packages
import pandas as pd
import numpy as np
import seaborn as sns
from miceforest import ImputationKernel
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
pd.options.mode.chained_assignment = None
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read the data
df = pd.read_excel(r'C..\Titanic-Dataset.xlsx')  # adjust the path to your local copy

First, let's look at the initial distribution of missing values in the Titanic df with a heatmap:

plt.figure(figsize=(16, 6))
ax = plt.axes()
sns.heatmap(df.isna().transpose(), cbar=False,
            ax=ax, cmap=sns.cubehelix_palette(as_cmap=True))
plt.title('Missing Values', fontsize=25)
plt.yticks(rotation=0, fontsize=18)
plt.xticks(rotation=80)
plt.show()

‘Cabin’ has too many missing values, so it is not really feasible to impute them, while the ‘Age’ column has less than 19% missing (to check it, run: df[‘Age’].isna().sum()/df.shape[0]). As a rule of thumb, imputation is reasonable if a column has less than 20% of its data missing. This threshold is debatable, though, and depends on the stakeholders' opinion of how to proceed with missing data.
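As a quick sanity check, here is a one-liner (my addition, not part of the original article) that prints the missing-value share for every column:

# share of missing values per column, sorted descending
missing_share = df.isna().mean().sort_values(ascending=False)
print(missing_share[missing_share > 0])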

Let's do some quick data preprocessing to substitute ‘female’/‘male’ in the ‘Sex’ column with something more computer-friendly.

def sex_to_int(x):
    '''
    encode sex: 1 for female, 0 for male
    '''
    if x == 'female':
        return 1
    else:
        return 0

df['Sex'] = df['Sex'].apply(sex_to_int)

Here is the final df we will use to impute the ‘Age’ column. ‘Pclass’, ‘Sex’, ‘SibSp’ and ‘Fare’ will potentially be used to find a correlation with ‘Age’.
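To get a first idea of whether those predictors actually carry a signal, a quick correlation check (a minimal sketch I'm adding, not in the original article) helps:

# linear correlation of each candidate predictor with 'Age'
print(df[['Pclass', 'Sex', 'SibSp', 'Fare', 'Age']].corr()['Age'].drop('Age'))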

Now we will take a look at the methods. These include imputation using the mean, median, mode, random sampling, regression, MICE, KNN, extreme (outlier) values, and grouping by custom features such as ‘Pclass’ and ‘Sex’.

def impute_nan_mean(df, variable):
    '''
    used for MCAR
    '''
    mean = df[variable].mean()
    df[variable + '_mean'] = df[variable].fillna(mean)

    return df


def impute_nan_mode(df, variable):
    '''
    used for MCAR
    '''
    # .mode() returns a Series; take the first (most frequent) value
    mode = df[variable].mode()[0]
    df[variable + '_mode'] = df[variable].fillna(mode)

    return df


def impute_nan_median(df, variable):
    '''
    used for MCAR
    '''
    median = df[variable].median()
    df[variable + '_median'] = df[variable].fillna(median)

    return df


def random_sample_imputation(df):
    '''
    used for MCAR
    '''
    cols_with_missing_values = df.columns[df.isna().any()].tolist()

    for var in cols_with_missing_values:

        df[var + '_random'] = df[var]

        # draw a random sample of observed values, one per missing entry
        random_sample = df[var].dropna().sample(df[var].isnull().sum(),
                                                random_state=0)

        # re-index the randomly extracted sample to the missing positions
        random_sample.index = df[df[var].isnull()].index

        # replace the NaNs
        df.loc[df[var].isnull(), var + '_random'] = random_sample

    return df


def mice_imputation(df):
    '''
    used for MAR, change df to have only X variables
    '''
    df_work = df.copy()

    # fit the MICE model using miceforest's ImputationKernel
    mice_kernel = ImputationKernel(data=df_work,
                                   save_all_iterations=True,
                                   random_state=2023)

    # iterate 2 times and impute columns
    mice_kernel.mice(2)
    df_work = mice_kernel.complete_data()
    df['Age_mice'] = df_work['Age']

    return df


def impute_KNN(df, variable):
    '''
    used for MAR, change df to have only X variables
    '''
    # fit the KNN imputer on the single column
    # (with only one feature, rows missing that feature have no valid
    # neighbours, so KNNImputer falls back to the column mean)
    knn_imputer = KNNImputer(n_neighbors=3)
    inputt = df[variable].values.reshape(-1, 1)

    # impute missing values
    res = knn_imputer.fit_transform(inputt)

    # substitute imputed values for missing values
    df[variable + '_KNN'] = res

    return df
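KNN imputation only really pays off when there are other features to measure similarity on. Here is a sketch of a multivariate variant (my addition, not the author's code; impute_KNN_multivariate is a hypothetical name) that feeds the candidate predictors in alongside ‘Age’:

def impute_KNN_multivariate(df, variable, predictors):
    '''
    sketch: KNN imputation driven by several predictor columns
    (in practice, scale the features first so none dominates the distance)
    '''
    knn_imputer = KNNImputer(n_neighbors=3)
    cols = predictors + [variable]
    imputed = knn_imputer.fit_transform(df[cols])

    # the imputed target is the last column of the output array
    df[variable + '_KNN_multi'] = imputed[:, -1]

    return df

# example call (commented out so the 3x3 comparison grid below stays intact):
# df = impute_KNN_multivariate(df, 'Age', ['Pclass', 'Sex', 'SibSp', 'Fare'])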


def impute_by_Sex_and_Pclass(df):
    '''
    used for MAR
    '''
    # median Age within each (Sex, Pclass) group;
    # inspect it with df.groupby(['Sex', 'Pclass'])['Age'].median()
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')

    # apply the matching group median to the Age NaNs
    df['Age_Sex_Pclass'] = df['Age'].fillna(group_median)

    return df


def impute_Age_by_extreme_Age_values(df):
    '''
    end-of-tail imputation, typically used for MNAR
    '''
    # fill the NaNs with an extreme value from the tail of the distribution
    extreme = df.Age.mean() + 3 * df.Age.std()
    df['Age_extreme'] = df['Age'].fillna(extreme)

    return df

def regression_imputation(df):
    '''
    used for MAR
    '''
    # identify variables with missing data
    missing_vars = ['Age']

    # identify variables to use as predictors
    predictor_vars = ['Pclass', 'SibSp']

    # fit a regression model using Bayesian Ridge
    imputer = IterativeImputer(estimator=BayesianRidge())

    # impute missing values
    imputed_data = imputer.fit_transform(df[predictor_vars + missing_vars])

    # substitute imputed values for missing values
    df['Age_Ridge'] = imputed_data[:, -len(missing_vars):]

    return df
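With all the imputers defined, they need to be applied before anything can be compared. The article doesn't show this glue code, so here is a minimal sketch, assuming the df has been trimmed to ‘Pclass’, ‘Sex’, ‘SibSp’, ‘Fare’ and ‘Age’ so that ‘Age’ is the only column left with NaNs (note that MICE will also see the columns added by the methods before it):

# apply every method; each appends its own imputed column to df
df = impute_nan_mean(df, 'Age')
df = impute_nan_mode(df, 'Age')
df = impute_nan_median(df, 'Age')
df = random_sample_imputation(df)
df = mice_imputation(df)
df = impute_KNN(df, 'Age')
df = impute_by_Sex_and_Pclass(df)
df = impute_Age_by_extreme_Age_values(df)
df = regression_imputation(df)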

The best way to compare something is to visualize it:

def missing_to_visual(df):

    # initialize the figure style
    plt.figure(figsize=(13, 13))
    plt.rcParams['axes.facecolor'] = 'white'

    # one density plot per imputed column
    # (the first 5 columns are the original features, the rest are imputed)
    num = 0
    for column in df.columns[5:]:

        num += 1

        # find the right spot on the plot
        plt.subplot(3, 3, num)
        plt.xticks([], [])
        plt.yticks([], [])
        plt.grid(None)

        # overlay the imputed and the original density estimates
        sns.kdeplot(data=df, x=column, fill=True, color="red", alpha=.4,
                    linewidth=0)
        sns.kdeplot(data=df, x="Age", fill=True, color="blue", alpha=.2,
                    linewidth=0)

        # add title
        plt.title('Distribution of {} vs Age'.format(column), y=-0.1,
                  fontsize=14, fontweight=1, color='black')

    # show the graph
    plt.show()

missing_to_visual(df)

Conclusion:

Interestingly, the ‘Age’ column imputed by the random imputer showed the closest overlap with the original ‘Age’ distribution. To be honest, I had anticipated that methods such as regression, KNN or MICE would do better, because they are specifically designed for data missing at random (MAR), which is the case for the ‘Age’ feature: ‘Age’ is partially dependent on ‘Sex’ and ‘Pclass’. The main problem for the data scientist, and even for the stakeholders, is that we do not really know whether the data is missing completely at random, at random, or not at random, so it is still important to compare different imputation methods and select the one that best fits the original distribution of the feature with missing values.
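A number can back up the visual overlap. For instance, a two-sample Kolmogorov-Smirnov statistic per imputed column (a small sketch I'm adding on top of the article, using scipy) quantifies how far each imputed distribution sits from the observed ‘Age’ values:

from scipy.stats import ks_2samp

# compare each imputed column against the observed (non-missing) ages
observed = df['Age'].dropna()
for column in df.columns[5:]:
    stat, _ = ks_2samp(observed, df[column].dropna())
    print('{}: KS statistic = {:.3f}'.format(column, stat))  # smaller = closer fit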
