Visual evaluation of 9 common missing values imputation methods

Valery Liamtsau
4 min read · Dec 23, 2023


In this article we will compare imputation methods used to handle missing values in the continuous quantitative variable ‘Age’ of the well-known Titanic df.

# import packages
import pandas as pd
import numpy as np
import seaborn as sns
from miceforest import ImputationKernel
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
pd.options.mode.chained_assignment = None
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read the data
df = pd.read_excel(r'C..\Titanic-Dataset.xlsx')  # adjust the path to your local copy

First, let's look at the initial distribution of missing values in the Titanic df with a heatmap:

plt.figure(figsize=(16, 6))
ax = plt.axes()
sns.heatmap(df.isna().transpose(), cbar=False,
            ax=ax, cmap=sns.cubehelix_palette(as_cmap=True))
plt.title('Missing Values', fontsize=25)
plt.yticks(rotation=0, fontsize=18)
plt.xticks(rotation=80)
plt.show()

‘Cabin’ has too many missing values, so it is not really feasible to impute them, while the ‘Age’ column has less than 19% missing (to check it, run: df[‘Age’].isna().sum()/df.shape[0]). As a rule of thumb, imputation is reasonable if a column has less than 20% of its data missing. This threshold is debatable, though, and depends on the stakeholders' opinion of how to proceed with missing data.
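As a quick sanity check, here is a one-liner (my addition, not part of the original article) that prints the missing-value share for every column:

# share of missing values per column, sorted descending
missing_share = df.isna().mean().sort_values(ascending=False)
print(missing_share[missing_share > 0])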

Let's do some quick data preprocessing to substitute ‘female’/‘male’ in the ‘Sex’ column with something more computer-friendly.

def sex_to_int(x):
    '''
    encode sex: 1 for female, 0 for male
    '''
    if x == 'female':
        return 1
    else:
        return 0

df['Sex'] = df['Sex'].apply(sex_to_int)

Here is the final df we will use to impute the ‘Age’ column. ‘Pclass’, ‘Sex’, ‘SibSp’ and ‘Fare’ will potentially be used to find a correlation with ‘Age’.
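To get a first idea of whether those predictors actually carry a signal, a quick correlation check (a minimal sketch I'm adding, not in the original article) helps:

# linear correlation of each candidate predictor with 'Age'
print(df[['Pclass', 'Sex', 'SibSp', 'Fare', 'Age']].corr()['Age'].drop('Age'))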

Now we will take a look at the methods. These include imputation using the mean, median, mode, random sampling, regression, MICE, KNN, extreme (outlier) values, and grouping by custom features such as ‘Pclass’ and ‘Sex’.

def impute_nan_mean(df, variable):
    '''
    used for MCAR
    '''
    mean = df[variable].mean()
    df[variable + '_mean'] = df[variable].fillna(mean)

    return df


def impute_nan_mode(df, variable):
    '''
    used for MCAR
    '''
    # .mode() returns a Series; take the first (most frequent) value
    mode = df[variable].mode()[0]
    df[variable + '_mode'] = df[variable].fillna(mode)

    return df


def impute_nan_median(df, variable):
    '''
    used for MCAR
    '''
    median = df[variable].median()
    df[variable + '_median'] = df[variable].fillna(median)

    return df


def random_sample_imputation(df):
    '''
    used for MCAR
    '''
    cols_with_missing_values = df.columns[df.isna().any()].tolist()

    for var in cols_with_missing_values:

        df[var + '_random'] = df[var]

        # draw a random sample of observed values, one per missing entry
        random_sample = df[var].dropna().sample(df[var].isnull().sum(),
                                                random_state=0)

        # re-index the randomly extracted sample to the missing positions
        random_sample.index = df[df[var].isnull()].index

        # replace the NaNs
        df.loc[df[var].isnull(), var + '_random'] = random_sample

    return df


def mice_imputation(df):
    '''
    used for MAR, change df to have only X variables
    '''
    df_work = df.copy()

    # fit the MICE model using miceforest's ImputationKernel
    mice_kernel = ImputationKernel(data=df_work,
                                   save_all_iterations=True,
                                   random_state=2023)

    # iterate 2 times and impute columns
    mice_kernel.mice(2)
    df_work = mice_kernel.complete_data()
    df['Age_mice'] = df_work['Age']

    return df


def impute_KNN(df, variable):
    '''
    used for MAR, change df to have only X variables
    '''
    # fit the KNN imputer on the single column
    # (with only one feature, rows missing that feature have no valid
    # neighbours, so KNNImputer falls back to the column mean)
    knn_imputer = KNNImputer(n_neighbors=3)
    inputt = df[variable].values.reshape(-1, 1)

    # impute missing values
    res = knn_imputer.fit_transform(inputt)

    # substitute imputed values for missing values
    df[variable + '_KNN'] = res

    return df
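KNN imputation only really pays off when there are other features to measure similarity on. Here is a sketch of a multivariate variant (my addition, not the author's code; impute_KNN_multivariate is a hypothetical name) that feeds the candidate predictors in alongside ‘Age’:

def impute_KNN_multivariate(df, variable, predictors):
    '''
    sketch: KNN imputation driven by several predictor columns
    (in practice, scale the features first so none dominates the distance)
    '''
    knn_imputer = KNNImputer(n_neighbors=3)
    cols = predictors + [variable]
    imputed = knn_imputer.fit_transform(df[cols])

    # the imputed target is the last column of the output array
    df[variable + '_KNN_multi'] = imputed[:, -1]

    return df

# example call (commented out so the 3x3 comparison grid below stays intact):
# df = impute_KNN_multivariate(df, 'Age', ['Pclass', 'Sex', 'SibSp', 'Fare'])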


def impute_by_Sex_and_Pclass(df):
    '''
    used for MAR
    '''
    # median Age within each (Sex, Pclass) group;
    # inspect it with df.groupby(['Sex', 'Pclass'])['Age'].median()
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')

    # apply the matching group median to the Age NaNs
    df['Age_Sex_Pclass'] = df['Age'].fillna(group_median)

    return df


def impute_Age_by_extreme_Age_values(df):
    '''
    end-of-tail imputation, typically used for MNAR
    '''
    # fill the NaNs with an extreme value from the tail of the distribution
    extreme = df.Age.mean() + 3 * df.Age.std()
    df['Age_extreme'] = df['Age'].fillna(extreme)

    return df

def regression_imputation(df):
    '''
    used for MAR
    '''
    # identify variables with missing data
    missing_vars = ['Age']

    # identify variables to use as predictors
    predictor_vars = ['Pclass', 'SibSp']

    # fit a regression model using Bayesian Ridge
    imputer = IterativeImputer(estimator=BayesianRidge())

    # impute missing values
    imputed_data = imputer.fit_transform(df[predictor_vars + missing_vars])

    # substitute imputed values for missing values
    df['Age_Ridge'] = imputed_data[:, -len(missing_vars):]

    return df
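With all the imputers defined, they need to be applied before anything can be compared. The article doesn't show this glue code, so here is a minimal sketch, assuming the df has been trimmed to ‘Pclass’, ‘Sex’, ‘SibSp’, ‘Fare’ and ‘Age’ so that ‘Age’ is the only column left with NaNs (note that MICE will also see the columns added by the methods before it):

# apply every method; each appends its own imputed column to df
df = impute_nan_mean(df, 'Age')
df = impute_nan_mode(df, 'Age')
df = impute_nan_median(df, 'Age')
df = random_sample_imputation(df)
df = mice_imputation(df)
df = impute_KNN(df, 'Age')
df = impute_by_Sex_and_Pclass(df)
df = impute_Age_by_extreme_Age_values(df)
df = regression_imputation(df)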

The best way to compare something is to visualize it:

def missing_to_visual(df):

    # initialize the figure style
    plt.figure(figsize=(13, 13))
    plt.rcParams['axes.facecolor'] = 'white'

    # one density plot per imputed column
    # (the first 5 columns are the original features, the rest are imputed)
    num = 0
    for column in df.columns[5:]:

        num += 1

        # find the right spot on the plot
        plt.subplot(3, 3, num)
        plt.xticks([], [])
        plt.yticks([], [])
        plt.grid(None)

        # overlay the imputed and the original density estimates
        sns.kdeplot(data=df, x=column, fill=True, color="red", alpha=.4,
                    linewidth=0)
        sns.kdeplot(data=df, x="Age", fill=True, color="blue", alpha=.2,
                    linewidth=0)

        # add title
        plt.title('Distribution of {} vs Age'.format(column), y=-0.1,
                  fontsize=14, fontweight=1, color='black')

    # show the graph
    plt.show()

missing_to_visual(df)

Conclusion:

Interestingly, the ‘Age’ column imputed by the random imputer showed the closest overlap with the original ‘Age’ distribution. To be honest, I had anticipated that methods such as regression, KNN or MICE would do better, because they are specifically designed for data missing at random (MAR), which is the case for the ‘Age’ feature: ‘Age’ is partially dependent on ‘Sex’ and ‘Pclass’. The main problem for the data scientist, and even for the stakeholders, is that we do not really know whether the data is missing completely at random, at random, or not at random, so it is still important to compare different imputation methods and select the one that best fits the original distribution of the feature with missing values.
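A number can back up the visual overlap. For instance, a two-sample Kolmogorov-Smirnov statistic per imputed column (a small sketch I'm adding on top of the article, using scipy) quantifies how far each imputed distribution sits from the observed ‘Age’ values:

from scipy.stats import ks_2samp

# compare each imputed column against the observed (non-missing) ages
observed = df['Age'].dropna()
for column in df.columns[5:]:
    stat, _ = ks_2samp(observed, df[column].dropna())
    print('{}: KS statistic = {:.3f}'.format(column, stat))  # smaller = closer fit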
