Outlier Detection and Handling Outliers

Overview

Teaching: 130 min
Exercises: 130 min
Questions
  • What methods are available for identifying outliers?

  • How can we implement outlier detection methods?

  • How do we deal with the detected outliers?

Objectives
  • Develop the ability to detect unusual observations that deviate significantly from the bulk of the data and understand strategies for addressing them.

Graphical Outlier Detection

Statistical Outlier Detection

Machine Learning Models

Handling Outliers

Importing Libraries

# Import pandas and numpy
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Zscore
from scipy import stats

# Outlier Detection
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Import the MonthEnd date offset (used to shift dates to quarter-end)
from pandas.tseries.offsets import MonthEnd

# Interactive Widgets
import ipywidgets as widgets

# Display libraries
from IPython.display import display

from PIL import Image

Import the data

Data

# Load the real GDP dataset from the CSV file in the working directory
gdpreal = pd.read_csv(filepath_or_buffer='gdpreal.csv')
# Preview the first five rows
gdpreal.head(n=5)

Convert the ‘Quarter’ column to datetime format and adjust for quarter-end

# Parse the quarter labels (presumably strings such as '2010Q1' - confirm against
# the CSV) as quarterly periods. to_timestamp() yields the first day of each
# quarter; adding MonthEnd(3) rolls that forward to the last day of the
# quarter's third month (e.g. 2010-01-01 -> 2010-03-31).
gdpreal['Quarter'] = pd.PeriodIndex(gdpreal['Quarter'], freq='Q').to_timestamp()
gdpreal['Quarter'] = gdpreal['Quarter'] + MonthEnd(3)

# Convert 'RealGDP' to float, stripping thousands separators when the column
# still holds text. The dtype check keeps this cell safe to re-run: once the
# column is numeric, the `.str` accessor would raise AttributeError.
real_gdp = gdpreal['RealGDP']
if not pd.api.types.is_numeric_dtype(real_gdp):
    # regex=False: the comma is a literal character, not a pattern
    real_gdp = real_gdp.str.replace(',', '', regex=False)
gdpreal['RealGDP'] = real_gdp.astype(float)

gdpreal.head()

Plot the Real GDP data

# Draw the Real GDP series on a single 12x6-inch axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    gdpreal['Quarter'],
    gdpreal['RealGDP'],
    label='Real GDP (Billion Naira)',
    color='blue',
    linestyle='-',
    marker='o',
)

# Title and axis labels
ax.set_title('Nigeria Real GDP (2010 Q1 - 2024 Q3)', fontsize=16)
ax.set_xlabel('Quarter', fontsize=12)
ax.set_ylabel('Real GDP (Billion Naira)', fontsize=12)

# Smaller y tick labels, written as plain numbers instead of scientific notation
plt.yticks(fontsize=10)
ax.ticklabel_format(axis='y', style='plain')

# Dashed grid, rotated x tick labels, legend, and a tight layout
ax.grid(True, linestyle='--', alpha=0.6)
plt.xticks(rotation=45)
ax.legend(fontsize=12)
plt.tight_layout()

# Render the figure
plt.show()

Graphical Outlier Detection: Box Plot

# Drop the 'Quarter' date column; the remaining columns are the ones scanned for outliers
gdpreal_d = gdpreal.drop(columns='Quarter')
gdpreal_d.head()

def plot_boxplot(column):
    """Draw a box plot of one column of the module-level ``gdpreal_d`` frame.

    Parameters
    ----------
    column : str
        Name of the column of ``gdpreal_d`` to plot. It is also used in the
        plot title and as the y-axis label.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Create the figure once and hand its axes to pandas. The size is set only
    # here: boxplot draws on the axes it is given, so a second figsize passed
    # to boxplot would have no effect.
    fig, ax = plt.subplots(figsize=(10, 6))
    gdpreal_d.boxplot(column=column, ax=ax, color='b', patch_artist=True,
                      grid=True, fontsize=12)
    ax.set_title(f'Box Plot of {column}')
    ax.set_ylabel(column)
    plt.show()

# Dropdown listing every column of gdpreal_d that can be plotted
column_selector = widgets.Dropdown(
    description='Column:',
    options=gdpreal_d.columns,
    disabled=False,
)

# Tie the dropdown to plot_boxplot: its current value is passed as the 'column' argument
interactive_plot = widgets.interactive_output(
    f=plot_boxplot,
    controls={'column': column_selector},
)

# Show the dropdown followed by the plot it drives
display(column_selector, interactive_plot)

Statistical Outlier Detection: Interquartile Range (IQR) Method

def detect_outliers_IQR(data, column_names):
    """Print the rows that the 1.5 * IQR rule flags as outliers, column by column.

    For each requested column the quartiles Q1 and Q3 are computed and any value
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` is treated as an outlier.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan. It is modified in place: an ``'IQR_Outlier'`` column
        (1 = outlier, 0 = not) is written for every column processed, so after
        the call it holds the flags of the last column in ``column_names``.
    column_names : iterable of str
        Names of the columns to check.

    Returns
    -------
    None
        The flagged rows and the per-column totals are printed.
    """
    outlier_counts = {}

    for column_name in column_names:
        values = data[column_name]

        # Tukey fences built from the quartiles of this column
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1
        lower_fence = q1 - 1.5 * spread
        upper_fence = q3 + 1.5 * spread

        # Explicit < / > comparisons: a missing value compares False on both
        # sides and therefore is never flagged
        outside_fences = values.lt(lower_fence) | values.gt(upper_fence)
        data['IQR_Outlier'] = outside_fences.astype(int)

        # Rows flagged for the current column
        iqr_outliers = data.loc[data['IQR_Outlier'] == 1]
        outlier_counts[column_name] = len(iqr_outliers)

        print(f"Outliers detected by Interquartile Range (IQR) in {column_name}:")
        print(iqr_outliers)
        print("-----------------------------")

    # Summary of how many rows were flagged per column
    print("Total outlier counts detected by Interquartile Range (IQR) Method:")
    for column_name, count in outlier_counts.items():
        print(f"{column_name}: {count}")

Detect outliers in data using IQR Method

# Work on a copy so the flag column added by the detector does not touch gdpreal_d
df_IQR = gdpreal_d.copy()

# Column labels of the working copy, and the same labels as a plain Python list
data_col_bilate = df_IQR.columns
data_col_list = df_IQR.columns.tolist()

# Run the IQR detector on every column
detect_outliers_IQR(data=df_IQR, column_names=data_col_list)

Statistical Outlier Detection: Z-Score Method

def detect_outliers_zscore(data, column_names, threshold=3):
    """Print the rows whose absolute Z-score exceeds ``threshold``, column by column.

    Missing values are left out when the mean and standard deviation are
    estimated, and are never flagged as outliers themselves.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan. It is modified in place: a ``'ZScore_Outlier'`` column
        (1 = outlier, 0 = not) is written for every column processed, so after
        the call it holds the flags of the last column in ``column_names``.
    column_names : iterable of str
        Names of the numeric columns to check.
    threshold : float, default 3
        Absolute Z-score above which a value is reported as an outlier.

    Returns
    -------
    None
        The flagged rows and the per-column totals are printed.
    """
    # Count of outliers found in each column
    outlier_counts = {}

    for column_name in column_names:
        # A plain float array keeps the result positional and independent of
        # how the installed SciPy version treats pandas objects
        values = np.asarray(data[column_name], dtype=float)

        # nan_policy='omit': with the default ('propagate') a single NaN turns
        # every Z-score into NaN, so no outlier would ever be reported
        with np.errstate(invalid='ignore'):
            z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
            # NaN scores (missing or constant data) compare False here
            is_outlier = z_scores > threshold

        # Mark outliers (1 if outlier, 0 otherwise)
        data['ZScore_Outlier'] = is_outlier.astype(int)

        # Rows flagged for the current column
        z_outliers = data[data['ZScore_Outlier'] == 1]
        outlier_counts[column_name] = z_outliers.shape[0]

        print(f"Outliers detected by Z-Score in {column_name}:")
        print(z_outliers)
        print("-----------------------------")

    # Report the total counts of outliers detected for each column
    print("Total outlier counts detected by Z-score Method:")
    for column_name, count in outlier_counts.items():
        print(f"{column_name}: {count}")

Detect outliers in data using Z-Score Method

# Run the Z-score detector on a copy so gdpreal_d itself gains no flag column
df_zscore = gdpreal_d.copy()
detect_outliers_zscore(data=df_zscore, column_names=data_col_list)

Isolation Forest

How Isolation Forest Works:

Initialize the Isolation Forest model

# Isolation Forest set to treat about 1% of the rows as outliers (contamination);
# the fixed random_state makes the randomised model reproducible between runs
iso_forest = IsolationForest(
    random_state=42,
    contamination=0.01,
)

def detect_outliers_IF(data, column_names):
    """Print the rows that the module-level ``iso_forest`` model labels as outliers.

    The model is refitted on each requested column separately (one feature at a
    time) and rows predicted as ``-1`` are reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan. It is modified in place: an ``'IFOutlier'`` column holding
        the model's predictions is written for every column processed, so after
        the call it holds the labels of the last column in ``column_names``.
    column_names : iterable of str
        Names of the columns to check.

    Returns
    -------
    None
        The flagged rows and the per-column totals are printed.
    """
    outlier_counts = {}

    for column_name in column_names:
        # Double brackets keep a one-column DataFrame, i.e. a 2-D feature table
        single_feature = data[[column_name]]
        data['IFOutlier'] = iso_forest.fit_predict(single_feature)

        # Rows predicted as -1 are the outliers
        outliers = data.loc[data['IFOutlier'] == -1]
        outlier_counts[column_name] = len(outliers)

        print(f"Outliers detected in {column_name}:")
        print(outliers)
        print("-----------------------------")

    # Summary of how many rows were flagged per column
    print("Total outlier counts detected by Isolation Forest:")
    for column_name, count in outlier_counts.items():
        print(f"{column_name}: {count}")

Detect outliers in data using Isolation Forest Method

# Run the Isolation Forest detector on a copy so gdpreal_d itself gains no label column
df_IF = gdpreal_d.copy()

detect_outliers_IF(data=df_IF, column_names=data_col_list)

Local Outlier Factor

lof1

lof2

Initialize the Local Outlier Factor (LOF) model

# Local Outlier Factor using the 20 nearest neighbours of each point,
# set to treat about 1% of the rows as outliers (contamination)
lof = LocalOutlierFactor(
    contamination=0.01,
    n_neighbors=20,
)

def detect_outliers_LOF(data, column_names):
    """Print the rows that the module-level ``lof`` model labels as outliers.

    The model is refitted on each requested column separately (one feature at a
    time) and rows predicted as ``-1`` are reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan. It is modified in place: a ``'LOF_Outlier'`` column
        (-1 = outlier, 1 = not) is written for every column processed, so after
        the call it holds the labels of the last column in ``column_names``.
    column_names : iterable of str
        Names of the columns to check.

    Returns
    -------
    None
        The flagged rows and the per-column totals are printed.
    """
    outlier_counts = {}

    for column_name in column_names:
        # Double brackets keep a one-column DataFrame, i.e. a 2-D feature table
        data['LOF_Outlier'] = lof.fit_predict(data[[column_name]])

        # Rows predicted as -1 are the outliers
        lof_outliers = data.loc[data['LOF_Outlier'] == -1]
        outlier_counts[column_name] = len(lof_outliers)

        print(f"Outliers detected by Local Outlier Factor in {column_name}:")
        print(lof_outliers)
        print("-----------------------------")

    # Summary of how many rows were flagged per column
    print("Total outlier counts detected by Local Outlier Factor:")
    for column_name, count in outlier_counts.items():
        print(f"{column_name}: {count}")

Detect outliers in data using Local Outlier Factor Method

# Run the Local Outlier Factor detector on a copy so gdpreal_d itself gains no label column
df_LOF = gdpreal_d.copy()
detect_outliers_LOF(data=df_LOF, column_names=data_col_list)

Handling Outliers

Capping

# Work on a copy so the uncapped values in gdpreal_d stay available
df_out = gdpreal_d.copy()

# Keep data_col_list a plain list of column names (not a pandas Index), matching
# how it is built for the detection steps
data_col_list = df_out.columns.tolist()

# Cap each column at its own 5th and 95th percentiles: values below the lower
# cap are raised to it, values above the upper cap are lowered to it
for column in data_col_list:
    percentile_5 = df_out[column].quantile(0.05)
    percentile_95 = df_out[column].quantile(0.95)

    # Capping the values
    df_out[column] = df_out[column].clip(lower=percentile_5, upper=percentile_95)

Summary statistics of the data after capping

# Summary statistics of the capped data, transposed so each column becomes a row
df_out.describe().transpose()

Box Plot after capping

def plot_boxplot(column):
    """Draw a box plot of one column of the module-level ``df_out`` (capped) frame.

    Parameters
    ----------
    column : str
        Name of the column of ``df_out`` to plot. It is also used in the plot
        title and as the y-axis label.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Create the figure once and hand its axes to pandas. The size is set only
    # here: boxplot draws on the axes it is given, so a second figsize passed
    # to boxplot would have no effect.
    fig, ax = plt.subplots(figsize=(10, 6))
    df_out.boxplot(column=column, ax=ax, color='b', patch_artist=True,
                   grid=True, fontsize=12)
    ax.set_title(f'Box Plot of {column}')
    ax.set_ylabel(column)
    plt.show()

# Dropdown listing every column of the capped data that can be plotted
column_selector = widgets.Dropdown(
    description='Column:',
    options=df_out.columns,
    disabled=False,
)

# Tie the dropdown to plot_boxplot: its current value is passed as the 'column' argument
interactive_plot = widgets.interactive_output(
    f=plot_boxplot,
    controls={'column': column_selector},
)

# Show the dropdown followed by the plot it drives
display(column_selector, interactive_plot)

Key Points

  • Data quality directly impacts the predictive power of models.

  • Statistical measures such as z-scores or interquartile ranges can be used to flag outliers.

  • Visual techniques (box plots, histograms, scatter plots) help identify unusual observations.

  • Deciding whether to remove, cap, or transform outliers involves a trade-off between losing information and ensuring model robustness.

Copyright © 2024 UNECA-ACS

Contact