In this notebook, we will describe several ways to deal with outliers: trimming, winsorization, capping, and zero-coding.
#Importing the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston  # deprecated and removed in scikit-learn 1.2; needs an older version
import warnings
warnings.filterwarnings('ignore')
boston = load_boston()  # returned as a dictionary-like Bunch object
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
df.head()
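A quick look at the summary statistics also hints at where extreme values might sit; the minimum and maximum can be compared against the quartiles:

df['RM'].describe()  # a min or max far from the quartiles suggests outliers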

sns.histplot(df['RM'], kde=True)  # distplot is deprecated in recent seaborn versions

# The boxplot shows the outliers beyond the whiskers
sns.boxplot(df['RM'])
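The whiskers of the boxplot follow the inter-quartile range (IQR) proximity rule: any point more than 1.5 times the IQR below the first quartile or above the third quartile is flagged as an outlier. As a small illustration, we can compute those bounds for RM explicitly:

q1, q3 = df['RM'].quantile(0.25), df['RM'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print(f'lower bound: {lower:.2f}, upper bound: {upper:.2f}')
print('number of outliers:', ((df['RM'] < lower) | (df['RM'] > upper)).sum())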

Trimming outliers from the dataset
def outliers(data):
    IQR = data.quantile(0.75) - data.quantile(0.25)
    lr = data.quantile(0.25) - (1.5 * IQR)  # lower range
    hr = data.quantile(0.75) + (1.5 * IQR)  # higher range
    return data.loc[(data >= lr) & (data <= hr)]  # keep only values within the range
outliers(df['RM'])  # as we can see, the outliers are gone

sns.boxplot(outliers(df['RM']))

# We can also find outliers using the mean and standard deviation instead of the IQR
def outliers(data, k):
    lr = data.mean() - (data.std() * k)  # k is the number of standard deviations
    hr = data.mean() + (data.std() * k)
    return data.loc[(data >= lr) & (data <= hr)]
outliers(df['RM'],1.5)
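To check the effect, a quick sketch can count how many observations this rule trims and plot the result, just as we did for the IQR version:

trimmed = outliers(df['RM'], 1.5)
print(len(df['RM']) - len(trimmed), 'observations removed')
sns.boxplot(trimmed)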

Performing winsorization
Winsorizing is different from trimming because the extreme values are not removed, but are instead replaced by
other values. For example, data greater than the 95th percentile is replaced by the value at the 95th percentile,
and data less than the 5th percentile is replaced by the value at the 5th percentile.
def fn(data, lw, h):
    lr = data.quantile(lw)  # lower percentile cutoff
    hr = data.quantile(h)   # upper percentile cutoff
    return np.where(data < lr, lr, np.where(data > hr, hr, data))  # replace both tails with the cutoffs
plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
sns.boxplot(df['RM'])
plt.subplot(1,2,2)
sns.boxplot(fn(df['RM'],0.05,0.95))
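For reference, SciPy provides a ready-made routine for this: scipy.stats.mstats.winsorize takes the fraction to clip from each tail and returns the winsorized array, so the following sketch should give the same picture as our fn:

from scipy.stats.mstats import winsorize
rm_wins = pd.Series(winsorize(df['RM'], limits=[0.05, 0.05]))  # clip the bottom and top 5%
sns.boxplot(rm_wins)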

Capping the variable at arbitrary maximum and minimum values
Similar to winsorization, we can replace the extreme values with values closer to the rest of the variable, determining the
maximum and minimum boundaries with the mean plus or minus the standard deviation, or with the inter-quartile range proximity
rule.
def outliers(data):
    IQR = data.quantile(0.75) - data.quantile(0.25)
    lr = data.quantile(0.25) - (1.5 * IQR)  # lower range
    hr = data.quantile(0.75) + (1.5 * IQR)  # higher range
    return np.where(data < lr, lr, np.where(data > hr, hr, data))  # cap instead of dropping

def outliers_mean(data, k):
    lr = data.mean() - (data.std() * k)  # k is the number of standard deviations
    hr = data.mean() + (data.std() * k)
    return np.where(data < lr, lr, np.where(data > hr, hr, data))
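Before plotting, note that pandas offers a one-line equivalent for this kind of capping: Series.clip replaces everything outside the given bounds, so the IQR-based function above can be sketched as:

iqr = df['CRIM'].quantile(0.75) - df['CRIM'].quantile(0.25)
lo = df['CRIM'].quantile(0.25) - 1.5 * iqr
hi = df['CRIM'].quantile(0.75) + 1.5 * iqr
capped = df['CRIM'].clip(lower=lo, upper=hi)  # same result as outliers(df['CRIM'])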
sns.boxplot(outliers(df['CRIM']))  # capped variable

sns.boxplot(df['CRIM'])  # original variable, for comparison

plt.figure(figsize=(10,5))
plt.subplot(1,2,1)
sns.kdeplot(df['CRIM'])
plt.subplot(1,2,2)
sns.kdeplot(outliers(df['CRIM']))

There is one problem with this approach, and with winsorization: when the outliers lie in only one tail, the replaced values
pile up at a single boundary and create the bump shown in the graph above.
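If that bump matters for the analysis, one option (not part of the original code) is to cap only the tail that actually contains outliers; a hypothetical one-sided variant of the function above would be:

def outliers_upper(data):
    # cap only the upper tail, leaving the lower tail untouched
    IQR = data.quantile(0.75) - data.quantile(0.25)
    hr = data.quantile(0.75) + (1.5 * IQR)
    return np.where(data > hr, hr, data)

sns.kdeplot(outliers_upper(df['CRIM']))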
Performing zero-coding – capping the variable at zero
Zero-coding is a variant of bottom-coding and refers to capping a variable, usually at its lower end, at zero. It
is commonly used for variables that cannot take negative values, such as age or income.
# creating a dummy dataset
np.random.seed(29)
x = np.random.randn(200)
y = np.random.randn(200) * 2
z = np.random.randn(200) * 6 + 5
df = pd.DataFrame({'x': x, 'y': y, 'z': z})  # collect the three variables into a DataFrame
df.head()

df.min()  # minimum values are negative

plt.figure(figsize=(15,5))
df.hist(bins= 30)

# Replacing the negative values with zeros
df.loc[df['x'] < 0, 'x'] = 0
df.loc[df['y'] < 0, 'y'] = 0
df.loc[df['z'] < 0, 'z'] = 0
df.hist(bins=30, figsize=(15,5))

Now we can see a spike at zero: all the negative values have been replaced with zeros.
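The same zero-coding can also be written in one line with pandas' clip, which caps every column at a lower bound of zero:

df = df.clip(lower=0)  # equivalent to the three assignments above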