# Function for scoring Categorical Column using HBOS
def Hbos_cat(col_nm):
    """Score a categorical column with a histogram-based outlier score.

    Every value is scored as ``log(count(value) / count(most frequent value))``.
    The most frequent category therefore scores 0 and rarer categories score
    increasingly negative.

    Parameters
    ----------
    col_nm : pandas.Series or array-like
        The categorical values. A Series is used whatever its name; any other
        input is handed to ``pd.DataFrame(col_nm, columns=['Feature'])``.

    Returns
    -------
    pandas.Series
        Float Series named ``'Proportion'`` with one score per input value, in
        input order and labelled 0..n-1. Missing input values get a NaN score.
    """
    if isinstance(col_nm, pd.Series):
        # Building a frame with columns=['Feature'] from a Series that carries
        # a different name does not pick up its values, so rename instead.
        feature = col_nm.rename('Feature')
    else:
        feature = pd.DataFrame(col_nm, columns=['Feature'])['Feature']

    # Work positionally so the scores come back labelled 0..n-1 in input order.
    feature = feature.reset_index(drop=True)

    # Frequency of each category relative to the most frequent one.
    counts = feature.value_counts()
    proportion = counts / counts.max()

    # Look up each row's proportion; values absent from the counts (missing
    # values) stay NaN instead of being dropped from the result.
    scores = np.log(feature.map(proportion).astype(float))
    return scores.rename('Proportion')
# Score the categorical 'Feature' column; the scores are kept in Hbos_score.
# NOTE(review): `df` must already be defined when this line runs - confirm
# where it is created.
Hbos_score = Hbos_cat(df['Feature'])
# HBOS scoring for a numeric column, worked through step by step on a small
# sample. The sample values are stored in x.
x = [12, 11, 10, 9, 8, 10, 11, 14, 17, 20, 50, 60, 70]

# First: the total number of records in the numeric variable.
N = len(x)

# Second: the number of bins to divide the data into.
# NOTE(review): the earlier formula `k<-round(sqrt(N))` rounded k, while this
# code leaves it unrounded - confirm which one is intended.
k = math.sqrt(N)

# Target number of records in each group: N / k, rounded.
records_Each_Group = round(N / k)

# ID1 remembers the original row order; ID2 is the rank after sorting by x.
hb = pd.DataFrame({'x': x})
hb['ID1'] = range(1, N + 1)
hb = hb.sort_values('x')
hb['ID2'] = range(1, N + 1)

# Consecutive ranks share a group label G1, G2, ...
# NOTE(review): dividing by records_Each_Group + 1 puts records_Each_Group rows
# in G1 and records_Each_Group + 1 rows in each later full group - confirm
# that these unequal sizes are intended.
hb['Group'] = [
    'G' + str(rank // (records_Each_Group + 1) + 1) for rank in hb['ID2']
]
g = hb.groupby('Group')['x']


# Group-wise helpers; see
# https://pandas.pydata.org/pandas-docs/stable/groupby.html
def max_f(group):
    """Return the group's highest value repeated for every row of the group."""
    return pd.Series(group.max(), index=group.index, name='Highest')


def min_f(group):
    """Return the group's lowest value repeated for every row of the group."""
    return pd.Series(group.min(), index=group.index, name='Lowest')


# transform (rather than apply) returns a result on hb's own index, so it can
# be assigned straight back as a column. On recent pandas, apply prefixes the
# group key to the index and the column assignment then fails.
hb['Highest'] = g.transform(max_f)
hb['Lowest'] = g.transform(min_f)


# Width of each bin: Highest - Lowest, replaced by 1 when the two are equal so
# that the division below never divides by zero.
def cond_diff(x, y):
    """Return ``x - y``, or 1 when the two values are equal.

    >>> [cond_diff(a, b) for a, b in zip([1, 2], [1, 3])]
    [1, -1]
    """
    return x - y if x != y else 1


hb['Diff_Flag'] = list(map(cond_diff, hb['Highest'], hb['Lowest']))

# Height of each bin: the target records per group divided by the bin width.
hb['Height'] = records_Each_Group / hb['Diff_Flag']


# Calibrated height: a height of zero is replaced by 1.
def cond_diff2(x):
    """Return ``x``, or 1 when ``x`` is zero."""
    return x if x != 0 else 1


hb['Height2'] = list(map(cond_diff2, hb['Height']))

# Normalise Height2 by its largest value so the tallest bin becomes 1.
hb['Normalised_Hieght'] = hb['Height2'] / hb['Height2'].max()

# The score is the log of the normalised height (0 for the tallest bin,
# negative for the others).
hb['hb_Score'] = np.log(hb['Normalised_Hieght'])

# Put the scores back into the original row order (ID1).
hb_final = hb.sort_values('ID1')['hb_Score']