from IPython.core.display import display, HTML
display(HTML("<style>.container { width:103% !important;margin-left: -2%; }</style>"))
Introduction¶
Customers are the single most important entity in any value chain.Everything and anything under the sun is designed to please customer so that they buy products,use medicines/drugs and what not. They are the ones who actually move the needle and hence it is important to understand what is their different touchpoints with the external stimulus.In other words, how exactly they move in their lets say shopping experience helps companies fine tune the touchpoints to better identify selling opportunities.In this blog, we will take a small dummy example to understand how different customers go through the marketing campaigns driven by a company
We will be using plotly library to create the journey plots or the Sankey Charts.The main task here is to create the dataset that will be used as an input for creating Sankey Chart
import pandas as pd
import numpy as np
import random
Mocking up Input Data¶
individual = np.repeat(list(range(1,21)),5)
timperiod = ['0_week','18_week','36_week','54_week','72_week']*20
Type = np.random.choice(['Email','Freebees','Coupon','Freebees','Vouchers'],100)
df = pd.DataFrame(np.c_[individual,timperiod,Type])
df.columns = ['Individual','timperiod','Type']
df.head()
- There are 20 customers in this data
- Each customer is targeted in 0,18,36,54 and 72nd Week
- Email,Vouchers,Freebees and Coupons are the channels used to target customers
from IPython.display import Image
Image(filename="basic_workflow.jpg")
Brief description of the steps are :
- Step 1:Creating the data at individual, time and channel level which is sorted on time
- Step 2:Compressing the channel(separated by |) at individual level
- Step 3:Adding source and target fields based on sequence
- Step 4:Padding source and target fields based on sequence number
individual = np.repeat(list(range(1,21)),5)
print(timperiod)
Step 1:Creating the data at individual,time and channel level which is sorted on time¶
We already have the data at the said granularity.If the data is not in the desired form, we should use aggregation
Step 2:Compressing the channel¶
df.head()
df['Promo_Combined'] = df.groupby('Individual')['Type'].transform(lambda x: '|'.join(x))
df2 = df[['Individual','Promo_Combined']].drop_duplicates()
d0 = pd.DataFrame(df2['Promo_Combined'])
d0.head()
d1=pd.DataFrame(d0.groupby('Promo_Combined').size())
d1.reset_index(inplace=True)
d1.columns=[['Promo_Combined','Total_Records']]
d1.head()
d1=d0.groupby('Promo_Combined').size().reset_index(name='Total_Records')
d1.head()
Step 3:Adding source and target¶
For every compressed channel at an individual level, create source and target fields by traversing through the length of the compressed value and assign source and target values in each iteration
Logic building using single individual example¶
test=d1['Promo_Combined'][:1]
test
Getting individual components
path_descr=test[0].split("|")
path_descr
Getting length of the element
path_len=len(test[0].split("|"))
path_len
Total_Count=pd.Series(d1['Total_Records'][:1])
Now we have to traverse through path_descr and as we go along we will create source and target values where the first element in each iteration will be the source and next element will be the target
l1=[]
for i in range(0,path_len):
if(path_len!=1 and i!=(path_len-1)):
source=pd.Series(str(i+1) + "_" + path_descr[i])
target=pd.Series(str(i+2) + "_" + path_descr[i+1])
Total_Count=pd.Series(d1['Total_Records'][:1])
interim_df0=pd.DataFrame(np.c_[source,target,Total_Count],columns = ["source","target","Total_Count"])
l1.append(interim_df0)
else:
source=pd.Series(str(i+1) + "_" + path_descr[i])
target=pd.Series("End of Activity")
Total_Count=pd.Series(d1['Total_Records'][:1])
interim_df0=pd.DataFrame(np.c_[source,target,Total_Count],columns = ["source","target","Total_Count"])
l1.append(interim_df0)
interim_df=pd.concat(l1)
interim_df.head()
We saw how the compressed sequence '['Coupon', 'Freebees', 'Freebees', 'Email', 'Freebees']' was converted into source and target data frame
d1.shape[0]
Repeating the above process for all sequences in d1¶
l1=[]
for j in range(0,d1.shape[0]):
test=d1['Promo_Combined'][j]
path_descr=test.split("|")
path_len=len(test.split("|"))
l2=[]
for i in range(0,path_len):
if(path_len!=1 and i!=(path_len-1)):
source=pd.Series(str(i+1) + "_" + path_descr[i])
target=pd.Series(str(i+2) + "_" + path_descr[i+1])
Total_Count=pd.Series(d1['Total_Records'][:1])
interim_df0=pd.DataFrame(np.c_[source,target,Total_Count],columns = ["source","target","Total_Count"])
l2.append(interim_df0)
else:
source=pd.Series(str(i+1) + "_" + path_descr[i])
target=pd.Series("End of Activity")
Total_Count=pd.Series(d1['Total_Records'][:1])
interim_df0=pd.DataFrame(np.c_[source,target,Total_Count],columns = ["source","target","Total_Count"])
l2.append(interim_df0)
interim_df=pd.concat(l2)
l1.append(interim_df)
interim_df2=pd.concat(l1)
interim_df2.head(10)
Step 4:Padding up source and target¶
The steps in the code have been labelled as # 1, # 2,# 3 and # 4.Their description is as below
- 1: Removing NA values
- 2:Aggregating total customer count at source and target level
- 3:Creating source padded variable as Actual_source
- 4:Creating target padded variable as Actual_target
1: Removing NA values¶
interim_df3=interim_df2.dropna()
interim_df3.head()
2:Aggregating total customer count at source and target level¶
interim_df4=interim_df3.groupby(['source','target'])['Total_Count'].sum().reset_index(name='Total_Records')
interim_df4.head()
def space_padding(x):
y=x.split("_")[0]
try:
y1=int(y)
y2=y1*"_"
z=y2 + x.split("_")[1]
except:
z=y
return(z)
# Testing the function
space_padding("3_Vouchers")
Adding '_' padding source¶
source_ls=list(interim_df4['source'])
Actual_source=[space_padding(x) for x in source_ls]
print(Actual_source)
Adding '_' padding target¶
target_ls=list(interim_df4['target'])
Actual_target=[space_padding(x) for x in target_ls]
print(Actual_target)
interim_df4['Actual_source']=Actual_source
interim_df4['Actual_target']=Actual_target
interim_df4.head()
interim_df4['Actual_source2'] = interim_df4['Actual_source'].apply(lambda x: x.replace("_", " "))
interim_df4['Actual_target2'] = interim_df4['Actual_target'].apply(lambda x: x.replace("_", " "))
interim_df4.head(10)
Step 5:Setting up Node and label for Sankey Chart¶
lb_source=list(np.unique(np.array(interim_df4['Actual_source2'])))
lb_target=list(np.unique(np.array(interim_df4['Actual_target2'])))
lb_combined=list(np.unique(np.array(lb_source + lb_target)))
print(lb_combined)
Step 6:Setting up Node number and color for nodes¶
node_number=list(range(len(lb_combined)))
node_number
actual_label=[x.lstrip() for x in lb_combined]
print(actual_label)
lb_df=pd.DataFrame(np.c_[lb_combined,actual_label,node_number])
lb_df.columns=['label','actual_label','node_number']
lb_df.head()
color=np.where(lb_df.actual_label == 'Coupon',"blue",
np.where(lb_df.actual_label == 'Email',"red",
np.where(lb_df.actual_label == 'Freebees',"orange",
np.where(lb_df.actual_label == 'Vouchers',"grey","black"))))
lb_df['color']=color
lb_df.head()
Step 7:Creating the Sankey Chart¶
interim_df4.head(2)
lb_df.head(2)
Getting node number for source labels¶
lb_df=lb_df.rename({'label': 'Actual_source2'}, axis=1)
lb_df_interim=lb_df[['Actual_source2','node_number']]
interim_df5=interim_df4.merge(lb_df_interim)
interim_df5=interim_df5.rename({'node_number': 'source_label'}, axis=1)
interim_df5.head()
Getting node number for target labels¶
lb_df=lb_df.rename({'Actual_source2': 'Actual_target2'}, axis=1)
lb_df.head()
lb_df_interim=lb_df[['Actual_target2','node_number']]
interim_df6=interim_df5.merge(lb_df_interim)
interim_df6=interim_df6.rename({'node_number': 'target_label'}, axis=1)
interim_df6.head()
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
fig = go.Figure(data=[go.Sankey(
node = dict(
pad = 15,
thickness = 20,
line = dict(color = "black", width = 0.5),
label = lb_combined,
color = list(lb_df['color'])
),
link = dict(
source = list(interim_df6['source_label']),
target = list(interim_df6['target_label']),
value = list(interim_df6['Total_Records'])
))])
fig.update_layout(title_text="Customer Marketing Journey", font_size=10)
fig.show()
No comments:
Post a Comment