Creating a new column based on whether the date value is present between two date ranges¶
In [244]:
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta
from datetime import date
Creating Dummy data¶
In [243]:
date_val = pd.Series(['20/09/2020','22/09/2020','25/09/2020','15/10/2020','20/11/2020','25/12/2020'])
PID=['A','A','A','A','B','B']
metric_val = pd.Series([1,2,3,4,5,6])
df=pd.DataFrame([PID,date_val,metric_val]).T
df.columns=['PID','COLLDATE','Mertic_Val']
# Adding reference date
df['RAND_DATE']='15/09/2020'
df
Out[243]:
PID | COLLDATE | Mertic_Val | RAND_DATE | |
---|---|---|---|---|
0 | A | 20/09/2020 | 1 | 15/09/2020 |
1 | A | 22/09/2020 | 2 | 15/09/2020 |
2 | A | 25/09/2020 | 3 | 15/09/2020 |
3 | A | 15/10/2020 | 4 | 15/09/2020 |
4 | B | 20/11/2020 | 5 | 15/09/2020 |
5 | B | 25/12/2020 | 6 | 15/09/2020 |
Checking data type¶
In [149]:
df.dtypes
Out[149]:
PID object COLLDATE object Mertic_Val object RAND_DATE object dtype: object
Converting dates into proper date format¶
In [245]:
interim_df=df[['COLLDATE','RAND_DATE']].apply(lambda x: pd.to_datetime(x))
interim_df['Metric_Val']=df['Mertic_Val']
interim_df['PID']=df['PID']
interim_df.dtypes
Out[245]:
COLLDATE datetime64[ns] RAND_DATE datetime64[ns] Metric_Val object PID object dtype: object
In [207]:
interim_df
Out[207]:
COLLDATE | RAND_DATE | Metric_Val | PID | |
---|---|---|---|---|
0 | 2020-09-20 | 2020-09-15 | 1 | A |
1 | 2020-09-22 | 2020-09-15 | 2 | A |
2 | 2020-09-25 | 2020-09-15 | 3 | A |
3 | 2020-10-15 | 2020-09-15 | 4 | A |
4 | 2020-11-20 | 2020-09-15 | 5 | B |
5 | 2020-12-25 | 2020-09-15 | 6 | B |
We have to check whether Date_Val is between reference_date + days ranges.¶
For this we will create additional columns, that way it will be easy¶
In [246]:
interim_df['RAND_DATE_Plus_20']=interim_df['RAND_DATE'] + timedelta(days=20)
interim_df['RAND_DATE_Plus_40']=interim_df['RAND_DATE'] + timedelta(days=40)
interim_df['RAND_DATE_Plus_70']=interim_df['RAND_DATE'] + timedelta(days=70)
interim_df
Out[246]:
COLLDATE | RAND_DATE | Metric_Val | PID | RAND_DATE_Plus_20 | RAND_DATE_Plus_40 | RAND_DATE_Plus_70 | |
---|---|---|---|---|---|---|---|
0 | 2020-09-20 | 2020-09-15 | 1 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 |
1 | 2020-09-22 | 2020-09-15 | 2 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 |
2 | 2020-09-25 | 2020-09-15 | 3 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 |
3 | 2020-10-15 | 2020-09-15 | 4 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 |
4 | 2020-11-20 | 2020-09-15 | 5 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 |
5 | 2020-12-25 | 2020-09-15 | 6 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 |
We will now create the Flag¶
In [224]:
def max_cntr(interim_df):
test_df=interim_df
nm1=[i for i in test_df.columns if "RAND_DATE" in i]
nm2=[i for i in test_df.columns if "COLLDATE" in i]
y=test_df[nm1]
x=test_df[nm2]
val0=pd.Series([i for i in (x - y).dt.days])
val1=pd.Series(np.where(val0 > 0,1,0))
test_df.loc[:,['val1']]=val1
test_df['G'] = test_df.groupby('PID').val1.apply(lambda x :(x.diff().ne(0)&x==1)|x==1)
Cntr=test_df.groupby([test_df.G.cumsum()]).G.apply(lambda x : (~x).cumsum())
test_df['Cntr']=Cntr
test_df2=test_df.merge(test_df.groupby(['PID'])['Cntr'].max().reset_index(),on="PID",how="left")
test_df2['Cntr_Flag']=np.where(test_df2['Cntr_x']==test_df2['Cntr_y'],True,False)
z=test_df2['Cntr_Flag']
return(list(z))
In [257]:
pos_0_20_1 = interim_df['COLLDATE'] >= interim_df['RAND_DATE']
pos_0_20_2 = interim_df['COLLDATE'] <= interim_df['RAND_DATE_Plus_20']
pos_0_20 = np.array(pos_0_20_1) & np.array(pos_0_20_2)
pos_0_20
Out[257]:
array([ True, True, True, False, False, False])
In [263]:
val0=[i for i in ( interim_df['COLLDATE'] - interim_df['RAND_DATE_Plus_20'] ).dt.days]
val0
#np.where(val0)
Out[263]:
[-15, -13, -10, 10, 46, 81]
In [258]:
pos_20_40_1 = interim_df['COLLDATE'] >= interim_df['RAND_DATE_Plus_20']
pos_20_40_2 = interim_df['COLLDATE'] <= interim_df['RAND_DATE_Plus_40']
pos_20_40 = np.array(pos_20_40_1) & np.array(pos_20_40_2)
pos_20_40
Out[258]:
array([False, False, False, True, False, False])
In [259]:
pos_40_70_1 = interim_df['COLLDATE'] >= interim_df['RAND_DATE_Plus_40']
pos_40_70_2 = interim_df['COLLDATE'] <= interim_df['RAND_DATE_Plus_70']
pos_40_70 = np.array(pos_40_70_1) & np.array(pos_40_70_2)
pos_40_70
Out[259]:
array([False, False, False, False, True, False])
In [213]:
i="Metric"
In [260]:
interim_df['Flag']=np.where(pos_0_20, i + " at Day 20",
np.where(pos_20_40, i + " at Day 40",
np.where(pos_40_70, i + " at Day 70","No")))
interim_df
Out[260]:
COLLDATE | RAND_DATE | Metric_Val | PID | RAND_DATE_Plus_20 | RAND_DATE_Plus_40 | RAND_DATE_Plus_70 | Flag | |
---|---|---|---|---|---|---|---|---|
0 | 2020-09-20 | 2020-09-15 | 1 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 |
1 | 2020-09-22 | 2020-09-15 | 2 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 |
2 | 2020-09-25 | 2020-09-15 | 3 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 |
3 | 2020-10-15 | 2020-09-15 | 4 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 40 |
4 | 2020-11-20 | 2020-09-15 | 5 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 70 |
5 | 2020-12-25 | 2020-09-15 | 6 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | No |
Creating Counter to determine which Flag is closest to Rand dates plus days¶
In [281]:
bool0=["Day 20" in i for i in interim_df['Flag']]
bool1=["Day 40" in i for i in interim_df['Flag']]
bool2=["Day 70" in i for i in interim_df['Flag']]
val0=[i for i in ( interim_df['COLLDATE'] - interim_df['RAND_DATE_Plus_20'] ).dt.days]
val1=[i for i in ( interim_df['COLLDATE'] - interim_df['RAND_DATE_Plus_40'] ).dt.days]
val2=[i for i in ( interim_df['COLLDATE'] - interim_df['RAND_DATE_Plus_70'] ).dt.days]
In [282]:
interim_df['Diff_Days']=np.where(bool0,val0,
np.where(bool1,val1,
np.where(bool2,val2,0)))
interim_df
Out[282]:
COLLDATE | RAND_DATE | Metric_Val | PID | RAND_DATE_Plus_20 | RAND_DATE_Plus_40 | RAND_DATE_Plus_70 | Flag | Diff_Days | |
---|---|---|---|---|---|---|---|---|---|
0 | 2020-09-20 | 2020-09-15 | 1 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -15 |
1 | 2020-09-22 | 2020-09-15 | 2 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -13 |
2 | 2020-09-25 | 2020-09-15 | 3 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -10 |
3 | 2020-10-15 | 2020-09-15 | 4 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 40 | -10 |
4 | 2020-11-20 | 2020-09-15 | 5 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 70 | -4 |
5 | 2020-12-25 | 2020-09-15 | 6 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | No | 0 |
In [305]:
interim_df['val1']=np.where(interim_df['Diff_Days'] > 0,1,0)
interim_df['G'] = interim_df.groupby(['PID','val1']).val1.apply(lambda x :(x.diff().ne(0)&x==1)|x==1)
Cntr=interim_df.groupby([interim_df.PID,interim_df.val1,interim_df.G.cumsum()]).G.apply(lambda x : (~x).cumsum())
interim_df['Cntr']=Cntr
interim_df
test_df2=interim_df.merge(interim_df.groupby(['PID','Flag'])['Cntr'].max().reset_index(),on=["PID","Flag"],how="left")
test_df2['Cntr_Flag']=np.where(test_df2['Cntr_x']==test_df2['Cntr_y'],True,False)
test_df2['Cntr_Flag']
test_df2.drop(columns=['val1','G','Cntr_x','Cntr_y'],inplace=True)
test_df2
Out[305]:
COLLDATE | RAND_DATE | Metric_Val | PID | RAND_DATE_Plus_20 | RAND_DATE_Plus_40 | RAND_DATE_Plus_70 | Flag | Diff_Days | Cntr_Flag | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-09-20 | 2020-09-15 | 1 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -15 | False |
1 | 2020-09-22 | 2020-09-15 | 2 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -13 | False |
2 | 2020-09-25 | 2020-09-15 | 3 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -10 | True |
3 | 2020-10-15 | 2020-09-15 | 4 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 40 | -10 | True |
4 | 2020-11-20 | 2020-09-15 | 5 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 70 | -4 | True |
5 | 2020-12-25 | 2020-09-15 | 6 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | No | 0 | True |
In [308]:
# Filtering Cntr_Flag as True
interim_df2=test_df2.loc[test_df2['Cntr_Flag']==True,]
interim_df2
Out[308]:
COLLDATE | RAND_DATE | Metric_Val | PID | RAND_DATE_Plus_20 | RAND_DATE_Plus_40 | RAND_DATE_Plus_70 | Flag | Diff_Days | Cntr_Flag | |
---|---|---|---|---|---|---|---|---|---|---|
2 | 2020-09-25 | 2020-09-15 | 3 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 20 | -10 | True |
3 | 2020-10-15 | 2020-09-15 | 4 | A | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 40 | -10 | True |
4 | 2020-11-20 | 2020-09-15 | 5 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | Metric at Day 70 | -4 | True |
5 | 2020-12-25 | 2020-09-15 | 6 | B | 2020-10-05 | 2020-10-25 | 2020-11-24 | No | 0 | True |
Pivoting to get values in Flag as columns¶
In [310]:
final_df=interim_df2.pivot(index=['PID'],columns='Flag', values='Metric_Val').reset_index()
final_df
Out[310]:
Flag | PID | Metric at Day 20 | Metric at Day 40 | Metric at Day 70 | No |
---|---|---|---|---|---|
0 | A | 3 | 4 | NaN | NaN |
1 | B | NaN | NaN | 5 | 6 |
In [81]:
final_df.drop(columns=['No'],inplace=True)
In [311]:
final_df
Out[311]:
Flag | PID | Metric at Day 20 | Metric at Day 40 | Metric at Day 70 | No |
---|---|---|---|---|---|
0 | A | 3 | 4 | NaN | NaN |
1 | B | NaN | NaN | 5 | 6 |