In [176]:
import pandas as pd
import numpy as np
Q1:Calculate the number of characters in each word in a series¶
In [177]:
s1=pd.Series(['abc','abcd','abcde'])
s1
Out[177]:
0 abc 1 abcd 2 abcde dtype: object
In [6]:
s1_len=[len(i) for i in s1]
s1_len
Out[6]:
[3, 4, 5]
We can also use map and lambda to get the length¶
In [8]:
s1_lenv2=s1.map(lambda x: len(x))
s1_lenv2
Out[8]:
0 3 1 4 2 5 dtype: int64
Q2:Subtract mean from each number in a series¶
In [15]:
s2=pd.Series([1,2,3])
s2
Out[15]:
0 1 1 2 2 3 dtype: int64
In [16]:
s2.map(lambda x: x-np.mean(s2))
Out[16]:
0 -1.0 1 0.0 2 1.0 dtype: float64
Q3:Compute difference of consequtive numbers of a series¶
In [19]:
s1=pd.Series([1,2,3,4,5])
s1
Out[19]:
0 1 1 2 2 3 3 4 4 5 dtype: int64
In [21]:
s2=s1.shift(1)
s2
Out[21]:
0 NaN 1 1.0 2 2.0 3 3.0 4 4.0 dtype: float64
In [22]:
s3=s2-s1
s3
Out[22]:
0 NaN 1 -1.0 2 -1.0 3 -1.0 4 -1.0 dtype: float64
It can also be done using diff¶
In [24]:
s3=s1.diff()
s3
Out[24]:
0 NaN 1 1.0 2 1.0 3 1.0 4 1.0 dtype: float64
Q4:Convert a series of date-strings to a timeseries¶
Solution 1:Using datatime function in pandas¶
In [41]:
s1=pd.Series(['20 Jan 2022','05/02/2018','05-02-1987','20220302','2015-06-30T20:13'])
s1
Out[41]:
0 20 Jan 2022 1 05/02/2018 2 05-02-1987 3 20220302 4 2015-06-30T20:13 dtype: object
In [42]:
pd.to_datetime(s1)
Out[42]:
0 2022-01-20 00:00:00 1 2018-05-02 00:00:00 2 1987-05-02 00:00:00 3 2022-03-02 00:00:00 4 2015-06-30 20:13:00 dtype: datetime64[ns]
Solution 2: Using dateutil parser¶
In [37]:
import dateutil
from dateutil.parser import parse
In [38]:
s1.map(lambda x: parse(x))
Out[38]:
0 2022-01-20 1 2018-05-02 2 1987-05-02 dtype: datetime64[ns]
Q5: Day of month, week number, day of year and day of week from a series of date strings¶
In [43]:
s1=pd.Series(['20 Jan 2022','05/02/2018','05-02-1987','20220302','2015-06-30T20:13'])
s1
Out[43]:
0 20 Jan 2022 1 05/02/2018 2 05-02-1987 3 20220302 4 2015-06-30T20:13 dtype: object
In [51]:
s2=pd.to_datetime(s1)
s2
Out[51]:
0 2022-01-20 00:00:00 1 2018-05-02 00:00:00 2 1987-05-02 00:00:00 3 2022-03-02 00:00:00 4 2015-06-30 20:13:00 dtype: datetime64[ns]
Month Number¶
In [57]:
s2.dt.month
Out[57]:
0 1 1 5 2 5 3 3 4 6 dtype: int64
Month Name¶
In [63]:
s2.dt.month_name()
Out[63]:
0 January 1 May 2 May 3 March 4 June dtype: object
year¶
In [64]:
s2.dt.year
Out[64]:
0 2022 1 2018 2 1987 3 2022 4 2015 dtype: int64
Date¶
In [66]:
s2.dt.day
Out[66]:
0 20 1 2 2 2 3 2 4 30 dtype: int64
Q6: Convert year-month string to dates corresponding to the 4th day of the month¶
In [73]:
s1=pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
s1
Out[73]:
0 Jan 2010 1 Feb 2011 2 Mar 2012 dtype: object
Solution 1:Using parse function¶
In [78]:
s1.map(lambda x: parse('04 ' + x))
Out[78]:
0 2010-01-04 1 2011-02-04 2 2012-03-04 dtype: datetime64[ns]
Solution 2:Using pandas datetime function¶
In [80]:
s1.map(lambda x: pd.to_datetime('04 ' + x))
Out[80]:
0 2010-01-04 1 2011-02-04 2 2012-03-04 dtype: datetime64[ns]
Q7: Get the mean of a series grouped by another series¶
In [81]:
s1=pd.Series(np.random.rand(10))
s1
Out[81]:
0 0.970426 1 0.174701 2 0.034530 3 0.339069 4 0.418222 5 0.072938 6 0.302780 7 0.911173 8 0.156089 9 0.344069 dtype: float64
In [82]:
s2=pd.Series(np.random.choice(['A','B','C'],10))
s2
Out[82]:
0 A 1 A 2 C 3 C 4 A 5 A 6 B 7 A 8 C 9 B dtype: object
In [86]:
df=pd.DataFrame([s1,s2]).T
df.columns=['Values','Categories']
df
Out[86]:
Values | Categories | |
---|---|---|
0 | 0.970426 | A |
1 | 0.174701 | A |
2 | 0.03453 | C |
3 | 0.339069 | C |
4 | 0.418222 | A |
5 | 0.072938 | A |
6 | 0.30278 | B |
7 | 0.911173 | A |
8 | 0.156089 | C |
9 | 0.344069 | B |
In [92]:
df.groupby('Categories').mean().reset_index()
Out[92]:
Categories | Values | |
---|---|---|
0 | A | 0.509492 |
1 | B | 0.323424 |
2 | C | 0.176563 |
Q8: Replace missing spaces in a string with the least frequent character¶
In [93]:
my_str = 'dbc deb abed gade'
my_str
Out[93]:
'dbc deb abed gade'
In [98]:
my_str_individual=[i for i in my_str]
my_str_individual
Out[98]:
['d', 'b', 'c', ' ', 'd', 'e', 'b', ' ', 'a', 'b', 'e', 'd', ' ', 'g', 'a', 'd', 'e']
In [105]:
df=pd.DataFrame(my_str_individual)
df.columns=['String']
df
Out[105]:
String | |
---|---|
0 | d |
1 | b |
2 | c |
3 | |
4 | d |
5 | e |
6 | b |
7 | |
8 | a |
9 | b |
10 | e |
11 | d |
12 | |
13 | g |
14 | a |
15 | d |
16 | e |
Solution 1:Finding least frequent item and then replacing it using position based repalcement¶
In [117]:
df2=df.groupby('String').size().reset_index()
df2.columns=['String','Count']
df3=df2.sort_values('Count')
df3
Out[117]:
String | Count | |
---|---|---|
3 | c | 1 |
6 | g | 1 |
1 | a | 2 |
0 | 3 | |
2 | b | 3 |
5 | e | 3 |
4 | d | 4 |
Taking c as the alphabet as it is least frequent¶
In [131]:
required_alphabet=list(df3['String'])[0]
required_alphabet
Out[131]:
'c'
Replacing space with 'c'¶
In [147]:
print(my_str_individual)
['d', 'b', 'c', ' ', 'd', 'e', 'b', ' ', 'a', 'b', 'e', 'd', ' ', 'g', 'a', 'd', 'e']
In [165]:
s1=pd.Series(my_str_individual)
s1
Out[165]:
0 d 1 b 2 c 3 4 d 5 e 6 b 7 8 a 9 b 10 e 11 d 12 13 g 14 a 15 d 16 e dtype: object
In [166]:
pos=s1.isin([" "])
pos
Out[166]:
0 False 1 False 2 False 3 True 4 False 5 False 6 False 7 True 8 False 9 False 10 False 11 False 12 True 13 False 14 False 15 False 16 False dtype: bool
In [167]:
s1[pos]=required_alphabet
In [168]:
s1
Out[168]:
0 d 1 b 2 c 3 c 4 d 5 e 6 b 7 c 8 a 9 b 10 e 11 d 12 c 13 g 14 a 15 d 16 e dtype: object
In [175]:
s2=list(s1)
s3="".join(s2)
s3
Out[175]:
'dbccdebcabedcgade'
Solution 2: Using replace function¶
In [170]:
s2="".join(s1.replace(' ', required_alphabet))
s2
Out[170]:
'dbccdebcabedcgade'
Q10: Count the number of missing values in each column¶
In [199]:
df=pd.DataFrame([[" ","B"," "],[1,2," "]]).T
df.columns=['Name','Salary']
df
# So there are two missing values in Name and one missing value in Salary
Out[199]:
Name | Salary | |
---|---|---|
0 | 1 | |
1 | B | 2 |
2 |
lets use a small example to see how it can be done and then we will use apply function with lambda to perform it on all the columns¶
In [188]:
s1=pd.Series([1,2," "," "])
s1
Out[188]:
0 1 1 2 2 3 dtype: object
The above series has two missing values¶
In [197]:
total_count = s1[s1==" "].count()
total_count
Out[197]:
2
In [200]:
df.apply(lambda x : x[x==" "].count())
Out[200]:
Name 2 Salary 1 dtype: int64