Monday, September 6, 2021

Blog 5 Introduction to Pandas DataFrame

Pandas Basics
In [176]:
#Adjusting the cell width and margin
In [184]:
from IPython.display import display, HTML

# CSS overrides for the notebook container, menubar and toolbar widths
# (and the container's left margin), injected into the page as raw HTML.
notebook_css = """
<style>
    div#notebook-container    { width: 100%; }
    div#notebook-container    { margin-left: -2.8%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=notebook_css))
In [148]:
# Importing the libraries
In [1]:
import pandas as pd
import numpy as np
In [ ]:
# Let's create a dummy data frame and
# review the functions
# associated with a data frame
In [76]:
# Build a small demo frame from a 2x3 NumPy array: two rows, three named columns.
df_m = pd.DataFrame(
    data=np.array([[1, 2, 3], [10, 11, 12]]),
    columns=['col1', 'col2', 'col3'],
)
df_m
Out[76]:
col1 col2 col3
0 1 2 3
1 10 11 12
In [120]:
# Number of rows and columns. The output is a (rows, columns) tuple.
# NOTE: at this point in the notebook df_m has 2 rows and 3 columns, so a
# fresh top-to-bottom run shows (2, 3). The (2, 4) recorded below appears to
# come from running this cell after the 'Name' column was added further down.
df_m.shape
Out[120]:
(2, 4)
In [121]:
# Get number of rows (first element of the shape tuple)
df_m.shape[0]
Out[121]:
2
In [122]:
# Get number of columns (second element of the shape tuple).
# NOTE: a fresh top-to-bottom run gives 3 here; the recorded 4 appears to
# include the 'Name' column that is only added further down.
df_m.shape[1]
Out[122]:
4
In [77]:
# Let's look at the top record: head(1) returns the first row
df_m.head(1)
Out[77]:
col1 col2 col3
0 1 2 3
In [78]:
# Let's look at the bottom record: tail(1) returns the last row
df_m.tail(1)
Out[78]:
col1 col2 col3
1 10 11 12
In [81]:
# Change column names by assigning a new list of labels to .columns.
# This relabels df_m itself (no copy is made), so later cells see the new names.
df_m.columns=['Col3','Col4','Col5']
df_m
Out[81]:
Col3 Col4 Col5
0 1 2 3
1 10 11 12
In [82]:
# Data type of each column in the data frame
# (the integer width shown, int32, may differ on other platforms)
df_m.dtypes
Out[82]:
Col3    int32
Col4    int32
Col5    int32
dtype: object
In [ ]:
# Summarising the data using the describe function
In [83]:
# Per-column summary: count, mean, std, min, quartiles and max
df_m.describe()
Out[83]:
Col3 Col4 Col5
count 2.000000 2.000000 2.000000
mean 5.500000 6.500000 7.500000
std 6.363961 6.363961 6.363961
min 1.000000 2.000000 3.000000
25% 3.250000 4.250000 5.250000
50% 5.500000 6.500000 7.500000
75% 7.750000 8.750000 9.750000
max 10.000000 11.000000 12.000000
In [86]:
# Accessing the index (row labels) of the data frame as a plain list
list(df_m.index)
Out[86]:
[0, 1]
In [87]:
# Changing the index of the data frame by assigning new row labels.
# This relabels df_m itself, so later cells see 'Ind1'/'Ind2'.
df_m.index = ['Ind1','Ind2']
In [18]:
# Display the data frame to confirm the new row labels
df_m
Out[18]:
Col3 Col4 Col5
Ind1 1 2 3
Ind2 10 11 12
In [88]:
# Accessing a single column with df['label'] returns a Series.
# The Series is the building block of a DataFrame.
s1=df_m['Col3']
type(s1)
Out[88]:
pandas.core.series.Series
In [89]:
# Adding a new column to the data frame: assign a list with one value per row
df_m['Name']=['Roger','Nadal']
df_m
Out[89]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
Ind2 10 11 12 Nadal
In [90]:
# By default describe() summarises only the numeric columns, so 'Name' is
# left out of the result (describe(include='all') would include it).
df_m.describe()
Out[90]:
Col3 Col4 Col5
count 2.000000 2.000000 2.000000
mean 5.500000 6.500000 7.500000
std 6.363961 6.363961 6.363961
min 1.000000 2.000000 3.000000
25% 3.250000 4.250000 5.250000
50% 5.500000 6.500000 7.500000
75% 7.750000 8.750000 9.750000
max 10.000000 11.000000 12.000000
In [91]:
# Accessing multiple columns: pass a list of labels inside the brackets.
# The result is a DataFrame containing just those columns.
df_m2=df_m[['Col3','Name']]
df_m2
Out[91]:
Col3 Name
Ind1 1 Roger
Ind2 10 Nadal
In [92]:
# Single brackets with one label return a Series
type(df_m['Name'])
Out[92]:
pandas.core.series.Series
In [93]:
# Double brackets (a one-element list of labels) return a DataFrame
type(df_m[['Name']])
Out[93]:
pandas.core.frame.DataFrame
In [94]:
# Selecting rows using numerical slices.
# df_m[:2] selects the rows at positions 0 and 1
# (the stop position, 2, is not included).
df_m[:2]
Out[94]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
Ind2 10 11 12 Nadal
In [95]:
# df_m[:1] selects only the row at position 0
# (the stop position, 1, is not included).
df_m[:1]
Out[95]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
In [97]:
# Filtering a dataset: compare the 'Name' column with "Roger" to get one
# True/False flag per row, kept as a plain Python list.
pos1 = (df_m['Name'] == "Roger").tolist()
pos1
Out[97]:
[True, False]
In [98]:
# Boolean mask: keep only the rows whose flag in pos1 is True
df_m[pos1]
Out[98]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
In [99]:
# Filtering a dataset based on multiple conditions.
# First condition: one True/False flag per row for Name == "Roger".
pos1 = (df_m['Name'] == "Roger").tolist()
pos1
Out[99]:
[True, False]
In [100]:
# Second condition: one True/False flag per row for Col3 == 1.
pos2 = (df_m['Col3'] == 1).tolist()
pos2
Out[100]:
[True, False]
In [105]:
# Combine the two masks element by element so a row is kept only when
# BOTH conditions hold for it.
# `pos1 and pos2` does not do this: `and` on two lists returns one of the
# lists whole (pos2 whenever pos1 is non-empty), ignoring pos1's values.
# With boolean Series instead of lists, the pandas idiom is `mask1 & mask2`.
pos3=[first_ok and second_ok for first_ok, second_ok in zip(pos1, pos2)]
pos3
Out[105]:
[True, False]
In [106]:
# Boolean mask: keep only the rows whose flag in pos3 is True
df_m[pos3]
Out[106]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
In [ ]:
# Using iloc for position-based indexing
In [107]:
# Row at position 0 only (the stop position, 1, is not included)
df_m.iloc[0:1]
Out[107]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
In [108]:
# Rows at positions 0 and 1 (the stop position, 2, is not included)
df_m.iloc[0:2]
Out[108]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
Ind2 10 11 12 Nadal
In [114]:
# First two rows and first column
# (iloc takes a row slice, then a column slice)
df_m.iloc[0:2,0:1]
Out[114]:
Col3
Ind1 1
Ind2 10
In [115]:
# First two rows and first two columns
df_m.iloc[0:2,0:2]
Out[115]:
Col3 Col4
Ind1 1 2
Ind2 10 11
In [116]:
# First two rows and the column at index position 1
# (positions start at 0, so this is the second column)
df_m.iloc[0:2,1:2]
Out[116]:
Col4
Ind1 2
Ind2 11
In [118]:
# First two rows and all columns: the column slice stops at the column count
# (df_m.iloc[0:2, :] is a shorter way to write the same selection)
df_m.iloc[0:2,0:len(df_m.columns)]
Out[118]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
Ind2 10 11 12 Nadal
In [123]:
# All rows and all columns: the row slice stops at the row count (shape[0])
# and the column slice stops at the column count
df_m.iloc[0:df_m.shape[0],0:len(df_m.columns)]
Out[123]:
Col3 Col4 Col5 Name
Ind1 1 2 3 Roger
Ind2 10 11 12 Nadal

Web Scraping Tutorial 4- Getting the busy information data from Popular time page from Google

Popular Times Popular Times In this blog we will try to scrape the ...