Introduction to Pandas 27 Jan 2019

A brief introduction to pandas

# Import packages
import numpy as np
import pandas as pd

Series

“Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).”

# Create a Series from a ndarray
s = pd.Series(np.arange(1,20,5))
# Indexes are created by default
s
0     1
1     6
2    11
3    16
dtype: int32
# Create a Series with indexes
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Query the indexes of a Series
s.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
# Create a Series from a dict
d = {'k1': 'hello', 'k2': 10, 'k3': 19.8}
pd.Series(d)
k1    hello
k2       10
k3     19.8
dtype: object
# Create a Series from a dict
d = {'k1': 1.0, 'k2': 111, 'k3': 19.8}
pd.Series(d)
k1      1.0
k2    111.0
k3     19.8
dtype: float64
# Create a Series from a scalar value
s = pd.Series(10., index=['a', 'b', 'c'])
s
a    10.0
b    10.0
c    10.0
dtype: float64
# Query the data type
s.dtype
dtype('float64')
# Series to ndarray-like
s.values
array([10., 10., 10.])
# Access values by index values
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s['e'] 
0.22115728602833706
# Access and Set values by index values
s['a'] = 0
s
a    0.000000
b   -0.378787
c   -0.380565
d   -1.015425
e    0.221157
dtype: float64
# Query if index exists
'c' in s
True
# Sort values
s.sort_values(ascending=False)
e    0.221157
a    0.000000
b   -0.378787
c   -0.380565
d   -1.015425
dtype: float64

DataFrame

“DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects”

“Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments”

# Create a DataFrame from a dict of Series
d = {'first': pd.Series([10, 20, 30], index=['a', 'b', 'c']),
     'second': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df
first second
a 10.0 1
b 20.0 2
c 30.0 3
d NaN 4
# Query the indexes
df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
# Query the columns
df.columns
Index(['first', 'second'], dtype='object')
# Query the datatypes
df.dtypes
first     float64
second      int64
dtype: object
# Create a dataframe picking some indexes
pd.DataFrame(d, index=['c', 'b', 'd'])
first second
c 30.0 3
b 20.0 2
d NaN 4
# Create a dataframe picking some columns too
pd.DataFrame(d, index=['c', 'b', 'd'], columns=['second', 'third'])
second third
c 3 NaN
b 2 NaN
d 4 NaN
# Create a dataframe from a dict of ndarrays/list
d = {'first': [10, 20, 30, 40], 'second': [1.1, 1.2, 1.3, 1.4]}
df = pd.DataFrame(d)
df
first second
0 10 1.1
1 20 1.2
2 30 1.3
3 40 1.4
# Create dataframe defining indexes
pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])
first second
a 10 1.1
b 20 1.2
c 30 1.3
d 40 1.4
# Create a dataframe from a NumPy array
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
             columns=list('ABCD'))
df
A B C D
a -0.396688 0.311491 -0.333861 0.168932
b 0.525654 0.834869 1.636704 0.393203
c 1.461059 1.011699 0.089861 -1.813207
d 2.069092 1.195152 -1.061165 -0.777508
e 1.453618 -0.481199 0.995436 0.367830
f -1.236010 -1.037308 0.981124 -0.903936
g -0.839587 0.049658 -0.431221 0.603668
h -1.420413 0.412668 -0.963789 0.471481
i 0.385292 0.831554 -0.781036 -1.117775
j 0.747233 0.215479 -0.535672 -0.231791
# View first rows
df.head()
A B C D
a 0.260968 1.462966 0.935928 -1.416606
b -0.158592 1.765306 0.153076 -0.099588
c -3.704010 0.346566 1.709313 3.173983
d -0.454166 1.719496 0.095962 -1.357128
e -2.583424 -0.319739 -0.212480 1.210537
# View last rows
df.tail(3)
A B C D
h 0.526778 -0.383277 0.599304 0.267912
i 0.518058 2.062913 1.682550 -0.083072
j -0.603516 0.332087 -1.479013 1.254228
# Return a numpy representation
A = df.values
A
array([[-0.39668765,  0.31149099, -0.33386089,  0.16893183],
       [ 0.5256545 ,  0.83486865,  1.63670404,  0.39320261],
       [ 1.46105881,  1.01169944,  0.08986103, -1.81320698],
       [ 2.06909247,  1.19515187, -1.06116542, -0.77750805],
       [ 1.45361793, -0.48119892,  0.9954361 ,  0.36783024],
       [-1.23601035, -1.03730773,  0.98112443, -0.90393631],
       [-0.83958668,  0.04965843, -0.43122089,  0.60366836],
       [-1.42041273,  0.41266751, -0.9637889 ,  0.47148076],
       [ 0.38529206,  0.83155384, -0.78103554, -1.11777487],
       [ 0.74723321,  0.2154795 , -0.53567181, -0.23179069]])
# Get dataframe column as NumPy presentation
import numpy as np
X = np.c_[df['A']]
X
array([[-0.39668765],
       [ 0.5256545 ],
       [ 1.46105881],
       [ 2.06909247],
       [ 1.45361793],
       [-1.23601035],
       [-0.83958668],
       [-1.42041273],
       [ 0.38529206],
       [ 0.74723321]])
# Shape of A are rows and columns of the dataframe
A.shape
(10, 4)
# Show a stat summary of the data
df.describe()
A B C D
count 10.000000 10.000000 10.000000 10.000000
mean -0.776800 0.823109 0.156050 0.289448
std 1.368299 0.955949 1.100099 1.360076
min -3.704010 -0.383277 -1.479013 -1.416606
25% -0.862804 -0.049376 -0.681005 -0.395471
50% -0.528841 0.883935 0.124519 0.092420
75% 0.156078 1.655364 0.851772 1.017481
max 0.526778 2.062913 1.709313 3.173983
# Show a summary of our dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
A    10 non-null float64
B    10 non-null float64
C    10 non-null float64
D    10 non-null float64
dtypes: float64(4)
memory usage: 720.0+ bytes
# Transpose data
df.T
a b c d e f g h i j
A 0.260968 -0.158592 -3.704010 -0.454166 -2.583424 -0.629530 -0.940561 0.526778 0.518058 -0.603516
B 1.462966 1.765306 0.346566 1.719496 -0.319739 1.421304 -0.176530 -0.383277 2.062913 0.332087
C 0.935928 0.153076 1.709313 0.095962 -0.212480 -0.837180 -1.086958 0.599304 1.682550 -1.479013
D -1.416606 -0.099588 3.173983 -1.357128 1.210537 0.438314 -0.494099 0.267912 -0.083072 1.254228
# Sort by values
df.sort_values(by='C', ascending=False)
A B C D
c -3.704010 0.346566 1.709313 3.173983
i 0.518058 2.062913 1.682550 -0.083072
a 0.260968 1.462966 0.935928 -1.416606
h 0.526778 -0.383277 0.599304 0.267912
b -0.158592 1.765306 0.153076 -0.099588
d -0.454166 1.719496 0.095962 -1.357128
e -2.583424 -0.319739 -0.212480 1.210537
f -0.629530 1.421304 -0.837180 0.438314
g -0.940561 -0.176530 -1.086958 -0.494099
j -0.603516 0.332087 -1.479013 1.254228

Selection

# Initial Dataframe
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
             columns=list('ABCD'))
df
A B C D
a -0.294648 0.023458 -0.735998 -0.552704
b -1.030628 1.069447 0.483863 0.723867
c 0.253603 0.066511 0.141998 0.406838
d -0.807581 -0.065567 -0.399008 0.321400
e 0.417448 -0.397863 1.852155 2.455886
f 0.287584 0.317224 0.767366 -0.053437
g 0.457348 -1.253723 2.869324 -0.327216
h -0.268315 -0.588351 1.476119 -0.224964
i 0.436500 1.178807 0.452377 0.083320
j -1.013040 -1.230992 -0.021071 3.256197
# Select a single column. Returns a Series
df['A']
a   -0.294648
b   -1.030628
c    0.253603
d   -0.807581
e    0.417448
f    0.287584
g    0.457348
h   -0.268315
i    0.436500
j   -1.013040
Name: A, dtype: float64
# Select a single column. Returns a Series
df.B
a    0.023458
b    1.069447
c    0.066511
d   -0.065567
e   -0.397863
f    0.317224
g   -1.253723
h   -0.588351
i    1.178807
j   -1.230992
Name: B, dtype: float64
# Select a slice of rows. Recall indexes start at 0
df[2:4]
A B C D
c 0.253603 0.066511 0.141998 0.406838
d -0.807581 -0.065567 -0.399008 0.321400
# Select on multi-axis
df.loc[:, ['A', 'D']]
A D
a -0.294648 -0.552704
b -1.030628 0.723867
c 0.253603 0.406838
d -0.807581 0.321400
e 0.417448 2.455886
f 0.287584 -0.053437
g 0.457348 -0.327216
h -0.268315 -0.224964
i 0.436500 0.083320
j -1.013040 3.256197
# Select a row on multi-axis. Returns a Series
df.loc['c', ['A', 'D']]
A    0.253603
D    0.406838
Name: c, dtype: float64
# Select on multi-axis
df.loc[['a', 'c'], ['A', 'D']]
A D
a -0.294648 -0.552704
c 0.253603 0.406838
# Get a scalar value
df.loc['c', 'C']
0.14199778789540501
# Get a scalar value using 'at'
df.at['c', 'C']
0.14199778789540501
# Set values on a dataframe
indices = list(['a', 'd', 'f'])
df.loc[indices, 'D'] = 100
df
A B C D
a -0.294648 0.023458 -0.735998 100.000000
b -1.030628 1.069447 0.483863 0.723867
c 0.253603 0.066511 0.141998 0.406838
d -0.807581 -0.065567 -0.399008 100.000000
e 0.417448 -0.397863 1.852155 2.455886
f 0.287584 0.317224 0.767366 100.000000
g 0.457348 -1.253723 2.869324 -0.327216
h -0.268315 -0.588351 1.476119 -0.224964
i 0.436500 1.178807 0.452377 0.083320
j -1.013040 -1.230992 -0.021071 3.256197
# Select by position
df.iloc[2]
A    0.253603
B    0.066511
C    0.141998
D    0.406838
Name: c, dtype: float64
# Select like slicing
df.iloc[2:4,1:3]
B C
c 0.066511 0.141998
d -0.065567 -0.399008
# Select specific locations
df.iloc[[1, 3],[0, 2]]
A C
b -1.030628 0.483863
d -0.807581 -0.399008
# Slice rows
df.iloc[4:7, :]
A B C D
e 0.417448 -0.397863 1.852155 2.455886
f 0.287584 0.317224 0.767366 100.000000
g 0.457348 -1.253723 2.869324 -0.327216
# Slice columns
df.iloc[:, 2:4]
C D
a -0.735998 100.000000
b 0.483863 0.723867
c 0.141998 0.406838
d -0.399008 100.000000
e 1.852155 2.455886
f 0.767366 100.000000
g 2.869324 -0.327216
h 1.476119 -0.224964
i 0.452377 0.083320
j -0.021071 3.256197
# Get a scalar value at specific position
df.iloc[2,3]
0.40683806053761956
# Get a scalar value at specific position using 'iat'
df.iat[2,3]
0.40683806053761956
# Boolean indexing

Common features

# Load dataset into a dataframe
import os
datapath = os.path.join("datasets", "islr", "")
auto = pd.read_csv(datapath + "dataset_filename.csv", 
                   delim_whitespace=True, na_values='?')

# Query NaN values
auto[auto.isnull().any(axis=1)]
mpg cylinders displacement horsepower weight acceleration year origin name
32 25.0 4 98.0 NaN 2046.0 19.0 71 1 ford pinto
126 21.0 6 200.0 NaN 2875.0 17.0 74 1 ford maverick
330 40.9 4 85.0 NaN 1835.0 17.3 80 2 renault lecar deluxe
336 23.6 4 140.0 NaN 2905.0 14.3 80 1 ford mustang cobra
354 34.5 4 100.0 NaN 2320.0 15.8 81 2 renault 18i
# Count values for a particular column
auto['year'].value_counts()
73    40
78    36
76    34
82    30
75    30
81    29
80    29
79    29
70    29
77    28
72    28
71    28
74    27
Name: year, dtype: int64
# Generate a correlation matrix
auto.corr()
mpg cylinders displacement horsepower weight acceleration year origin
mpg 1.000000 -0.776260 -0.804443 -0.778427 -0.831739 0.422297 0.581469 0.563698
cylinders -0.776260 1.000000 0.950920 0.842983 0.897017 -0.504061 -0.346717 -0.564972
displacement -0.804443 0.950920 1.000000 0.897257 0.933104 -0.544162 -0.369804 -0.610664
horsepower -0.778427 0.842983 0.897257 1.000000 0.864538 -0.689196 -0.416361 -0.455171
weight -0.831739 0.897017 0.933104 0.864538 1.000000 -0.419502 -0.307900 -0.581265
acceleration 0.422297 -0.504061 -0.544162 -0.689196 -0.419502 1.000000 0.282901 0.210084
year 0.581469 -0.346717 -0.369804 -0.416361 -0.307900 0.282901 1.000000 0.184314
origin 0.563698 -0.564972 -0.610664 -0.455171 -0.581265 0.210084 0.184314 1.000000
# Copy a dataframe
auto_copy = auto.copy()
# Clean data with NaN Values for a column
auto_copy = auto_copy.dropna(subset=['year'])
# Clean data with NaN Values for all dataframe
auto_copy = auto_copy.dropna()
# Drop a column
auto_copy = auto_copy.drop('name', axis=1)
auto_copy.head()
mpg cylinders displacement horsepower weight acceleration year origin
0 18.0 8 307.0 130.0 3504.0 12.0 70 1
1 15.0 8 350.0 165.0 3693.0 11.5 70 1
2 18.0 8 318.0 150.0 3436.0 11.0 70 1
3 16.0 8 304.0 150.0 3433.0 12.0 70 1
4 17.0 8 302.0 140.0 3449.0 10.5 70 1
# Make a list from dataframe column names
list(auto_copy)
['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'year',
 'origin']

References

http://pandas.pydata.org/pandas-docs/stable/index.html