Introduction to Pandas 27 Jan 2019
A brief introduction to pandas
# Import packages
import numpy as np
import pandas as pd
Series
“Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).”
# Create a Series from a ndarray
s = pd.Series(np.arange(1,20,5))
# Indexes are created by default
s
0 1
1 6
2 11
3 16
dtype: int32
# Create a Series with indexes
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Query the indexes of a Series
s.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
# Create a Series from a dict
d = {'k1': 'hello', 'k2': 10, 'k3': 19.8}
pd.Series(d)
k1 hello
k2 10
k3 19.8
dtype: object
# Create a Series from a dict
d = {'k1': 1.0, 'k2': 111, 'k3': 19.8}
pd.Series(d)
k1 1.0
k2 111.0
k3 19.8
dtype: float64
# Create a Series from a scalar value
s = pd.Series(10., index=['a', 'b', 'c'])
s
a 10.0
b 10.0
c 10.0
dtype: float64
# Query the data type
s.dtype
dtype('float64')
# Series to ndarray-like
s.values
array([10., 10., 10.])
# Access values by index values
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s['e']
0.22115728602833706
# Access and Set values by index values
s['a'] = 0
s
a 0.000000
b -0.378787
c -0.380565
d -1.015425
e 0.221157
dtype: float64
# Query if index exists
'c' in s
True
# Sort values
s.sort_values(ascending=False)
e 0.221157
a 0.000000
b -0.378787
c -0.380565
d -1.015425
dtype: float64
DataFrame
“DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects”
“Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments”
# Create a DataFrame from a dict of Series
d = {'first': pd.Series([10, 20, 30], index=['a', 'b', 'c']),
'second': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df
first | second | |
---|---|---|
a | 10.0 | 1 |
b | 20.0 | 2 |
c | 30.0 | 3 |
d | NaN | 4 |
# Query the indexes
df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
# Query the columns
df.columns
Index(['first', 'second'], dtype='object')
# Query the datatypes
df.dtypes
first float64
second int64
dtype: object
# Create a dataframe picking some indexes
pd.DataFrame(d, index=['c', 'b', 'd'])
first | second | |
---|---|---|
c | 30.0 | 3 |
b | 20.0 | 2 |
d | NaN | 4 |
# Create a dataframe picking some columns too
pd.DataFrame(d, index=['c', 'b', 'd'], columns=['second', 'third'])
second | third | |
---|---|---|
c | 3 | NaN |
b | 2 | NaN |
d | 4 | NaN |
# Create a dataframe from a dict of ndarrays/list
d = {'first': [10, 20, 30, 40], 'second': [1.1, 1.2, 1.3, 1.4]}
df = pd.DataFrame(d)
df
first | second | |
---|---|---|
0 | 10 | 1.1 |
1 | 20 | 1.2 |
2 | 30 | 1.3 |
3 | 40 | 1.4 |
# Create dataframe defining indexes
pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])
first | second | |
---|---|---|
a | 10 | 1.1 |
b | 20 | 1.2 |
c | 30 | 1.3 |
d | 40 | 1.4 |
# Create a dataframe from a NumPy array
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
a | -0.396688 | 0.311491 | -0.333861 | 0.168932 |
b | 0.525654 | 0.834869 | 1.636704 | 0.393203 |
c | 1.461059 | 1.011699 | 0.089861 | -1.813207 |
d | 2.069092 | 1.195152 | -1.061165 | -0.777508 |
e | 1.453618 | -0.481199 | 0.995436 | 0.367830 |
f | -1.236010 | -1.037308 | 0.981124 | -0.903936 |
g | -0.839587 | 0.049658 | -0.431221 | 0.603668 |
h | -1.420413 | 0.412668 | -0.963789 | 0.471481 |
i | 0.385292 | 0.831554 | -0.781036 | -1.117775 |
j | 0.747233 | 0.215479 | -0.535672 | -0.231791 |
# View first rows
df.head()
A | B | C | D | |
---|---|---|---|---|
a | 0.260968 | 1.462966 | 0.935928 | -1.416606 |
b | -0.158592 | 1.765306 | 0.153076 | -0.099588 |
c | -3.704010 | 0.346566 | 1.709313 | 3.173983 |
d | -0.454166 | 1.719496 | 0.095962 | -1.357128 |
e | -2.583424 | -0.319739 | -0.212480 | 1.210537 |
# View last rows
df.tail(3)
A | B | C | D | |
---|---|---|---|---|
h | 0.526778 | -0.383277 | 0.599304 | 0.267912 |
i | 0.518058 | 2.062913 | 1.682550 | -0.083072 |
j | -0.603516 | 0.332087 | -1.479013 | 1.254228 |
# Return a numpy representation
A = df.values
A
array([[-0.39668765, 0.31149099, -0.33386089, 0.16893183],
[ 0.5256545 , 0.83486865, 1.63670404, 0.39320261],
[ 1.46105881, 1.01169944, 0.08986103, -1.81320698],
[ 2.06909247, 1.19515187, -1.06116542, -0.77750805],
[ 1.45361793, -0.48119892, 0.9954361 , 0.36783024],
[-1.23601035, -1.03730773, 0.98112443, -0.90393631],
[-0.83958668, 0.04965843, -0.43122089, 0.60366836],
[-1.42041273, 0.41266751, -0.9637889 , 0.47148076],
[ 0.38529206, 0.83155384, -0.78103554, -1.11777487],
[ 0.74723321, 0.2154795 , -0.53567181, -0.23179069]])
# Get dataframe column as NumPy presentation
import numpy as np
X = np.c_[df['A']]
X
array([[-0.39668765],
[ 0.5256545 ],
[ 1.46105881],
[ 2.06909247],
[ 1.45361793],
[-1.23601035],
[-0.83958668],
[-1.42041273],
[ 0.38529206],
[ 0.74723321]])
# Shape of A are rows and columns of the dataframe
A.shape
(10, 4)
# Show a stat summary of the data
df.describe()
A | B | C | D | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | -0.776800 | 0.823109 | 0.156050 | 0.289448 |
std | 1.368299 | 0.955949 | 1.100099 | 1.360076 |
min | -3.704010 | -0.383277 | -1.479013 | -1.416606 |
25% | -0.862804 | -0.049376 | -0.681005 | -0.395471 |
50% | -0.528841 | 0.883935 | 0.124519 | 0.092420 |
75% | 0.156078 | 1.655364 | 0.851772 | 1.017481 |
max | 0.526778 | 2.062913 | 1.709313 | 3.173983 |
# Show a summary of our dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
A 10 non-null float64
B 10 non-null float64
C 10 non-null float64
D 10 non-null float64
dtypes: float64(4)
memory usage: 720.0+ bytes
# Transpose data
df.T
a | b | c | d | e | f | g | h | i | j | |
---|---|---|---|---|---|---|---|---|---|---|
A | 0.260968 | -0.158592 | -3.704010 | -0.454166 | -2.583424 | -0.629530 | -0.940561 | 0.526778 | 0.518058 | -0.603516 |
B | 1.462966 | 1.765306 | 0.346566 | 1.719496 | -0.319739 | 1.421304 | -0.176530 | -0.383277 | 2.062913 | 0.332087 |
C | 0.935928 | 0.153076 | 1.709313 | 0.095962 | -0.212480 | -0.837180 | -1.086958 | 0.599304 | 1.682550 | -1.479013 |
D | -1.416606 | -0.099588 | 3.173983 | -1.357128 | 1.210537 | 0.438314 | -0.494099 | 0.267912 | -0.083072 | 1.254228 |
# Sort by values
df.sort_values(by='C', ascending=False)
A | B | C | D | |
---|---|---|---|---|
c | -3.704010 | 0.346566 | 1.709313 | 3.173983 |
i | 0.518058 | 2.062913 | 1.682550 | -0.083072 |
a | 0.260968 | 1.462966 | 0.935928 | -1.416606 |
h | 0.526778 | -0.383277 | 0.599304 | 0.267912 |
b | -0.158592 | 1.765306 | 0.153076 | -0.099588 |
d | -0.454166 | 1.719496 | 0.095962 | -1.357128 |
e | -2.583424 | -0.319739 | -0.212480 | 1.210537 |
f | -0.629530 | 1.421304 | -0.837180 | 0.438314 |
g | -0.940561 | -0.176530 | -1.086958 | -0.494099 |
j | -0.603516 | 0.332087 | -1.479013 | 1.254228 |
Selection
# Initial Dataframe
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
a | -0.294648 | 0.023458 | -0.735998 | -0.552704 |
b | -1.030628 | 1.069447 | 0.483863 | 0.723867 |
c | 0.253603 | 0.066511 | 0.141998 | 0.406838 |
d | -0.807581 | -0.065567 | -0.399008 | 0.321400 |
e | 0.417448 | -0.397863 | 1.852155 | 2.455886 |
f | 0.287584 | 0.317224 | 0.767366 | -0.053437 |
g | 0.457348 | -1.253723 | 2.869324 | -0.327216 |
h | -0.268315 | -0.588351 | 1.476119 | -0.224964 |
i | 0.436500 | 1.178807 | 0.452377 | 0.083320 |
j | -1.013040 | -1.230992 | -0.021071 | 3.256197 |
# Select a single column. Returns a Series
df['A']
a -0.294648
b -1.030628
c 0.253603
d -0.807581
e 0.417448
f 0.287584
g 0.457348
h -0.268315
i 0.436500
j -1.013040
Name: A, dtype: float64
# Select a single column. Returns a Series
df.B
a 0.023458
b 1.069447
c 0.066511
d -0.065567
e -0.397863
f 0.317224
g -1.253723
h -0.588351
i 1.178807
j -1.230992
Name: B, dtype: float64
# Select a slice of rows. Recall indexes start at 0
df[2:4]
A | B | C | D | |
---|---|---|---|---|
c | 0.253603 | 0.066511 | 0.141998 | 0.406838 |
d | -0.807581 | -0.065567 | -0.399008 | 0.321400 |
# Select on multi-axis
df.loc[:, ['A', 'D']]
A | D | |
---|---|---|
a | -0.294648 | -0.552704 |
b | -1.030628 | 0.723867 |
c | 0.253603 | 0.406838 |
d | -0.807581 | 0.321400 |
e | 0.417448 | 2.455886 |
f | 0.287584 | -0.053437 |
g | 0.457348 | -0.327216 |
h | -0.268315 | -0.224964 |
i | 0.436500 | 0.083320 |
j | -1.013040 | 3.256197 |
# Select a row on multi-axis. Returns a Series
df.loc['c', ['A', 'D']]
A 0.253603
D 0.406838
Name: c, dtype: float64
# Select on multi-axis
df.loc[['a', 'c'], ['A', 'D']]
A | D | |
---|---|---|
a | -0.294648 | -0.552704 |
c | 0.253603 | 0.406838 |
# Get a scalar value
df.loc['c', 'C']
0.14199778789540501
# Get a scalar value using 'at'
df.at['c', 'C']
0.14199778789540501
# Set values on a dataframe
indices = list(['a', 'd', 'f'])
df.loc[indices, 'D'] = 100
df
A | B | C | D | |
---|---|---|---|---|
a | -0.294648 | 0.023458 | -0.735998 | 100.000000 |
b | -1.030628 | 1.069447 | 0.483863 | 0.723867 |
c | 0.253603 | 0.066511 | 0.141998 | 0.406838 |
d | -0.807581 | -0.065567 | -0.399008 | 100.000000 |
e | 0.417448 | -0.397863 | 1.852155 | 2.455886 |
f | 0.287584 | 0.317224 | 0.767366 | 100.000000 |
g | 0.457348 | -1.253723 | 2.869324 | -0.327216 |
h | -0.268315 | -0.588351 | 1.476119 | -0.224964 |
i | 0.436500 | 1.178807 | 0.452377 | 0.083320 |
j | -1.013040 | -1.230992 | -0.021071 | 3.256197 |
# Select by position
df.iloc[2]
A 0.253603
B 0.066511
C 0.141998
D 0.406838
Name: c, dtype: float64
# Select like slicing
df.iloc[2:4,1:3]
B | C | |
---|---|---|
c | 0.066511 | 0.141998 |
d | -0.065567 | -0.399008 |
# Select specific locations
df.iloc[[1, 3],[0, 2]]
A | C | |
---|---|---|
b | -1.030628 | 0.483863 |
d | -0.807581 | -0.399008 |
# Slice rows
df.iloc[4:7, :]
A | B | C | D | |
---|---|---|---|---|
e | 0.417448 | -0.397863 | 1.852155 | 2.455886 |
f | 0.287584 | 0.317224 | 0.767366 | 100.000000 |
g | 0.457348 | -1.253723 | 2.869324 | -0.327216 |
# Slice columns
df.iloc[:, 2:4]
C | D | |
---|---|---|
a | -0.735998 | 100.000000 |
b | 0.483863 | 0.723867 |
c | 0.141998 | 0.406838 |
d | -0.399008 | 100.000000 |
e | 1.852155 | 2.455886 |
f | 0.767366 | 100.000000 |
g | 2.869324 | -0.327216 |
h | 1.476119 | -0.224964 |
i | 0.452377 | 0.083320 |
j | -0.021071 | 3.256197 |
# Get a scalar value at specific position
df.iloc[2,3]
0.40683806053761956
# Get a scalar value at specific position using 'iat'
df.iat[2,3]
0.40683806053761956
# Boolean indexing
Common features
# Load dataset into a dataframe
import os
datapath = os.path.join("datasets", "islr", "")
auto = pd.read_csv(datapath + "dataset_filename.csv",
delim_whitespace=True, na_values='?')
# Query NaN values
auto[auto.isnull().any(axis=1)]
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
32 | 25.0 | 4 | 98.0 | NaN | 2046.0 | 19.0 | 71 | 1 | ford pinto |
126 | 21.0 | 6 | 200.0 | NaN | 2875.0 | 17.0 | 74 | 1 | ford maverick |
330 | 40.9 | 4 | 85.0 | NaN | 1835.0 | 17.3 | 80 | 2 | renault lecar deluxe |
336 | 23.6 | 4 | 140.0 | NaN | 2905.0 | 14.3 | 80 | 1 | ford mustang cobra |
354 | 34.5 | 4 | 100.0 | NaN | 2320.0 | 15.8 | 81 | 2 | renault 18i |
# Count values for a particular column
auto['year'].value_counts()
73 40
78 36
76 34
82 30
75 30
81 29
80 29
79 29
70 29
77 28
72 28
71 28
74 27
Name: year, dtype: int64
# Generate a correlation matrix
auto.corr()
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
mpg | 1.000000 | -0.776260 | -0.804443 | -0.778427 | -0.831739 | 0.422297 | 0.581469 | 0.563698 |
cylinders | -0.776260 | 1.000000 | 0.950920 | 0.842983 | 0.897017 | -0.504061 | -0.346717 | -0.564972 |
displacement | -0.804443 | 0.950920 | 1.000000 | 0.897257 | 0.933104 | -0.544162 | -0.369804 | -0.610664 |
horsepower | -0.778427 | 0.842983 | 0.897257 | 1.000000 | 0.864538 | -0.689196 | -0.416361 | -0.455171 |
weight | -0.831739 | 0.897017 | 0.933104 | 0.864538 | 1.000000 | -0.419502 | -0.307900 | -0.581265 |
acceleration | 0.422297 | -0.504061 | -0.544162 | -0.689196 | -0.419502 | 1.000000 | 0.282901 | 0.210084 |
year | 0.581469 | -0.346717 | -0.369804 | -0.416361 | -0.307900 | 0.282901 | 1.000000 | 0.184314 |
origin | 0.563698 | -0.564972 | -0.610664 | -0.455171 | -0.581265 | 0.210084 | 0.184314 | 1.000000 |
# Copy a dataframe
auto_copy = auto.copy()
# Clean data with NaN Values for a column
auto_copy = auto_copy.dropna(subset=['year'])
# Clean data with NaN Values for all dataframe
auto_copy = auto_copy.dropna()
# Drop a column
auto_copy = auto_copy.drop('name', axis=1)
auto_copy.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | 1 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | 1 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436.0 | 11.0 | 70 | 1 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433.0 | 12.0 | 70 | 1 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449.0 | 10.5 | 70 | 1 |
# Make a list from dataframe column names
list(auto_copy)
['mpg',
'cylinders',
'displacement',
'horsepower',
'weight',
'acceleration',
'year',
'origin']
References
http://pandas.pydata.org/pandas-docs/stable/index.html