Introduction to Pandas

Introduction to Pandas 27 Jan 2019

A brief introduction to pandas

# Import packages
import numpy as np
import pandas as pd

Series

“Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.).”

# Create a Series from a ndarray
s = pd.Series(np.arange(1,20,5))
# Indexes are created by default
s

   1
   6
  11
  16
dtype: int32

# Create a Series with indexes
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# Query the indexes of a Series
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

# Create a Series from a dict
d = {'k1': 'hello', 'k2': 10, 'k3': 19.8}
pd.Series(d)

k1    hello
k2       10
k3     19.8
dtype: object

# Create a Series from a dict
d = {'k1': 1.0, 'k2': 111, 'k3': 19.8}
pd.Series(d)

k1      1.0
k2    111.0
k3     19.8
dtype: float64

# Create a Series from a scalar value
s = pd.Series(10., index=['a', 'b', 'c'])
s

a    10.0
b    10.0
c    10.0
dtype: float64

# Query the data type
s.dtype

dtype('float64')

# Series to ndarray-like
s.values

array([10., 10., 10.])

# Access values by index values
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s['e'] 

0.22115728602833706

# Access and Set values by index values
s['a'] = 0
s

a    0.000000
b   -0.378787
c   -0.380565
d   -1.015425
e    0.221157
dtype: float64

# Query if index exists
'c' in s

True

# Sort values
s.sort_values(ascending=False)

e    0.221157
a    0.000000
b   -0.378787
c   -0.380565
d   -1.015425
dtype: float64

DataFrame

“DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects”

“Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments”

# Create a DataFrame from a dict of Series
d = {'first': pd.Series([10, 20, 30], index=['a', 'b', 'c']),
     'second': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

	first	second
a	10.0	1
b	20.0	2
c	30.0	3
d	NaN	4

# Query the indexes
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

# Query the columns
df.columns

Index(['first', 'second'], dtype='object')

# Query the datatypes
df.dtypes

first     float64
second      int64
dtype: object

# Create a dataframe picking some indexes
pd.DataFrame(d, index=['c', 'b', 'd'])

	first	second
c	30.0	3
b	20.0	2
d	NaN	4

# Create a dataframe picking some columns too
pd.DataFrame(d, index=['c', 'b', 'd'], columns=['second', 'third'])

	second	third
c	3	NaN
b	2	NaN
d	4	NaN

# Create a dataframe from a dict of ndarrays/list
d = {'first': [10, 20, 30, 40], 'second': [1.1, 1.2, 1.3, 1.4]}
df = pd.DataFrame(d)
df

	first	second
0	10	1.1
1	20	1.2
2	30	1.3
3	40	1.4

# Create dataframe defining indexes
pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])

	first	second
a	10	1.1
b	20	1.2
c	30	1.3
d	40	1.4

# Create a dataframe from a NumPy array
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
             columns=list('ABCD'))
df

	A	B	C	D
a	-0.396688	0.311491	-0.333861	0.168932
b	0.525654	0.834869	1.636704	0.393203
c	1.461059	1.011699	0.089861	-1.813207
d	2.069092	1.195152	-1.061165	-0.777508
e	1.453618	-0.481199	0.995436	0.367830
f	-1.236010	-1.037308	0.981124	-0.903936
g	-0.839587	0.049658	-0.431221	0.603668
h	-1.420413	0.412668	-0.963789	0.471481
i	0.385292	0.831554	-0.781036	-1.117775
j	0.747233	0.215479	-0.535672	-0.231791

# View first rows
df.head()

	A	B	C	D
a	0.260968	1.462966	0.935928	-1.416606
b	-0.158592	1.765306	0.153076	-0.099588
c	-3.704010	0.346566	1.709313	3.173983
d	-0.454166	1.719496	0.095962	-1.357128
e	-2.583424	-0.319739	-0.212480	1.210537

# View last rows
df.tail(3)

	A	B	C	D
h	0.526778	-0.383277	0.599304	0.267912
i	0.518058	2.062913	1.682550	-0.083072
j	-0.603516	0.332087	-1.479013	1.254228

# Return a numpy representation
A = df.values
A

array([[-0.39668765,  0.31149099, -0.33386089,  0.16893183],
       [ 0.5256545 ,  0.83486865,  1.63670404,  0.39320261],
       [ 1.46105881,  1.01169944,  0.08986103, -1.81320698],
       [ 2.06909247,  1.19515187, -1.06116542, -0.77750805],
       [ 1.45361793, -0.48119892,  0.9954361 ,  0.36783024],
       [-1.23601035, -1.03730773,  0.98112443, -0.90393631],
       [-0.83958668,  0.04965843, -0.43122089,  0.60366836],
       [-1.42041273,  0.41266751, -0.9637889 ,  0.47148076],
       [ 0.38529206,  0.83155384, -0.78103554, -1.11777487],
       [ 0.74723321,  0.2154795 , -0.53567181, -0.23179069]])

# Get dataframe column as NumPy presentation
import numpy as np
X = np.c_[df['A']]
X

array([[-0.39668765],
       [ 0.5256545 ],
       [ 1.46105881],
       [ 2.06909247],
       [ 1.45361793],
       [-1.23601035],
       [-0.83958668],
       [-1.42041273],
       [ 0.38529206],
       [ 0.74723321]])

# Shape of A are rows and columns of the dataframe
A.shape

(10, 4)

# Show a stat summary of the data
df.describe()

	A	B	C	D
count	10.000000	10.000000	10.000000	10.000000
mean	-0.776800	0.823109	0.156050	0.289448
std	1.368299	0.955949	1.100099	1.360076
min	-3.704010	-0.383277	-1.479013	-1.416606
25%	-0.862804	-0.049376	-0.681005	-0.395471
50%	-0.528841	0.883935	0.124519	0.092420
75%	0.156078	1.655364	0.851772	1.017481
max	0.526778	2.062913	1.709313	3.173983

# Show a summary of our dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
A    10 non-null float64
B    10 non-null float64
C    10 non-null float64
D    10 non-null float64
dtypes: float64(4)
memory usage: 720.0+ bytes

# Transpose data
df.T

	a	b	c	d	e	f	g	h	i	j
A	0.260968	-0.158592	-3.704010	-0.454166	-2.583424	-0.629530	-0.940561	0.526778	0.518058	-0.603516
B	1.462966	1.765306	0.346566	1.719496	-0.319739	1.421304	-0.176530	-0.383277	2.062913	0.332087
C	0.935928	0.153076	1.709313	0.095962	-0.212480	-0.837180	-1.086958	0.599304	1.682550	-1.479013
D	-1.416606	-0.099588	3.173983	-1.357128	1.210537	0.438314	-0.494099	0.267912	-0.083072	1.254228

# Sort by values
df.sort_values(by='C', ascending=False)

	A	B	C	D
c	-3.704010	0.346566	1.709313	3.173983
i	0.518058	2.062913	1.682550	-0.083072
a	0.260968	1.462966	0.935928	-1.416606
h	0.526778	-0.383277	0.599304	0.267912
b	-0.158592	1.765306	0.153076	-0.099588
d	-0.454166	1.719496	0.095962	-1.357128
e	-2.583424	-0.319739	-0.212480	1.210537
f	-0.629530	1.421304	-0.837180	0.438314
g	-0.940561	-0.176530	-1.086958	-0.494099
j	-0.603516	0.332087	-1.479013	1.254228

Selection

# Initial Dataframe
df = pd.DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
             columns=list('ABCD'))
df

	A	B	C	D
a	-0.294648	0.023458	-0.735998	-0.552704
b	-1.030628	1.069447	0.483863	0.723867
c	0.253603	0.066511	0.141998	0.406838
d	-0.807581	-0.065567	-0.399008	0.321400
e	0.417448	-0.397863	1.852155	2.455886
f	0.287584	0.317224	0.767366	-0.053437
g	0.457348	-1.253723	2.869324	-0.327216
h	-0.268315	-0.588351	1.476119	-0.224964
i	0.436500	1.178807	0.452377	0.083320
j	-1.013040	-1.230992	-0.021071	3.256197

# Select a single column. Returns a Series
df['A']

a   -0.294648
b   -1.030628
c    0.253603
d   -0.807581
e    0.417448
f    0.287584
g    0.457348
h   -0.268315
i    0.436500
j   -1.013040
Name: A, dtype: float64

# Select a single column. Returns a Series
df.B

a    0.023458
b    1.069447
c    0.066511
d   -0.065567
e   -0.397863
f    0.317224
g   -1.253723
h   -0.588351
i    1.178807
j   -1.230992
Name: B, dtype: float64

# Select a slice of rows. Recall indexes start at 0
df[2:4]

	A	B	C	D
c	0.253603	0.066511	0.141998	0.406838
d	-0.807581	-0.065567	-0.399008	0.321400

# Select on multi-axis
df.loc[:, ['A', 'D']]

	A	D
a	-0.294648	-0.552704
b	-1.030628	0.723867
c	0.253603	0.406838
d	-0.807581	0.321400
e	0.417448	2.455886
f	0.287584	-0.053437
g	0.457348	-0.327216
h	-0.268315	-0.224964
i	0.436500	0.083320
j	-1.013040	3.256197

# Select a row on multi-axis. Returns a Series
df.loc['c', ['A', 'D']]

A    0.253603
D    0.406838
Name: c, dtype: float64

# Select on multi-axis
df.loc[['a', 'c'], ['A', 'D']]

	A	D
a	-0.294648	-0.552704
c	0.253603	0.406838

# Get a scalar value
df.loc['c', 'C']

0.14199778789540501

# Get a scalar value using 'at'
df.at['c', 'C']

0.14199778789540501

# Set values on a dataframe
indices = list(['a', 'd', 'f'])
df.loc[indices, 'D'] = 100
df

	A	B	C	D
a	-0.294648	0.023458	-0.735998	100.000000
b	-1.030628	1.069447	0.483863	0.723867
c	0.253603	0.066511	0.141998	0.406838
d	-0.807581	-0.065567	-0.399008	100.000000
e	0.417448	-0.397863	1.852155	2.455886
f	0.287584	0.317224	0.767366	100.000000
g	0.457348	-1.253723	2.869324	-0.327216
h	-0.268315	-0.588351	1.476119	-0.224964
i	0.436500	1.178807	0.452377	0.083320
j	-1.013040	-1.230992	-0.021071	3.256197

# Select by position
df.iloc[2]

A    0.253603
B    0.066511
C    0.141998
D    0.406838
Name: c, dtype: float64

# Select like slicing
df.iloc[2:4,1:3]

	B	C
c	0.066511	0.141998
d	-0.065567	-0.399008

# Select specific locations
df.iloc[[1, 3],[0, 2]]

	A	C
b	-1.030628	0.483863
d	-0.807581	-0.399008

# Slice rows
df.iloc[4:7, :]

	A	B	C	D
e	0.417448	-0.397863	1.852155	2.455886
f	0.287584	0.317224	0.767366	100.000000
g	0.457348	-1.253723	2.869324	-0.327216

# Slice columns
df.iloc[:, 2:4]

	C	D
a	-0.735998	100.000000
b	0.483863	0.723867
c	0.141998	0.406838
d	-0.399008	100.000000
e	1.852155	2.455886
f	0.767366	100.000000
g	2.869324	-0.327216
h	1.476119	-0.224964
i	0.452377	0.083320
j	-0.021071	3.256197

# Get a scalar value at specific position
df.iloc[2,3]

0.40683806053761956

# Get a scalar value at specific position using 'iat'
df.iat[2,3]

0.40683806053761956

# Boolean indexing

Common features

# Load dataset into a dataframe
import os
datapath = os.path.join("datasets", "islr", "")
auto = pd.read_csv(datapath + "dataset_filename.csv", 
                   delim_whitespace=True, na_values='?')

# Query NaN values
auto[auto.isnull().any(axis=1)]

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
32	25.0	4	98.0	NaN	2046.0	19.0	71	1	ford pinto
126	21.0	6	200.0	NaN	2875.0	17.0	74	1	ford maverick
330	40.9	4	85.0	NaN	1835.0	17.3	80	2	renault lecar deluxe
336	23.6	4	140.0	NaN	2905.0	14.3	80	1	ford mustang cobra
354	34.5	4	100.0	NaN	2320.0	15.8	81	2	renault 18i

# Count values for a particular column
auto['year'].value_counts()

  40
  36
  34
  30
  30
  29
  29
  29
  29
  28
  28
  28
  27
Name: year, dtype: int64

# Generate a correlation matrix
auto.corr()

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
mpg	1.000000	-0.776260	-0.804443	-0.778427	-0.831739	0.422297	0.581469	0.563698
cylinders	-0.776260	1.000000	0.950920	0.842983	0.897017	-0.504061	-0.346717	-0.564972
displacement	-0.804443	0.950920	1.000000	0.897257	0.933104	-0.544162	-0.369804	-0.610664
horsepower	-0.778427	0.842983	0.897257	1.000000	0.864538	-0.689196	-0.416361	-0.455171
weight	-0.831739	0.897017	0.933104	0.864538	1.000000	-0.419502	-0.307900	-0.581265
acceleration	0.422297	-0.504061	-0.544162	-0.689196	-0.419502	1.000000	0.282901	0.210084
year	0.581469	-0.346717	-0.369804	-0.416361	-0.307900	0.282901	1.000000	0.184314
origin	0.563698	-0.564972	-0.610664	-0.455171	-0.581265	0.210084	0.184314	1.000000

# Copy a dataframe
auto_copy = auto.copy()

# Clean data with NaN Values for a column
auto_copy = auto_copy.dropna(subset=['year'])

# Clean data with NaN Values for all dataframe
auto_copy = auto_copy.dropna()

# Drop a column
auto_copy = auto_copy.drop('name', axis=1)
auto_copy.head()

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
0	18.0	8	307.0	130.0	3504.0	12.0	70	1
1	15.0	8	350.0	165.0	3693.0	11.5	70	1
2	18.0	8	318.0	150.0	3436.0	11.0	70	1
3	16.0	8	304.0	150.0	3433.0	12.0	70	1
4	17.0	8	302.0	140.0	3449.0	10.5	70	1

# Make a list from dataframe column names
list(auto_copy)

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'year',
 'origin']

References

http://pandas.pydata.org/pandas-docs/stable/index.html