3. Data with the Series
Representing Univariate Data with the Series
Configuring pandas
# import numpy and pandas
import numpy as np
import pandas as pd
# used for dates
import datetime
from datetime import datetime, date
# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 80)
# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline
Creating a Series using
Python lists and dictionaries
# create a series of multiple values from a list
s = pd.Series([10, 11, 12, 13, 14])
s
0 10
1 11
2 12
3 13
4 14
dtype: int64
# value stored at index label 3
s[3]
13
# create a Series of alphas
pd.Series(['Mike', 'Marcia', 'Mikael', 'Bleu'])
0 Mike
1 Marcia
2 Mikael
3 Bleu
dtype: object
# a sequence of 5 values, all 2
pd.Series([2]*5)
0 2
1 2
2 2
3 2
4 2
dtype: int64
# use each character as a value
pd.Series(list('abcde'))
0 a
1 b
2 c
3 d
4 e
dtype: object
# create Series from dict
pd.Series({'Mike': 'Dad',
'Marcia': 'Mom',
'Mikael': 'Son',
'Bleu': 'Best doggie ever' })
Bleu Best doggie ever
Marcia Mom
Mikael Son
Mike Dad
dtype: object
Creation using NumPy functions
# 4 through 8
pd.Series(np.arange(4, 9))
0 4
1 5
2 6
3 7
4 8
dtype: int64
# 0 through 9
pd.Series(np.linspace(0, 9, 5))
0 0.00
1 2.25
2 4.50
3 6.75
4 9.00
dtype: float64
# random numbers
np.random.seed(12345) # always generate the same values
# 5 normally random numbers
pd.Series(np.random.normal(size=5))
0 -0.204708
1 0.478943
2 -0.519439
3 -0.555730
4 1.965781
dtype: float64
Creation using a scalar value
# create a one item Series
s = pd.Series(2)
s
0 2
dtype: int64
# create the Series
s = pd.Series(np.arange(0, 5))
# multiple all values by 2
s * 2
0 0
1 2
2 4
3 6
4 8
dtype: int64
The .index and .values properties
# get the values in the Series
s = pd.Series([1, 2, 3])
s.values
array([1, 2, 3])
# show that this is a numpy array
type(s.values)
numpy.ndarray
# get the index of the Series
s.index
RangeIndex(start=0, stop=3, step=1)
The size and shape of a Series
# example series
s = pd.Series([0, 1, 2, 3])
len(s)
4
# .size is also the # of items in the Series
s.size
4
# .shape is a tuple with one value
s.shape
(4,)
Specifying an index at creation
# explicitly create an index
labels = ['Mike', 'Marcia', 'Mikael', 'Bleu']
role = ['Dad', 'Mom', 'Son', 'Dog']
s = pd.Series(labels, index=role)
s
Dad Mike
Mom Marcia
Son Mikael
Dog Bleu
dtype: object
# examine the index
s.index
Index(['Dad', 'Mom', 'Son', 'Dog'], dtype='object')
# who is the Dad?
s['Dad']
'Mike'
Heads, tails and takes
# a ten item Series
s = pd.Series(np.arange(1, 10),
index=list('abcdefghi'))
# show the first five
s.head()
a 1
b 2
c 3
d 4
e 5
dtype: int64
# the first three
s.head(n = 3) # s.head(3) is equivalent
a 1
b 2
c 3
dtype: int64
# the last five
s.tail()
e 5
f 6
g 7
h 8
i 9
dtype: int64
# the last 3
s.tail(n = 3) # equivalent to s.tail(3)
g 7
h 8
i 9
dtype: int64
# only take specific items by position
s.take([1, 5, 8])
b 2
f 6
i 9
dtype: int64
Lookup by label using the [] and .ix[] operators
# we will use this series to examine lookups
s1 = pd.Series(np.arange(10, 15), index=list('abcde'))
s1
a 10
b 11
c 12
d 13
e 14
dtype: int64
# get the value with label 'a'
s1['a']
10
# get multiple items
s1[['d', 'b']]
d 13
b 11
dtype: int64
# gets values based upon position
s1[[3, 1]]
d 13
b 11
dtype: int64
# to demo lookup by matching labels as integer values
s2 = pd.Series([1, 2, 3, 4], index=[10, 11, 12, 13])
s2
10 1
11 2
12 3
13 4
dtype: int64
# this is by label not position
s2[[13, 10]]
13 4
10 1
dtype: int64
Explicit position lookup with .iloc[]
# explicitly by position
s1.iloc[[0, 2]]
a 10
c 12
dtype: int64
# explicitly by position
s2.iloc[[3, 2]]
13 4
12 3
dtype: int64
Explicit label lookup with .loc[]
# explicit via labels
s1.loc[['a', 'd']]
a 10
d 13
dtype: int64
# get items at position 11 an d12
s2.loc[[11, 12]]
11 2
12 3
dtype: int64
# -1 and 15 will be NaN
s1.loc[['a', 'f']]
a 10.0
f NaN
dtype: float64
Slicing a Series into subsets
# a Series to use for slicing
# using index labels not starting at 0 to demonstrate
# position based slicing
s = pd.Series(np.arange(100, 110), index=np.arange(10, 20))
s
10 100
11 101
12 102
13 103
14 104
15 105
16 106
17 107
18 108
19 109
dtype: int64
# slice showing items at position 1 thorugh 5
s[1:6]
11 101
12 102
13 103
14 104
15 105
dtype: int64
# lookup via list of positions
s.iloc[[1, 2, 3, 4, 5]]
11 101
12 102
13 103
14 104
15 105
dtype: int64
# items at position 1, 3, 5
s[1:6:2]
11 101
13 103
15 105
dtype: int64
# first five by slicing, same as .head(5)
s[:5]
10 100
11 101
12 102
13 103
14 104
dtype: int64
# fourth position to the end
s[4:]
14 104
15 105
16 106
17 107
18 108
19 109
dtype: int64
# every other item in the first five positions
s[:5:2]
10 100
12 102
14 104
dtype: int64
# every other item starting at the fourth position
s[4::2]
14 104
16 106
18 108
dtype: int64
# reverse the Series
s[::-1]
19 109
18 108
17 107
16 106
15 105
14 104
13 103
12 102
11 101
10 100
dtype: int64
# every other starting at position 4, in reverse
s[4::-2]
14 104
12 102
10 100
dtype: int64
# -4:, which means the last 4 rows
s[-4:]
16 106
17 107
18 108
19 109
dtype: int64
# :-4, all but the last 4
s[:-4]
10 100
11 101
12 102
13 103
14 104
15 105
dtype: int64
# equivalent to s.tail(4).head(3)
s[-4:-1]
16 106
17 107
18 108
dtype: int64
# used to demonstrate the next two slices
s = pd.Series(np.arange(0, 5),
index=['a', 'b', 'c', 'd', 'e'])
s
a 0
b 1
c 2
d 3
e 4
dtype: int64
# slices by position as the index is characters
s[1:3]
b 1
c 2
dtype: int64
# this slices by the strings in the index
s['b':'d']
b 1
c 2
d 3
dtype: int64
Alignment via index labels
# First series for alignment
s1 = pd.Series([1, 2], index=['a', 'b'])
s1
a 1
b 2
dtype: int64
# Second series for alignment
s2 = pd.Series([4, 3], index=['b', 'a'])
s2
b 4
a 3
dtype: int64
# add them
s1 + s2
a 4
b 6
dtype: int64
# multiply all values in s3 by 2
s1 * 2
a 2
b 4
dtype: int64
# scalar series using s3's index
t = pd.Series(2, s1.index)
t
a 2
b 2
dtype: int64
# multiply s1 by t
s1 * t
a 2
b 4
dtype: int64
# we will add this to s1
s3 = pd.Series([5, 6], index=['b', 'c'])
s3
b 5
c 6
dtype: int64
# s1 and s3 have different sets of index labels
# NaN will result for a and c
s1 + s3
a NaN
b 7.0
c NaN
dtype: float64
# 2 'a' labels
s1 = pd.Series([1.0, 2.0, 3.0], index=['a', 'a', 'b'])
s1
a 1.0
a 2.0
b 3.0
dtype: float64
# 3 a labels
s2 = pd.Series([4.0, 5.0, 6.0, 7.0], index=['a', 'a', 'c', 'a'])
s2
a 4.0
a 5.0
c 6.0
a 7.0
dtype: float64
# will result in 6 'a' index labels, and NaN for b and c
s1 + s2
a 5.0
a 6.0
a 8.0
a 6.0
a 7.0
a 9.0
b NaN
c NaN
dtype: float64
Boolean selection
# which rows have values that are > 5?
s = pd.Series(np.arange(0, 5), index=list('abcde'))
logical_results = s >= 3
logical_results
a False
b False
c False
d True
e True
dtype: bool
# select where True
s[logical_results]
d 3
e 4
dtype: int64
# a little shorter version
s[s > 5]
Series([], dtype: int64)
# commented as it throws an exception
# s[s >= 2 and s < 5]
# correct syntax
s[(s >=2) & (s < 5)]
c 2
d 3
e 4
dtype: int64
# are all items >= 0?
(s >= 0).all()
True
# any items < 2?
s[s < 2].any()
True
# how many values < 2?
(s < 2).sum()
2
Reindexing a Series
# sample series of five items
np.random.seed(123456)
s = pd.Series(np.random.randn(5))
s
0 0.469112
1 -0.282863
2 -1.509059
3 -1.135632
4 1.212112
dtype: float64
# change the index
s.index = ['a', 'b', 'c', 'd', 'e']
s
a 0.469112
b -0.282863
c -1.509059
d -1.135632
e 1.212112
dtype: float64
# a series that we will reindex
np.random.seed(123456)
s1 = pd.Series(np.random.randn(4), ['a', 'b', 'c', 'd'])
s1
a 0.469112
b -0.282863
c -1.509059
d -1.135632
dtype: float64
# reindex with different number of labels
# results in dropped rows and/or NaN's
s2 = s1.reindex(['a', 'c', 'g'])
s2
a 0.469112
c -1.509059
g NaN
dtype: float64
# different types for the same values of labels
# causes big trouble
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])
s1 + s2
0 NaN
1 NaN
2 NaN
0 NaN
1 NaN
2 NaN
dtype: float64
# reindex by casting the label types
# and we will get the desired result
s2.index = s2.index.values.astype(int)
s1 + s2
0 3
1 5
2 7
dtype: int64
# fill with 0 instead of NaN
s2 = s.copy()
s2.reindex(['a', 'f'], fill_value=0)
a 0.469112
f 0.000000
dtype: float64
# create example to demonstrate fills
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
s3
0 red
3 green
5 blue
dtype: object
# forward fill example
s3.reindex(np.arange(0,7), method='ffill')
0 red
1 red
2 red
3 green
4 green
5 blue
6 blue
dtype: object
# backwards fill example
s3.reindex(np.arange(0,7), method='bfill')
0 red
1 green
2 green
3 green
4 blue
5 blue
6 NaN
dtype: object
Modifying a Series in-place
# generate a Series to play with
np.random.seed(123456)
s = pd.Series(np.random.randn(3), index=['a', 'b', 'c'])
s
a 0.469112
b -0.282863
c -1.509059
dtype: float64
# change a value in the Series
# this is done in-place
# a new Series is not returned that has a modified value
s['d'] = 100
s
a 0.469112
b -0.282863
c -1.509059
d 100.000000
dtype: float64
# modify the value at 'd' in-place
s['d'] = -100
s
a 0.469112
b -0.282863
c -1.509059
d -100.000000
dtype: float64
# remove a row / item
del(s['a'])
s
b -0.282863
c -1.509059
d -100.000000
dtype: float64
copy = s.copy() # preserve s
slice = copy[:2] # slice with first two rows
slice
b -0.282863
c -1.509059
dtype: float64
# change item with label 10 to 1000
slice['b'] = 0
# and see it in the source
copy
b 0.000000
c -1.509059
d -100.000000
dtype: float64
Last updated