2. Running with pandas

Up and Running with pandas

Configuring pandas

# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 80)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

The pandas Series

# create a four item Series
s = pd.Series([1, 2, 3, 4])
s
0    1
1    2
2    3
3    4
dtype: int64
# get value at label 1
s[1]
2
# return a Series with the row with labels 1 and 3
s[[1, 3]]
1    2
3    4
dtype: int64
# create a series using an explicit index
s = pd.Series([1, 2, 3, 4], 
               index = ['a', 'b', 'c', 'd'])
s
a    1
b    2
c    3
d    4
dtype: int64
# look up items the series having index 'a' and 'd'
s[['a', 'd']]
a    1
d    4
dtype: int64
# passing a list of integers to a Series that has
# non-integer index labels will look up based upon
# 0-based index like an array
s[[1, 2]]
b    2
c    3
dtype: int64
# get only the index of the Series
s.index
Index(['a', 'b', 'c', 'd'], dtype='object')
# create a Series who's index is a series of dates
# between the two specified dates (inclusive)
dates = pd.date_range('2016-04-01', '2016-04-06')
dates
DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06'],
              dtype='datetime64[ns]', freq='D')
# create a Series with values (representing temperatures)
# for each date in the index
temps1 = pd.Series([80, 82, 85, 90, 83, 87], 
                   index = dates)
temps1
2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, dtype: int64
# what's the temperation for 2016-4-4?
temps1['2016-04-04']
90
# create a second series of values using the same index
temps2 = pd.Series([70, 75, 69, 83, 79, 77], 
                   index = dates)
# the following aligns the two by their index values
# and calculates the difference at those matching labels
temp_diffs = temps1 - temps2
temp_diffs
2016-04-01    10
2016-04-02     7
2016-04-03    16
2016-04-04     7
2016-04-05     4
2016-04-06    10
Freq: D, dtype: int64
# and also possible by integer position as if the 
# series was an array
temp_diffs[2]
16
# calculate the mean of the values in the Series
temp_diffs.mean()
9.0

The pandas DataFrame

# create a DataFrame from the two series objects temp1 and temp2
# and give them column names
temps_df = pd.DataFrame(
            {'Missoula': temps1, 
             'Philadelphia': temps2})
temps_df
            Missoula  Philadelphia
2016-04-01        80            70
2016-04-02        82            75
2016-04-03        85            69
2016-04-04        90            83
2016-04-05        83            79
2016-04-06        87            77
# get the column with the name Missoula
temps_df['Missoula']
2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, Name: Missoula, dtype: int64
# likewise we can get just the Philadelphia column
temps_df['Philadelphia']
2016-04-01    70
2016-04-02    75
2016-04-03    69
2016-04-04    83
2016-04-05    79
2016-04-06    77
Freq: D, Name: Philadelphia, dtype: int64
# return both columns in a different order
temps_df[['Philadelphia', 'Missoula']]
            Philadelphia  Missoula
2016-04-01            70        80
2016-04-02            75        82
2016-04-03            69        85
2016-04-04            83        90
2016-04-05            79        83
2016-04-06            77        87
# retrieve the Missoula column through property syntax
temps_df.Missoula
2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, Name: Missoula, dtype: int64
# calculate the temperature difference between the two cities
temps_df.Missoula - temps_df.Philadelphia
2016-04-01    10
2016-04-02     7
2016-04-03    16
2016-04-04     7
2016-04-05     4
2016-04-06    10
Freq: D, dtype: int64
# add a column to temp_df which contains the difference in temps
temps_df['Difference'] = temp_diffs
temps_df
            Missoula  Philadelphia  Difference
2016-04-01        80            70          10
2016-04-02        82            75           7
2016-04-03        85            69          16
2016-04-04        90            83           7
2016-04-05        83            79           4
2016-04-06        87            77          10
# get the columns, which is also an Index object
temps_df.columns
Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')
# slice the temp differences column for the rows at 
# location 1 through 4 (as though it is an array)
temps_df.Difference[1:4]
2016-04-02     7
2016-04-03    16
2016-04-04     7
Freq: D, Name: Difference, dtype: int64
# get the row at array position 1
temps_df.iloc[1]
Missoula        82
Philadelphia    75
Difference       7
Name: 2016-04-02 00:00:00, dtype: int64
# the names of the columns have become the index
# they have been 'pivoted'
temps_df.iloc[1].index
Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')
# retrieve row by index label using .loc
temps_df.loc['2016-04-05']
Missoula        83
Philadelphia    79
Difference       4
Name: 2016-04-05 00:00:00, dtype: int64
# get the values in the Differences column in tows 1, 3 and 5
# using 0-based location
temps_df.iloc[[1, 3, 5]].Difference
2016-04-02     7
2016-04-04     7
2016-04-06    10
Freq: 2D, Name: Difference, dtype: int64
# which values in the Missoula column are > 82?
temps_df.Missoula > 82
2016-04-01    False
2016-04-02    False
2016-04-03     True
2016-04-04     True
2016-04-05     True
2016-04-06     True
Freq: D, Name: Missoula, dtype: bool
# return the rows where the temps for Missoula > 82
temps_df[temps_df.Missoula > 82]
            Missoula  Philadelphia  Difference
2016-04-03        85            69          16
2016-04-04        90            83           7
2016-04-05        83            79           4
2016-04-06        87            77          10

Loading data from a CSV file into a DataFrame

# display the contents of test1.csv
# which command to use depends on your OS
!head data/goog.csv # on non-windows systems
#!type data/test1.csv # on windows systems, all lines
Date,Open,High,Low,Close,Volume
12/19/2016,790.219971,797.659973,786.27002,794.200012,1225900
12/20/2016,796.76001,798.650024,793.27002,796.419983,925100
12/21/2016,795.840027,796.676025,787.099976,794.559998,1208700
12/22/2016,792.359985,793.320007,788.580017,791.26001,969100
12/23/2016,790.900024,792.73999,787.280029,789.909973,623400
12/27/2016,790.679993,797.859985,787.656982,791.549988,789100
12/28/2016,793.700012,794.22998,783.200012,785.049988,1132700
12/29/2016,783.330017,785.929993,778.919983,782.789978,742200
12/30/2016,782.75,782.780029,770.409973,771.820007,1760200
# read the contents of the file into a DataFrame
df = pd.read_csv('data/goog.csv')
df
          Date        Open        High         Low       Close   Volume
0   12/19/2016  790.219971  797.659973  786.270020  794.200012  1225900
1   12/20/2016  796.760010  798.650024  793.270020  796.419983   925100
2   12/21/2016  795.840027  796.676025  787.099976  794.559998  1208700
3   12/22/2016  792.359985  793.320007  788.580017  791.260010   969100
4   12/23/2016  790.900024  792.739990  787.280029  789.909973   623400
..         ...         ...         ...         ...         ...      ...
56   3/13/2017  844.000000  848.684998  843.250000  845.539978  1149500
57   3/14/2017  843.640015  847.239990  840.799988  845.619995   779900
58   3/15/2017  847.590027  848.630005  840.770020  847.200012  1379600
59   3/16/2017  849.030029  850.849976  846.130005  848.780029   970400
60   3/17/2017  851.609985  853.400024  847.109985  852.119995  1712300

[61 rows x 6 columns]
# the contents of the date column
df.Date
0     12/19/2016
1     12/20/2016
2     12/21/2016
3     12/22/2016
4     12/23/2016
         ...    
56     3/13/2017
57     3/14/2017
58     3/15/2017
59     3/16/2017
60     3/17/2017
Name: Date, Length: 61, dtype: object
# we can get the first value in the date column
df.Date[0]
'12/19/2016'
# it is a string
type(df.Date[0])
str
# read the data and tell pandas the date column should be 
# a date in the resulting DataFrame
df = pd.read_csv('data/goog.csv', parse_dates=['Date'])
df
         Date        Open        High         Low       Close   Volume
0  2016-12-19  790.219971  797.659973  786.270020  794.200012  1225900
1  2016-12-20  796.760010  798.650024  793.270020  796.419983   925100
2  2016-12-21  795.840027  796.676025  787.099976  794.559998  1208700
3  2016-12-22  792.359985  793.320007  788.580017  791.260010   969100
4  2016-12-23  790.900024  792.739990  787.280029  789.909973   623400
..        ...         ...         ...         ...         ...      ...
56 2017-03-13  844.000000  848.684998  843.250000  845.539978  1149500
57 2017-03-14  843.640015  847.239990  840.799988  845.619995   779900
58 2017-03-15  847.590027  848.630005  840.770020  847.200012  1379600
59 2017-03-16  849.030029  850.849976  846.130005  848.780029   970400
60 2017-03-17  851.609985  853.400024  847.109985  852.119995  1712300

[61 rows x 6 columns]
# verify the type now is date
# in pandas, this is actually a Timestamp
type(df.Date[0])
pandas._libs.tslib.Timestamp
# unfortunately the index is numeric which makes
# accessing data by date more complicated
df.index
RangeIndex(start=0, stop=61, step=1)
# read in again, now specity the data column as being the 
# index of the resulting DataFrame
df = pd.read_csv('data/goog.csv', 
                 parse_dates=['Date'], 
                 index_col='Date')
df
                  Open        High         Low       Close   Volume
Date                                                               
2016-12-19  790.219971  797.659973  786.270020  794.200012  1225900
2016-12-20  796.760010  798.650024  793.270020  796.419983   925100
2016-12-21  795.840027  796.676025  787.099976  794.559998  1208700
2016-12-22  792.359985  793.320007  788.580017  791.260010   969100
2016-12-23  790.900024  792.739990  787.280029  789.909973   623400
...                ...         ...         ...         ...      ...
2017-03-13  844.000000  848.684998  843.250000  845.539978  1149500
2017-03-14  843.640015  847.239990  840.799988  845.619995   779900
2017-03-15  847.590027  848.630005  840.770020  847.200012  1379600
2017-03-16  849.030029  850.849976  846.130005  848.780029   970400
2017-03-17  851.609985  853.400024  847.109985  852.119995  1712300

[61 rows x 5 columns]
# and the index is now a DatetimeIndex
df.index
DatetimeIndex(['2016-12-19', '2016-12-20', '2016-12-21', '2016-12-22',
               '2016-12-23', '2016-12-27', '2016-12-28', '2016-12-29',
               '2016-12-30', '2017-01-03', '2017-01-04', '2017-01-05',
               '2017-01-06', '2017-01-09', '2017-01-10', '2017-01-11',
               '2017-01-12', '2017-01-13', '2017-01-17', '2017-01-18',
               '2017-01-19', '2017-01-20', '2017-01-23', '2017-01-24',
               '2017-01-25', '2017-01-26', '2017-01-27', '2017-01-30',
               '2017-01-31', '2017-02-01', '2017-02-02', '2017-02-03',
               '2017-02-06', '2017-02-07', '2017-02-08', '2017-02-09',
               '2017-02-10', '2017-02-13', '2017-02-14', '2017-02-15',
               '2017-02-16', '2017-02-17', '2017-02-21', '2017-02-22',
               '2017-02-23', '2017-02-24', '2017-02-27', '2017-02-28',
               '2017-03-01', '2017-03-02', '2017-03-03', '2017-03-06',
               '2017-03-07', '2017-03-08', '2017-03-09', '2017-03-10',
               '2017-03-13', '2017-03-14', '2017-03-15', '2017-03-16',
               '2017-03-17'],
              dtype='datetime64[ns]', name='Date', freq=None)

Visualization

# plots the values in the Close column
df.Close.plot();

Last updated