4. Create DataFrame

Table and MultiVariate Data with The DataFrame

Configuring pandas

# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 80)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

Creating a DataFrame using NumPy function results

# From a 1-d array
pd.DataFrame(np.arange(1, 6))
   0
0  1
1  2
2  3
3  4
4  5
# create a DataFrame from a 2-d ndarray
df = pd.DataFrame(np.array([[10, 11], [20, 21]]))
df
    0   1
0  10  11
1  20  21
# retrieve the columns index
df.columns
RangeIndex(start=0, stop=2, step=1)
# specify column names
df = pd.DataFrame(np.array([[70, 71], [90, 91]]),
                  columns=['Missoula', 'Philadelphia'])
df
   Missoula  Philadelphia
0        70            71
1        90            91
# how many rows?
len(df)
2
# what is the dimensionality
df.shape
(2, 2)

Creating a DataFrame using a Python dictionary and pandas Series objects

# initialization using a python dictionary
temps_missoula = [70, 71]
temps_philly = [90, 91]
temperatures = {'Missoula': temps_missoula,
                'Philadelphia': temps_philly}
pd.DataFrame(temperatures)
   Missoula  Philadelphia
0        70            90
1        71            91
# create a DataFrame for a list of Series objects
temps_at_time0 = pd.Series([70, 90])
temps_at_time1 = pd.Series([71, 91])
df = pd.DataFrame([temps_at_time0, temps_at_time1])
df
    0   1
0  70  90
1  71  91
# try to specify column names
df = pd.DataFrame([temps_at_time0, temps_at_time1],
                  columns=['Missoula', 'Philadelphia'])
df
   Missoula  Philadelphia
0       NaN           NaN
1       NaN           NaN
# specify names of columns after creation
df = pd.DataFrame([temps_at_time0, temps_at_time1])
df.columns = ['Missoula', 'Philadelphia']
df
   Missoula  Philadelphia
0        70            90
1        71            91
# construct using a dict of Series objects
temps_mso_series = pd.Series(temps_missoula)
temps_phl_series = pd.Series(temps_philly)
df = pd.DataFrame({'Missoula': temps_mso_series,
                   'Philadelphia': temps_phl_series})
df
   Missoula  Philadelphia
0        70            90
1        71            91
# alignment occurs during creation
temps_nyc_series = pd.Series([85, 87], index=[1, 2])
df = pd.DataFrame({'Missoula': temps_mso_series,
                   'Philadelphia': temps_phl_series,
                   'New York': temps_nyc_series})
df
   Missoula  Philadelphia  New York
0      70.0          90.0       NaN
1      71.0          91.0      85.0
2       NaN           NaN      87.0

Creating a DataFrame from a CSV file

# read in the data and print the first five rows
# use the Symbol column as the index, and 
# only read in columns in positions 0, 2, 3, 7
sp500 = pd.read_csv("data/sp500.csv", 
                    index_col='Symbol', 
                    usecols=[0, 2, 3, 7])
# peek at the first 5 rows of the data using .head()
sp500.head()
                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897
# how many rows of data?  Should be 500
len(sp500)
500
# what is the shape?
sp500.shape
(500, 3)
# what is the size?
sp500.size
1500
# examine the index
sp500.index
Index(['MMM', 'ABT', 'ABBV', 'ACN', 'ACE', 'ACT', 'ADBE', 'AES', 'AET', 'AFL',
       ...
       'XEL', 'XRX', 'XLNX', 'XL', 'XYL', 'YHOO', 'YUM', 'ZMH', 'ZION', 'ZTS'],
      dtype='object', name='Symbol', length=500)
# get the columns
sp500.columns
Index(['Sector', 'Price', 'Book Value'], dtype='object')

Selecting columns of a DataFrame

# retrieve the Sector column
sp500['Sector'].head()
Symbol
MMM                Industrials
ABT                Health Care
ABBV               Health Care
ACN     Information Technology
ACE                 Financials
Name: Sector, dtype: object
type(sp500['Sector'])
pandas.core.series.Series
# retrieve the Price and Book Value columns
sp500[['Price', 'Book Value']].head()
         Price  Book Value
Symbol                    
MMM     141.14      26.668
ABT      39.60      15.573
ABBV     53.95       2.954
ACN      79.79       8.326
ACE     102.91      86.897
# show that this is a DataFrame
type(sp500[['Price', 'Book Value']])
pandas.core.frame.DataFrame
# attribute access of column by name
sp500.Price
Symbol
MMM     141.14
ABT      39.60
ABBV     53.95
ACN      79.79
ACE     102.91
         ...  
YHOO     35.02
YUM      74.77
ZMH     101.84
ZION     28.43
ZTS      30.53
Name: Price, Length: 500, dtype: float64

Selecting rows of a DataFrame

# get row with label MMM
# returned as a Series
sp500.loc['MMM']
Sector        Industrials
Price              141.14
Book Value         26.668
Name: MMM, dtype: object
# rows with label MMM and MSFT
# this is a DataFrame result
sp500.loc[['MMM', 'MSFT']]
                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
MSFT    Information Technology   40.12      10.584
# get rows in location 0 and 2
sp500.iloc[[0, 2]]
             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
ABBV    Health Care   53.95       2.954
# get the location of MMM and A in the index
i1 = sp500.index.get_loc('MMM')
i2 = sp500.index.get_loc('A')
(i1, i2)
(0, 10)
# and get the rows
sp500.iloc[[i1, i2]]
             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
A       Health Care   56.18      16.928

Scalar lookup by label or location using .at[] and .iat[]

# by label in both the index and column
sp500.at['MMM', 'Price']
141.14
# by location.  Row 0, column 1
sp500.iat[0, 1]
141.14

Slicing using the [] operator

# first five rows
sp500[:5]
                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897
# ABT through ACN labels
sp500['ABT':'ACN']
                        Sector  Price  Book Value
Symbol                                           
ABT                Health Care  39.60      15.573
ABBV               Health Care  53.95       2.954
ACN     Information Technology  79.79       8.326

Selecting rows using Boolean selection

# what rows have a price < 100?
sp500.Price < 100
Symbol
MMM     False
ABT      True
ABBV     True
ACN      True
ACE     False
        ...  
YHOO     True
YUM      True
ZMH     False
ZION     True
ZTS      True
Name: Price, Length: 500, dtype: bool
# now get the rows with Price < 100
sp500[sp500.Price < 100]
                        Sector  Price  Book Value
Symbol                                           
ABT                Health Care  39.60      15.573
ABBV               Health Care  53.95       2.954
ACN     Information Technology  79.79       8.326
ADBE    Information Technology  64.30      13.262
AES                  Utilities  13.61       5.781
...                        ...    ...         ...
XYL                Industrials  38.42      12.127
YHOO    Information Technology  35.02      12.768
YUM     Consumer Discretionary  74.77       5.147
ZION                Financials  28.43      30.191
ZTS                Health Care  30.53       2.150

[407 rows x 3 columns]
# get only the Price where Price is < 10 and > 0
r = sp500[(sp500.Price < 10) & 
          (sp500.Price > 6)] ['Price']
r
Symbol
HCBK    9.80
HBAN    9.10
SLM     8.82
WIN     9.38
Name: Price, dtype: float64
# price > 100 and in the Health Care Sector
r = sp500[(sp500.Sector == 'Health Care') & 
          (sp500.Price > 100.00)] [['Price', 'Sector']]
r
         Price       Sector
Symbol                     
ACT     213.77  Health Care
ALXN    162.30  Health Care
AGN     166.92  Health Care
AMGN    114.33  Health Care
BCR     146.62  Health Care
...        ...          ...
REGN    297.77  Health Care
TMO     115.74  Health Care
WAT     100.54  Health Care
WLP     108.82  Health Care
ZMH     101.84  Health Care

[19 rows x 2 columns]

Selecting across both rows and columns

# select the price and sector columns for ABT and ZTS
sp500.loc[['ABT', 'ZTS']][['Sector', 'Price']]
             Sector  Price
Symbol                    
ABT     Health Care  39.60
ZTS     Health Care  30.53

Last updated