5. Manipulating DataFrame Structure

การจัดโครงสร้าง Dataframe

Configuring pandas

# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 60)

# read in the data and print the first five rows
# use the Symbol column as the index, and 
# only read in columns in positions 0, 2, 3, 7
sp500 = pd.read_csv("data/sp500.csv", 
                    index_col='Symbol', 
                    usecols=[0, 2, 3, 7])

Renaming columns

# rename the Book Value column to not have a space
# this returns a copy with the column renamed
newSP500 = sp500.rename(columns=
                        {'Book Value': 'BookValue'})
# print first 2 rows
newSP500[:2]
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
# verify the columns in the original did not change
sp500.columns
Index(['Sector', 'Price', 'Book Value'], dtype='object')
# this changes the column in-place
sp500.rename(columns=                  
             {'Book Value': 'BookValue'},                   
             inplace=True)
# we can see the column is changed
sp500.columns
Index(['Sector', 'Price', 'BookValue'], dtype='object')
# and now we can use .BookValue
sp500.BookValue[:5]
Symbol
MMM     26.668
ABT     15.573
ABBV     2.954
ACN      8.326
ACE     86.897
Name: BookValue, dtype: float64

Adding new columns with [] and .insert()

# make a copy so that we keep the original data unchanged
sp500_copy = sp500.copy()
# add the new column
sp500_copy['RoundedPrice'] = sp500.Price.round()
sp500_copy[:2]
             Sector   Price  BookValue  RoundedPrice
Symbol                                              
MMM     Industrials  141.14     26.668         141.0
ABT     Health Care   39.60     15.573          40.0
# make a copy so that we keep the original data unchanged
copy = sp500.copy()
# insert sp500.Price * 2 as the 
# second column in the DataFrame
copy.insert(1, 'RoundedPrice', sp500.Price.round())
copy[:2]
             Sector  RoundedPrice   Price  BookValue
Symbol                                              
MMM     Industrials         141.0  141.14     26.668
ABT     Health Care          40.0   39.60     15.573

Adding columns through enlargement

# copy of subset / slice
ss = sp500[:3].copy()
# add the new column initialized to 0
ss.loc[:,'PER'] = 0
# take a look at the results
ss
             Sector   Price  BookValue  PER
Symbol                                     
MMM     Industrials  141.14     26.668    0
ABT     Health Care   39.60     15.573    0
ABBV    Health Care   53.95      2.954    0
# copy of subset / slice
ss = sp500[:3].copy()
# add the new column initialized with random numbers
np.random.seed(123456)
ss.loc[:,'PER'] = pd.Series(np.random.normal(size=3), index=ss.index)
# take a look at the results
ss
             Sector   Price  BookValue       PER
Symbol                                          
MMM     Industrials  141.14     26.668  0.469112
ABT     Health Care   39.60     15.573 -0.282863
ABBV    Health Care   53.95      2.954 -1.509059

Adding columns using concatenation

# create a DataFrame with only the RoundedPrice column
rounded_price = pd.DataFrame({'RoundedPrice':    
                              sp500.Price.round()})
# concatenate along the columns axis
concatenated = pd.concat([sp500, rounded_price], axis=1)
concatenated[:5]
                        Sector   Price  BookValue  \
Symbol                                              
MMM                Industrials  141.14     26.668   
ABT                Health Care   39.60     15.573   
ABBV               Health Care   53.95      2.954   
ACN     Information Technology   79.79      8.326   
ACE                 Financials  102.91     86.897   

        RoundedPrice  
Symbol                
MMM            141.0  
ABT             40.0  
ABBV            54.0  
ACN             80.0  
ACE            103.0  
# create a DataFrame with only the RoundedPrice column
rounded_price = pd.DataFrame({'Price': sp500.Price.round()})
rounded_price[:5]
        Price
Symbol       
MMM     141.0
ABT      40.0
ABBV     54.0
ACN      80.0
ACE     103.0
# this will result in duplicate Price columm
dups = pd.concat([sp500, rounded_price], axis=1)
dups[:5]
                        Sector   Price  BookValue  Price
Symbol                                                  
MMM                Industrials  141.14     26.668  141.0
ABT                Health Care   39.60     15.573   40.0
ABBV               Health Care   53.95      2.954   54.0
ACN     Information Technology   79.79      8.326   80.0
ACE                 Financials  102.91     86.897  103.0
# retrieves both Price columns
dups.Price[:5]
         Price  Price
Symbol               
MMM     141.14  141.0
ABT      39.60   40.0
ABBV     53.95   54.0
ACN      79.79   80.0
ACE     102.91  103.0

Reordering columns

# return a new DataFrame with the columns reversed
reversed_column_names = sp500.columns[::-1]
sp500[reversed_column_names][:5]
        BookValue   Price                  Sector
Symbol                                           
MMM        26.668  141.14             Industrials
ABT        15.573   39.60             Health Care
ABBV        2.954   53.95             Health Care
ACN         8.326   79.79  Information Technology
ACE        86.897  102.91              Financials

Replacing the contents of a column

# this occurs in-place so let's use a copy
copy = sp500.copy()
# replace the Price column data with the new values
# instead of adding a new column
copy.Price = rounded_price.Price
copy[:5]
                        Sector  Price  BookValue
Symbol                                          
MMM                Industrials  141.0     26.668
ABT                Health Care   40.0     15.573
ABBV               Health Care   54.0      2.954
ACN     Information Technology   80.0      8.326
ACE                 Financials  103.0     86.897
# this occurs in-place so let's use a copy
copy = sp500.copy()
# replace the Price column data wwith rounded values
copy.loc[:,'Price'] = rounded_price.Price
copy[:5]
                        Sector  Price  BookValue
Symbol                                          
MMM                Industrials  141.0     26.668
ABT                Health Care   40.0     15.573
ABBV               Health Care   54.0      2.954
ACN     Information Technology   80.0      8.326
ACE                 Financials  103.0     86.897

Deleting columns

# Example of using del to delete a column
# make a copy as this is done in-place
copy = sp500.copy()
del copy['BookValue']
copy[:2]
             Sector   Price
Symbol                     
MMM     Industrials  141.14
ABT     Health Care   39.60
# Example of using pop to remove a column from a DataFrame
# first make a copy of a subset of the data frame as
# pop works in place
copy = sp500.copy()
# this will remove Sector and return it as a series
popped = copy.pop('Sector')
# Sector column removed in-place
copy[:2]
         Price  BookValue
Symbol                   
MMM     141.14     26.668
ABT      39.60     15.573
# and we have the Sector column as the result of the pop
popped[:5]
Symbol
MMM                Industrials
ABT                Health Care
ABBV               Health Care
ACN     Information Technology
ACE                 Financials
Name: Sector, dtype: object
# Example of using drop to remove a column 
# make a copy of a subset of the data frame
copy = sp500.copy()
# this will return a new DataFrame with 'Sector’ removed
# the copy DataFrame is not modified
afterdrop = copy.drop(['Sector'], axis = 1)
afterdrop[:5]
         Price  BookValue
Symbol                   
MMM     141.14     26.668
ABT      39.60     15.573
ABBV     53.95      2.954
ACN      79.79      8.326
ACE     102.91     86.897

Appending rows from other DataFrame objects with .append()

# copy the first three rows of sp500
df1 = sp500.iloc[0:3].copy()
# copy 10th and 11th rows
df2 = sp500.iloc[[10, 11, 2]]
# append df1 and df2
appended = df1.append(df2)
# the result is the rows of the first followed by 
# those of the second
appended
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
ABBV    Health Care   53.95      2.954
A       Health Care   56.18     16.928
GAS       Utilities   52.98     32.462
ABBV    Health Care   53.95      2.954
# data frame using df1.index and just a PER column
# also a good example of using a scalar value
# to initialize multiple rows
df3 = pd.DataFrame(0.0, 
                   index=df1.index,
                   columns=['PER'])
df3
        PER
Symbol     
MMM     0.0
ABT     0.0
ABBV    0.0
# append df1 and df3
# each has three rows, so 6 rows is the result
# df1 had no PER column, so NaN from for those rows
# df3 had no BookValue, Price or Sector, so NaN's
df1.append(df3)
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\pandas\core\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)





        BookValue  PER   Price       Sector
Symbol                                     
MMM        26.668  NaN  141.14  Industrials
ABT        15.573  NaN   39.60  Health Care
ABBV        2.954  NaN   53.95  Health Care
MMM           NaN  0.0     NaN          NaN
ABT           NaN  0.0     NaN          NaN
ABBV          NaN  0.0     NaN          NaN
# ignore index labels, create default index
df1.append(df3, ignore_index=True)
   BookValue  PER   Price       Sector
0     26.668  NaN  141.14  Industrials
1     15.573  NaN   39.60  Health Care
2      2.954  NaN   53.95  Health Care
3        NaN  0.0     NaN          NaN
4        NaN  0.0     NaN          NaN
5        NaN  0.0     NaN          NaN

Concatenating rows

# copy the first three rows of sp500
df1 = sp500.iloc[0:3].copy()
# copy 10th and 11th rows
df2 = sp500.iloc[[10, 11, 2]]
# pass them as a list
pd.concat([df1, df2])
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
ABBV    Health Care   53.95      2.954
A       Health Care   56.18     16.928
GAS       Utilities   52.98     32.462
ABBV    Health Care   53.95      2.954
# copy df2
df2_2 = df2.copy()
# add a column to df2_2 that is not in df1
df2_2.insert(3, 'Foo', pd.Series(0, index=df2.index))
# see what it looks like
df2_2
             Sector  Price  BookValue  Foo
Symbol                                    
A       Health Care  56.18     16.928    0
GAS       Utilities  52.98     32.462    0
ABBV    Health Care  53.95      2.954    0
# now concatenate
pd.concat([df1, df2_2])
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.







        BookValue  Foo   Price       Sector
Symbol                                     
MMM        26.668  NaN  141.14  Industrials
ABT        15.573  NaN   39.60  Health Care
ABBV        2.954  NaN   53.95  Health Care
A          16.928  0.0   56.18  Health Care
GAS        32.462  0.0   52.98    Utilities
ABBV        2.954  0.0   53.95  Health Care
# specify keys
r = pd.concat([df1, df2_2], keys=['df1', 'df2'])
r
            BookValue  Foo   Price       Sector
    Symbol                                     
df1 MMM        26.668  NaN  141.14  Industrials
    ABT        15.573  NaN   39.60  Health Care
    ABBV        2.954  NaN   53.95  Health Care
df2 A          16.928  0.0   56.18  Health Care
    GAS        32.462  0.0   52.98    Utilities
    ABBV        2.954  0.0   53.95  Health Care

Adding and replacing rows via setting with enlargement

# get a small subset of the sp500 
# make sure to copy the slice to make a copy
ss = sp500[:3].copy()
# create a new row with index label FOO
# and assign some values to the columns via a list
ss.loc['FOO'] = ['the sector', 100, 110]
ss
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
ABBV    Health Care   53.95      2.954
FOO      the sector  100.00    110.000

Removing rows using .drop()

# get a copy of the first 5 rows of sp500
ss = sp500[:5]
ss
                        Sector   Price  BookValue
Symbol                                           
MMM                Industrials  141.14     26.668
ABT                Health Care   39.60     15.573
ABBV               Health Care   53.95      2.954
ACN     Information Technology   79.79      8.326
ACE                 Financials  102.91     86.897
# drop rows with labels ABT and ACN
afterdrop = ss.drop(['ABT', 'ACN'])
afterdrop[:5]
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABBV    Health Care   53.95      2.954
ACE      Financials  102.91     86.897

Removing rows using Boolean selection

# determine the rows where Price > 300
selection = sp500.Price > 300
# report number of rows and number that will be dropped
(len(selection), selection.sum())
(500, 10)
# select the complement of the expression
# note the use of the complement of the selection
price_less_than_300 = sp500[~selection]
price_less_than_300
                        Sector   Price  BookValue
Symbol                                           
MMM                Industrials  141.14     26.668
ABT                Health Care   39.60     15.573
ABBV               Health Care   53.95      2.954
ACN     Information Technology   79.79      8.326
ACE                 Financials  102.91     86.897
...                        ...     ...        ...
YHOO    Information Technology   35.02     12.768
YUM     Consumer Discretionary   74.77      5.147
ZMH                Health Care  101.84     37.181
ZION                Financials   28.43     30.191
ZTS                Health Care   30.53      2.150

[490 rows x 3 columns]

Removing rows using a slice

# get only the first three rows
only_first_three = sp500[:3]
only_first_three
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
ABBV    Health Care   53.95      2.954
# first three, but a copy of them
only_first_three = sp500[:3].copy()
only_first_three
             Sector   Price  BookValue
Symbol                                
MMM     Industrials  141.14     26.668
ABT     Health Care   39.60     15.573
ABBV    Health Care   53.95      2.954

Last updated