5. Manipulating DataFrame Structure
การจัดโครงสร้าง Dataframe
Configuring pandas
# import numpy and pandas
import numpy as np
import pandas as pd
# used for dates
import datetime
from datetime import datetime, date
# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 60)
# read in the data and print the first five rows
# use the Symbol column as the index, and
# only read in columns in positions 0, 2, 3, 7
sp500 = pd.read_csv("data/sp500.csv",
index_col='Symbol',
usecols=[0, 2, 3, 7])
Renaming columns
# rename the Book Value column to not have a space
# this returns a copy with the column renamed
newSP500 = sp500.rename(columns=
{'Book Value': 'BookValue'})
# print first 2 rows
newSP500[:2]
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
# verify the columns in the original did not change
sp500.columns
Index(['Sector', 'Price', 'Book Value'], dtype='object')
# this changes the column in-place
sp500.rename(columns=
{'Book Value': 'BookValue'},
inplace=True)
# we can see the column is changed
sp500.columns
Index(['Sector', 'Price', 'BookValue'], dtype='object')
# and now we can use .BookValue
sp500.BookValue[:5]
Symbol
MMM 26.668
ABT 15.573
ABBV 2.954
ACN 8.326
ACE 86.897
Name: BookValue, dtype: float64
Adding new columns with [] and .insert()
# make a copy so that we keep the original data unchanged
sp500_copy = sp500.copy()
# add the new column
sp500_copy['RoundedPrice'] = sp500.Price.round()
sp500_copy[:2]
Sector Price BookValue RoundedPrice
Symbol
MMM Industrials 141.14 26.668 141.0
ABT Health Care 39.60 15.573 40.0
# make a copy so that we keep the original data unchanged
copy = sp500.copy()
# insert sp500.Price * 2 as the
# second column in the DataFrame
copy.insert(1, 'RoundedPrice', sp500.Price.round())
copy[:2]
Sector RoundedPrice Price BookValue
Symbol
MMM Industrials 141.0 141.14 26.668
ABT Health Care 40.0 39.60 15.573
Adding columns through enlargement
# copy of subset / slice
ss = sp500[:3].copy()
# add the new column initialized to 0
ss.loc[:,'PER'] = 0
# take a look at the results
ss
Sector Price BookValue PER
Symbol
MMM Industrials 141.14 26.668 0
ABT Health Care 39.60 15.573 0
ABBV Health Care 53.95 2.954 0
# copy of subset / slice
ss = sp500[:3].copy()
# add the new column initialized with random numbers
np.random.seed(123456)
ss.loc[:,'PER'] = pd.Series(np.random.normal(size=3), index=ss.index)
# take a look at the results
ss
Sector Price BookValue PER
Symbol
MMM Industrials 141.14 26.668 0.469112
ABT Health Care 39.60 15.573 -0.282863
ABBV Health Care 53.95 2.954 -1.509059
Adding columns using concatenation
# create a DataFrame with only the RoundedPrice column
rounded_price = pd.DataFrame({'RoundedPrice':
sp500.Price.round()})
# concatenate along the columns axis
concatenated = pd.concat([sp500, rounded_price], axis=1)
concatenated[:5]
Sector Price BookValue \
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
ACN Information Technology 79.79 8.326
ACE Financials 102.91 86.897
RoundedPrice
Symbol
MMM 141.0
ABT 40.0
ABBV 54.0
ACN 80.0
ACE 103.0
# create a DataFrame with only the RoundedPrice column
rounded_price = pd.DataFrame({'Price': sp500.Price.round()})
rounded_price[:5]
Price
Symbol
MMM 141.0
ABT 40.0
ABBV 54.0
ACN 80.0
ACE 103.0
# this will result in duplicate Price columm
dups = pd.concat([sp500, rounded_price], axis=1)
dups[:5]
Sector Price BookValue Price
Symbol
MMM Industrials 141.14 26.668 141.0
ABT Health Care 39.60 15.573 40.0
ABBV Health Care 53.95 2.954 54.0
ACN Information Technology 79.79 8.326 80.0
ACE Financials 102.91 86.897 103.0
# retrieves both Price columns
dups.Price[:5]
Price Price
Symbol
MMM 141.14 141.0
ABT 39.60 40.0
ABBV 53.95 54.0
ACN 79.79 80.0
ACE 102.91 103.0
Reordering columns
# return a new DataFrame with the columns reversed
reversed_column_names = sp500.columns[::-1]
sp500[reversed_column_names][:5]
BookValue Price Sector
Symbol
MMM 26.668 141.14 Industrials
ABT 15.573 39.60 Health Care
ABBV 2.954 53.95 Health Care
ACN 8.326 79.79 Information Technology
ACE 86.897 102.91 Financials
Replacing the contents of a column
# this occurs in-place so let's use a copy
copy = sp500.copy()
# replace the Price column data with the new values
# instead of adding a new column
copy.Price = rounded_price.Price
copy[:5]
Sector Price BookValue
Symbol
MMM Industrials 141.0 26.668
ABT Health Care 40.0 15.573
ABBV Health Care 54.0 2.954
ACN Information Technology 80.0 8.326
ACE Financials 103.0 86.897
# this occurs in-place so let's use a copy
copy = sp500.copy()
# replace the Price column data wwith rounded values
copy.loc[:,'Price'] = rounded_price.Price
copy[:5]
Sector Price BookValue
Symbol
MMM Industrials 141.0 26.668
ABT Health Care 40.0 15.573
ABBV Health Care 54.0 2.954
ACN Information Technology 80.0 8.326
ACE Financials 103.0 86.897
Deleting columns
# Example of using del to delete a column
# make a copy as this is done in-place
copy = sp500.copy()
del copy['BookValue']
copy[:2]
Sector Price
Symbol
MMM Industrials 141.14
ABT Health Care 39.60
# Example of using pop to remove a column from a DataFrame
# first make a copy of a subset of the data frame as
# pop works in place
copy = sp500.copy()
# this will remove Sector and return it as a series
popped = copy.pop('Sector')
# Sector column removed in-place
copy[:2]
Price BookValue
Symbol
MMM 141.14 26.668
ABT 39.60 15.573
# and we have the Sector column as the result of the pop
popped[:5]
Symbol
MMM Industrials
ABT Health Care
ABBV Health Care
ACN Information Technology
ACE Financials
Name: Sector, dtype: object
# Example of using drop to remove a column
# make a copy of a subset of the data frame
copy = sp500.copy()
# this will return a new DataFrame with 'Sector’ removed
# the copy DataFrame is not modified
afterdrop = copy.drop(['Sector'], axis = 1)
afterdrop[:5]
Price BookValue
Symbol
MMM 141.14 26.668
ABT 39.60 15.573
ABBV 53.95 2.954
ACN 79.79 8.326
ACE 102.91 86.897
Appending rows from other DataFrame objects with .append()
# copy the first three rows of sp500
df1 = sp500.iloc[0:3].copy()
# copy 10th and 11th rows
df2 = sp500.iloc[[10, 11, 2]]
# append df1 and df2
appended = df1.append(df2)
# the result is the rows of the first followed by
# those of the second
appended
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
A Health Care 56.18 16.928
GAS Utilities 52.98 32.462
ABBV Health Care 53.95 2.954
# data frame using df1.index and just a PER column
# also a good example of using a scalar value
# to initialize multiple rows
df3 = pd.DataFrame(0.0,
index=df1.index,
columns=['PER'])
df3
PER
Symbol
MMM 0.0
ABT 0.0
ABBV 0.0
# append df1 and df3
# each has three rows, so 6 rows is the result
# df1 had no PER column, so NaN from for those rows
# df3 had no BookValue, Price or Sector, so NaN's
df1.append(df3)
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\pandas\core\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
sort=sort)
BookValue PER Price Sector
Symbol
MMM 26.668 NaN 141.14 Industrials
ABT 15.573 NaN 39.60 Health Care
ABBV 2.954 NaN 53.95 Health Care
MMM NaN 0.0 NaN NaN
ABT NaN 0.0 NaN NaN
ABBV NaN 0.0 NaN NaN
# ignore index labels, create default index
df1.append(df3, ignore_index=True)
BookValue PER Price Sector
0 26.668 NaN 141.14 Industrials
1 15.573 NaN 39.60 Health Care
2 2.954 NaN 53.95 Health Care
3 NaN 0.0 NaN NaN
4 NaN 0.0 NaN NaN
5 NaN 0.0 NaN NaN
Concatenating rows
# copy the first three rows of sp500
df1 = sp500.iloc[0:3].copy()
# copy 10th and 11th rows
df2 = sp500.iloc[[10, 11, 2]]
# pass them as a list
pd.concat([df1, df2])
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
A Health Care 56.18 16.928
GAS Utilities 52.98 32.462
ABBV Health Care 53.95 2.954
# copy df2
df2_2 = df2.copy()
# add a column to df2_2 that is not in df1
df2_2.insert(3, 'Foo', pd.Series(0, index=df2.index))
# see what it looks like
df2_2
Sector Price BookValue Foo
Symbol
A Health Care 56.18 16.928 0
GAS Utilities 52.98 32.462 0
ABBV Health Care 53.95 2.954 0
# now concatenate
pd.concat([df1, df2_2])
c:\users\user\appdata\local\programs\python\python37-32\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
BookValue Foo Price Sector
Symbol
MMM 26.668 NaN 141.14 Industrials
ABT 15.573 NaN 39.60 Health Care
ABBV 2.954 NaN 53.95 Health Care
A 16.928 0.0 56.18 Health Care
GAS 32.462 0.0 52.98 Utilities
ABBV 2.954 0.0 53.95 Health Care
# specify keys
r = pd.concat([df1, df2_2], keys=['df1', 'df2'])
r
BookValue Foo Price Sector
Symbol
df1 MMM 26.668 NaN 141.14 Industrials
ABT 15.573 NaN 39.60 Health Care
ABBV 2.954 NaN 53.95 Health Care
df2 A 16.928 0.0 56.18 Health Care
GAS 32.462 0.0 52.98 Utilities
ABBV 2.954 0.0 53.95 Health Care
Adding and replacing rows via setting with enlargement
# get a small subset of the sp500
# make sure to copy the slice to make a copy
ss = sp500[:3].copy()
# create a new row with index label FOO
# and assign some values to the columns via a list
ss.loc['FOO'] = ['the sector', 100, 110]
ss
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
FOO the sector 100.00 110.000
Removing rows using .drop()
# get a copy of the first 5 rows of sp500
ss = sp500[:5]
ss
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
ACN Information Technology 79.79 8.326
ACE Financials 102.91 86.897
# drop rows with labels ABT and ACN
afterdrop = ss.drop(['ABT', 'ACN'])
afterdrop[:5]
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABBV Health Care 53.95 2.954
ACE Financials 102.91 86.897
Removing rows using Boolean selection
# determine the rows where Price > 300
selection = sp500.Price > 300
# report number of rows and number that will be dropped
(len(selection), selection.sum())
(500, 10)
# select the complement of the expression
# note the use of the complement of the selection
price_less_than_300 = sp500[~selection]
price_less_than_300
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
ACN Information Technology 79.79 8.326
ACE Financials 102.91 86.897
... ... ... ...
YHOO Information Technology 35.02 12.768
YUM Consumer Discretionary 74.77 5.147
ZMH Health Care 101.84 37.181
ZION Financials 28.43 30.191
ZTS Health Care 30.53 2.150
[490 rows x 3 columns]
Removing rows using a slice
# get only the first three rows
only_first_three = sp500[:3]
only_first_three
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
# first three, but a copy of them
only_first_three = sp500[:3].copy()
only_first_three
Sector Price BookValue
Symbol
MMM Industrials 141.14 26.668
ABT Health Care 39.60 15.573
ABBV Health Care 53.95 2.954
Last updated