# Data wrangling

import pandas as pd
import numpy as np

# Hierarchical Indexing

Hierarchical indexing is an important feature of pandas that enables you to have multiple (two or more) index levels on an axis. Another way of thinking about it is that it provides a way for you to work with higher dimensional data in a lower dimensional form

data = pd.Series(np.random.uniform(size=9),
 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
 [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a  1    0.413574
   2    0.036274
   3    0.375760
b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64
data["b"]
1    0.084912
3    0.250775
dtype: float64
data["b": "d"]
b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64
data[["b", "d"]]
b  1    0.084912
   3    0.250775
d  2    0.289855
   3    0.670660
dtype: float64
data.unstack()
1 2 3
a 0.413574 0.036274 0.375760
b 0.084912 NaN 0.250775
c 0.413276 0.340771 NaN
d NaN 0.289855 0.670660
data.index.nlevels # Number of indices
2
# Reordering & sorting levels
data.swaplevel()
1  a    0.413574
2  a    0.036274
3  a    0.375760
1  b    0.084912
3  b    0.250775
1  c    0.413276
2  c    0.340771
   d    0.289855
3  d    0.670660
dtype: float64
data.sort_index(level=0)
a  1    0.413574
   2    0.036274
   3    0.375760
b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64
data.sort_index(level=1)
a  1    0.413574
b  1    0.084912
c  1    0.413276
a  2    0.036274
c  2    0.340771
d  2    0.289855
a  3    0.375760
b  3    0.250775
d  3    0.670660
dtype: float64
# Aggregate fns by level
data.groupby(level=0).sum()
a    0.825608
b    0.335687
c    0.754047
d    0.960516
dtype: float64
data.groupby(level=1).sum()
1    0.911762
2    0.666900
3    1.297195
dtype: float64

# Combining and merging datasets

  • pandas.merge Connect rows in DataFrames based on one or more keys. This will be familiar to users of SQL or other relational databases, as it implements database join operations.
  • pandas.concat Concatenate or “stack” objects together along an axis.
  • combine_first Splice together overlapping data to fill in missing values in one object with values from another.
# merge 
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
 "data1": pd.Series(range(7), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "d"],
 "data2": pd.Series(range(3), dtype="Int64")})
df1
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
df2
key data2
0 a 0
1 b 1
2 d 2
pd.merge(df1, df2) # uses overlapping col names as key
# left_on, right_on key params exist
# how params to specify type of join
# right/left_index param to specify to join based on index 
# result order is unspecified
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
# concat
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")
pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64
pd.concat([s1, s2, s3], axis=1)
0 1 2
a 0 <NA> <NA>
b 1 <NA> <NA>
c <NA> 2 <NA>
d <NA> 3 <NA>
e <NA> 4 <NA>
f <NA> <NA> 5
g <NA> <NA> 6
pd.concat([s1, pd.concat([s1, s3])], axis=1, join="inner")
0 1
a 0 0
b 1 1
pd.concat([s1, s1, s3], keys=["one", "two", "three"])
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: Int64
# overlap based on index
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
  index=["f", "e", "d", "c", "b", "a"])
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
  index=["a", "b", "c", "d", "e", "f"])
a.combine_first(b) # a gets preference
a    0.0
b    4.5
c    3.5
d    0.0
e    2.5
f    5.0
dtype: float64

# Reshaping & Pivoting

# Pivoting long to wide format