# Data wrangling

import pandas as pd
import numpy as np

# Hierarchical Indexing

Hierarchical indexing is an important feature of pandas that enables you to have multiple (two or more) index levels on an axis. Another way of thinking about it is that it provides a way for you to work with higher dimensional data in a lower dimensional form

data = pd.Series(np.random.uniform(size=9),
 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
 [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    0.413574
   2    0.036274
   3    0.375760
b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64

data["b"]

1    0.084912
3    0.250775
dtype: float64

data["b": "d"]

b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64

data[["b", "d"]]

b  1    0.084912
   3    0.250775
d  2    0.289855
   3    0.670660
dtype: float64

data.unstack()

	1	2	3
a	0.413574	0.036274	0.375760
b	0.084912	NaN	0.250775
c	0.413276	0.340771	NaN
d	NaN	0.289855	0.670660

data.index.nlevels # Number of indices

# Reordering & sorting levels
data.swaplevel()

a    0.413574
a    0.036274
a    0.375760
b    0.084912
b    0.250775
c    0.413276
c    0.340771
   d    0.289855
d    0.670660
dtype: float64

data.sort_index(level=0)

a  1    0.413574
   2    0.036274
   3    0.375760
b  1    0.084912
   3    0.250775
c  1    0.413276
   2    0.340771
d  2    0.289855
   3    0.670660
dtype: float64

data.sort_index(level=1)

a  1    0.413574
b  1    0.084912
c  1    0.413276
a  2    0.036274
c  2    0.340771
d  2    0.289855
a  3    0.375760
b  3    0.250775
d  3    0.670660
dtype: float64

# Aggregate fns by level
data.groupby(level=0).sum()

a    0.825608
b    0.335687
c    0.754047
d    0.960516
dtype: float64

data.groupby(level=1).sum()

  0.911762
  0.666900
  1.297195
dtype: float64

# Combining and merging datasets

pandas.merge Connect rows in DataFrames based on one or more keys. This will be familiar to users of SQL or other relational databases, as it implements database join operations.
pandas.concat Concatenate or “stack” objects together along an axis.
combine_first Splice together overlapping data to fill in missing values in one object with values from another.

# merge 
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
 "data1": pd.Series(range(7), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "d"],
 "data2": pd.Series(range(3), dtype="Int64")})

df1

	key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	a	5
6	b	6

df2

	key	data2
0	a	0
1	b	1
2	d	2

pd.merge(df1, df2) # uses overlapping col names as key
# left_on, right_on key params exist
# how params to specify type of join
# right/left_index param to specify to join based on index 
# result order is unspecified

	key	data1	data2
0	b	0	1
1	b	1	1
2	b	6	1
3	a	2	0
4	a	4	0
5	a	5	0

# concat
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")

pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

pd.concat([s1, s2, s3], axis=1)

	0	1	2
a	0	<NA>	<NA>
b	1	<NA>	<NA>
c	<NA>	2	<NA>
d	<NA>	3	<NA>
e	<NA>	4	<NA>
f	<NA>	<NA>	5
g	<NA>	<NA>	6

pd.concat([s1, pd.concat([s1, s3])], axis=1, join="inner")

	0	1
a	0	0
b	1	1

pd.concat([s1, s1, s3], keys=["one", "two", "three"])

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: Int64

# overlap based on index
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
  index=["f", "e", "d", "c", "b", "a"])
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
  index=["a", "b", "c", "d", "e", "f"])

a.combine_first(b) # a gets preference

a    0.0
b    4.5
c    3.5
d    0.0
e    2.5
f    5.0
dtype: float64

# Reshaping & Pivoting

# Pivoting long to wide format