Preprocessing
import pandas as pd
import numpy as np
Missing data
- All descriptive statistics ignore missing data by default
- pandas marks missing data with a sentinel value: NaN (np.nan); Python's None is treated as missing too
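A minimal sketch showing that both markers register as missing:
s = pd.Series([1.0, np.nan, None, 4.0])
s.isna()      # True for both np.nan and None
s[s.notna()]  # boolean mask keeps only the non-missing values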
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, :2] = np.nan
df
          0         1         2
0       NaN       NaN  0.826938
1       NaN       NaN  1.200128
2 -0.363270       NaN -0.624431
3  0.519002       NaN  0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606  1.505895  0.283625
6 -0.595061 -1.463362  1.033548
# Checking null
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data.isna()
0 False
1 True
2 True
3 False
dtype: bool
# Filtering
# dropna(how="all") drops only rows that are entirely NaN
# dropna(thresh=2) keeps rows with at least 2 non-NaN values (see the sketch below)
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
data[data.notna()]  # equivalent boolean-mask form
0 1.0
2 3.5
4 7.0
dtype: float64
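A small sketch of those options on a DataFrame (values made up for illustration):
frame = pd.DataFrame([[1.0, 6.5, 3.0], [1.0, np.nan, np.nan],
                      [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.0]])
frame.dropna()           # drops every row containing a NaN
frame.dropna(how="all")  # drops only the all-NaN row
frame.dropna(thresh=2)   # keeps rows with at least 2 non-NaN values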
# Filling data
data.fillna(0)
0 1.0
1 0.0
2 3.5
3 0.0
4 7.0
dtype: float64
df.fillna({0: 0.5, 1: 0})  # per-column fill values, keyed by column label
          0         1         2
0  0.500000  0.000000  0.826938
1  0.500000  0.000000  1.200128
2 -0.363270  0.000000 -0.624431
3  0.519002  0.000000  0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606  1.505895  0.283625
6 -0.595061 -1.463362  1.033548
df = df.iloc[:0:-1, :]  # reverse the rows (note row 0 falls outside this slice)
df.ffill()  # forward fill: propagates the last valid value down each column
          0         1         2
6 -0.595061 -1.463362  1.033548
5 -0.101606  1.505895  0.283625
4 -0.418187 -1.448240 -1.227023
3  0.519002 -1.448240  0.469106
2 -0.363270 -1.448240 -0.624431
1 -0.363270 -1.448240  1.200128
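Filling can be bounded, and there is a backward counterpart; a quick sketch:
df.ffill(limit=1)  # propagate each value at most one row forward
df.bfill()         # backward fill, the mirror image of ffill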
Data transformation
# Removing duplicates
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
data
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
data.duplicated()  # True for rows that duplicate an earlier row
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data["v1"] = range(7)
data.drop_duplicates(subset=["k1"])  # deduplicate on column k1 only; first occurrence kept
    k1  k2  v1
0  one   1   0
1  two   1   1
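keep="last" flips which duplicate survives; a quick sketch:
data.drop_duplicates(subset=["k1", "k2"], keep="last")  # keep the last of each duplicate group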
# Mapping
data = pd.DataFrame(
{
"food": [
"bacon",
"pulled pork",
"bacon",
"pastrami",
"corned beef",
"bacon",
"pastrami",
"honey ham",
"nova lox",
],
"ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
}
)
meat_to_animal = {
"bacon": "pig",
"pulled pork": "pig",
"pastrami": "cow",
"corned beef": "cow",
"honey ham": "pig",
"nova lox": "salmon",
}
data["animal"] = data["food"].map(meat_to_animal)
data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     pastrami     6.0     cow
4  corned beef     7.5     cow
5        bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
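map also accepts a function; an equivalent sketch of the dict lookup above:
data["food"].map(lambda x: meat_to_animal[x])  # same result as passing the dict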
# Replacing values
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(-999, np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data.replace([1.0, 2.0], np.nan)
0 NaN
1 -999.0
2 NaN
3 -999.0
4 -1000.0
5 3.0
dtype: float64
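replace also takes a dict for per-value substitutions; a quick sketch:
data.replace({-999: np.nan, -1000: 0})  # different replacement per sentinel value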
# Replacing index
data.index = data.index.map(lambda x: f"0{x}")
data
00 1.0
01 -999.0
02 2.0
03 -999.0
04 -1000.0
05 3.0
dtype: float64
data.rename(index={"00": "000"})
000 1.0
01 -999.0
02 2.0
03 -999.0
04 -1000.0
05 3.0
dtype: float64
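rename likewise accepts a function applied to each label; a sketch using the zero-padded index above:
data.rename(index=lambda x: x.zfill(3))  # pad every label to width 3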
# Discretization & Binning
# cut(right=False) makes intervals left-closed [a, b); the default is right-closed (a, b]
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = pd.cut(ages, [0, 25, 50, 75], labels=["small", "big", "biggest"])
print(bins)
print(bins.categories)
print(bins.codes)
['small', 'small', 'small', 'big', 'small', ..., 'big', 'biggest', 'big', 'big', 'big']
Length: 12
Categories (3, object): ['small' < 'big' < 'biggest']
Index(['small', 'big', 'biggest'], dtype='object')
[0 0 0 1 0 0 1 1 2 1 1 1]
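Counting observations per bin; a quick sketch:
pd.Series(bins).value_counts()  # number of ages falling in each labeled bin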
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)  # 4 equal-width bins; precision=2 limits displayed decimals
[(0.43, 0.64], (0.64, 0.85], (0.0073, 0.22], (0.0073, 0.22], (0.22, 0.43], ..., (0.0073, 0.22], (0.22, 0.43], (0.43, 0.64], (0.0073, 0.22], (0.64, 0.85]]
Length: 20
Categories (4, interval[float64, right]): [(0.0073, 0.22] < (0.22, 0.43] < (0.43, 0.64] < (0.64, 0.85]]
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)  # quantile-based: 4 roughly equal-sized bins (quartiles)
quartiles
[(-0.6, 0.06], (0.66, 3.29], (0.06, 0.66], (0.66, 3.29], (0.06, 0.66], ..., (0.06, 0.66], (-0.6, 0.06], (-3.51, -0.6], (-3.51, -0.6], (0.06, 0.66]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.51, -0.6] < (-0.6, 0.06] < (0.06, 0.66] < (0.66, 3.29]]
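qcut also accepts explicit quantiles between 0 and 1 instead of a bin count; a sketch:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])  # uneven quantile cut points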
# Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data
            0         1         2         3
0    0.447450  0.302007 -0.262682  0.467595
1    0.025539 -0.234731 -1.337733 -1.071232
2   -0.230109  0.579962 -0.870994 -1.803677
3    1.411035  0.383448 -0.732203 -0.879131
4    1.050857  0.522194  1.800851  0.117501
..        ...       ...       ...       ...
995  1.033074  1.869259  0.864272  0.419260
996 -0.548120  0.836433  1.609818  1.914834
997  0.517092 -1.443731  0.732647 -1.196046
998  0.445024 -0.697722 -0.483329  1.322649
999  1.738151 -0.393678  0.206340 -0.520771

[1000 rows x 4 columns]
data[(data.abs() > 3).any(axis=1)]  # rows where any column exceeds 3 in absolute value
            0         1         2         3
60  -1.181639 -0.889499 -1.779437  3.025088
62   3.113506 -0.373270 -0.306986  0.686003
68   1.158020 -0.024131  3.111495 -0.484787
630  1.067924 -1.611424  3.047750  1.521451
631  1.185161  0.222738 -3.039918  1.651147
645 -0.722068  2.187831 -0.620601 -3.071194
655 -0.135742  3.165692 -0.522130 -0.443482
762 -0.138561 -0.169544  0.221776  4.002613
774  3.430154  1.606393  1.160599 -2.437773
778 -1.561205 -0.036121  3.319209  0.235932
833  0.122729  1.523586 -3.516562  0.850086
920  0.024640  0.877622  1.145380  3.358830
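Outliers can be capped in place, with np.sign preserving the sign; a sketch:
data[data.abs() > 3] = np.sign(data) * 3  # clip values outside [-3, 3]
data.describe()                           # min/max now bounded at -3/3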
# Sampling
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df
    0   1   2   3   4   5   6
0   0   1   2   3   4   5   6
1   7   8   9  10  11  12  13
2  14  15  16  17  18  19  20
3  21  22  23  24  25  26  27
4  28  29  30  31  32  33  34
sampler = np.random.permutation(5)
print(sampler)
df.take(sampler)
[1 2 3 4 0]
    0   1   2   3   4   5   6
1   7   8   9  10  11  12  13
2  14  15  16  17  18  19  20
3  21  22  23  24  25  26  27
4  28  29  30  31  32  33  34
0   0   1   2   3   4   5   6
df.sample(n=10, replace=True)  # sample with replacement, so n may exceed the row count
    0   1   2   3   4   5   6
4  28  29  30  31  32  33  34
3  21  22  23  24  25  26  27
0   0   1   2   3   4   5   6
0   0   1   2   3   4   5   6
0   0   1   2   3   4   5   6
1   7   8   9  10  11  12  13
0   0   1   2   3   4   5   6
0   0   1   2   3   4   5   6
2  14  15  16  17  18  19  20
2  14  15  16  17  18  19  20
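For reproducible draws, sample takes random_state; without replace=True it samples without replacement. A sketch:
df.sample(n=3, random_state=42)  # 3 distinct rows, same every run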
# Dummy variables
pd.get_dummies(df[1])  # one indicator column per distinct value in column 1
   1  8  15  22  29
0  1  0   0   0   0
1  0  1   0   0   0
2  0  0   1   0   0
3  0  0   0   1   0
4  0  0   0   0   1
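A handy recipe combines get_dummies with cut to one-hot encode bins; a sketch with made-up draws:
vals = np.random.uniform(size=10)
pd.get_dummies(pd.cut(vals, [0, 0.2, 0.4, 0.6, 0.8, 1.0]))  # one indicator column per interval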
# Extension types
pd.Series(["one", "two", None, "three"], dtype=pd.StringDtype())
0 one
1 two
2 <NA>
3 three
dtype: string
pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
0 1
1 2
2 3
3 <NA>
dtype: Int64
pd.Series([1, 2, 3, None])  # default behavior: upcast to float64, None becomes NaN
0 1.0
1 2.0
2 3.0
3 NaN
dtype: float64
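Existing data can be converted with astype using the string aliases of the extension types; a sketch:
s = pd.Series([1, 2, 3, None])
s.astype("Int64")  # alias for pd.Int64Dtype(); stays integer, missing shows as <NA>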
# String manipulation
data = {
"Dave": "dave@google.com",
"Steve": "steve@gmail.com",
"Rob": "rob@gmail.com",
"Wes": np.nan,
}
data = pd.Series(data)
data
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
dtype: object
data.str.contains("gmail")  # str accessor propagates NaN; a plain elementwise string method would raise on it
Dave False
Steve True
Rob True
Wes NaN
dtype: object
data_as_string = data.astype("string")  # extension string dtype; missing shows as <NA>
data_as_string
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes <NA>
dtype: string
data.str.count("g")  # count occurrences per string (assumed pattern "g"); NaN propagates
Dave 2.0
Steve 1.0
Rob 1.0
Wes NaN
dtype: float64
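The str accessor also handles regular expressions; a sketch pulling the parts of each address apart:
import re
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE).str[0]  # first match tuple; NaN passes through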
Categorical data
Many data systems (for data warehousing, statistical computing, or other uses) have developed specialized approaches for representing data with repeated values for more efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables containing the distinct values and storing the primary observations as integer keys referencing the dimension table. This representation as integers is called the categorical or dictionary-encoded representation. The array of distinct values can be called the categories, dictionary, or levels of the data; the integer values that reference the categories are called the category codes, or simply codes.
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
0 0
1 1
2 0
3 0
4 0
5 1
6 0
7 0
dtype: int64
dim.take(values)  # decode the integer codes back to their values
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
# A Series converted with astype('category') is backed by a pd.Categorical,
# reachable through .array, which carries the categories and integer codes
fruits = dim.take(values).astype('category')
fruits.array.categories  # Index(['apple', 'orange'], dtype='object')
fruits.array.codes       # array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
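Going the other way, a Categorical can be built directly from codes; a quick sketch:
codes = [0, 1, 2, 0, 0, 1]
categories = ['foo', 'bar', 'baz']
pd.Categorical.from_codes(codes, categories)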
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
cat_s
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
cat_s.value_counts()  # counts per category
a 2
b 2
c 2
d 2
dtype: int64
cat_s2.cat.remove_unused_categories()  # drops 'e', which has no observations
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s)
   a  b  c  d
0  1  0  0  0
1  0  1  0  0
2  0  0  1  0
3  0  0  0  1
4  1  0  0  0
5  0  1  0  0
6  0  0  1  0
7  0  0  0  1