# Preprocessing

import pandas as pd
import numpy as np

# Missing data

  • All descriptive statistics ignore missing data by default
  • Values used to stand in for missing data are called sentinel values
  • pandas represents null values with the sentinel NaN (np.nan)
# Build a 7x3 frame of standard-normal draws, then punch NaN holes into it
# with positional assignment (iloc) to demonstrate missing-data handling.
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan  # first 4 rows of column 1
df.iloc[:2, :2] = np.nan  # first 2 rows of columns 0 and 1
df
0 1 2
0 NaN NaN 0.826938
1 NaN NaN 1.200128
2 -0.363270 NaN -0.624431
3 0.519002 NaN 0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606 1.505895 0.283625
6 -0.595061 -1.463362 1.033548
# Checking null
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data.isna()
0    False
1     True
2     True
3    False
dtype: bool
# Filtering
# dropna(how="all") only drops rows that are all NaN
# dropna(thresh=2) keeps only rows that have at least 2 non-NaN values
# Series.dropna() returns only the non-null entries; the original index
# labels are kept, so gaps (1 and 3 here) remain visible.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notna()]
0    1.0
2    3.5
4    7.0
dtype: float64
# Filling data
data.fillna(0)
0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64
df.fillna({0: 0.5, 1: 0})
0 1 2
0 0.500000 0.000000 0.826938
1 0.500000 0.000000 1.200128
2 -0.363270 0.000000 -0.624431
3 0.519002 0.000000 0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606 1.505895 0.283625
6 -0.595061 -1.463362 1.033548
# Reverse the row order; note the stop bound 0 is exclusive, so row 0 is
# dropped (the pasted output below shows rows 6..1 only).
df = df.iloc[:0:-1, :]
# Forward fill: propagate the last valid observation downward.
# fillna(method="ffill") is deprecated in modern pandas; ffill() is the
# equivalent call. (This is carry-forward, not interpolation.)
df.ffill()
0 1 2
6 -0.595061 -1.463362 1.033548
5 -0.101606 1.505895 0.283625
4 -0.418187 -1.448240 -1.227023
3 0.519002 -1.448240 0.469106
2 -0.363270 -1.448240 -0.624431
1 -0.363270 -1.448240 1.200128

# Data transformation

# Removing duplicates
# k1 alternates one/two with a trailing "two", so only the final row
# repeats an earlier (k1, k2) combination.
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data["v1"] = range(7)
data.drop_duplicates(subset=["k1"])  # for a column
k1 k2 v1
0 one 1 0
1 two 1 1
# Mapping
# Series.map with a dict looks each value up in the dict and returns the
# mapped result; any value missing from the dict would become NaN.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

# Lookup table: food name -> source animal.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

# The mapped result aligns with data's index, so it can be assigned
# directly as a new column.
data["animal"] = data["food"].map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 pastrami 6.0 cow
4 corned beef 7.5 cow
5 bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# Replacing values
# replace() substitutes specific values; here -999 is a sentinel for
# missing data (note -1000 is untouched — only exact matches change).
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([1.0, 2.0], np.nan)
0       NaN
1    -999.0
2       NaN
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# Replacing index
data.index = data.index.map(lambda x: f"0{x}")
data
00       1.0
01    -999.0
02       2.0
03    -999.0
04   -1000.0
05       3.0
dtype: float64
data.rename(index={"00": "000"})
000       1.0
01     -999.0
02        2.0
03     -999.0
04    -1000.0
05        3.0
dtype: float64
# Discretization & Binning
# cut(right=False) makes the range [), default is (]
# pd.cut bins values into the given edges (0,25], (25,50], (50,75];
# labels= names the bins instead of showing interval notation.
# .codes are the integer bin indices backing the categorical.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = pd.cut(ages, [0, 25, 50, 75], labels=["small", "big", "biggest"])
print(bins)
print(bins.categories)
print(bins.codes)
['small', 'small', 'small', 'big', 'small', ..., 'big', 'biggest', 'big', 'big', 'big']
Length: 12
Categories (3, object): ['small' < 'big' < 'biggest']
Index(['small', 'big', 'biggest'], dtype='object')
[0 0 0 1 0 0 1 1 2 1 1 1]
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)
[(0.43, 0.64], (0.64, 0.85], (0.0073, 0.22], (0.0073, 0.22], (0.22, 0.43], ..., (0.0073, 0.22], (0.22, 0.43], (0.43, 0.64], (0.0073, 0.22], (0.64, 0.85]]
Length: 20
Categories (4, interval[float64, right]): [(0.0073, 0.22] < (0.22, 0.43] < (0.43, 0.64] < (0.64, 0.85]]
# qcut bins by sample quantiles rather than fixed edges, so each of the
# 4 bins holds (roughly) the same number of observations.
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2) # Uses quartiles
quartiles
[(-0.6, 0.06], (0.66, 3.29], (0.06, 0.66], (0.66, 3.29], (0.06, 0.66], ..., (0.06, 0.66], (-0.6, 0.06], (-3.51, -0.6], (-3.51, -0.6], (0.06, 0.66]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.51, -0.6] < (-0.6, 0.06] < (0.06, 0.66] < (0.66, 3.29]]
# Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data
0 1 2 3
0 0.447450 0.302007 -0.262682 0.467595
1 0.025539 -0.234731 -1.337733 -1.071232
2 -0.230109 0.579962 -0.870994 -1.803677
3 1.411035 0.383448 -0.732203 -0.879131
4 1.050857 0.522194 1.800851 0.117501
... ... ... ... ...
995 1.033074 1.869259 0.864272 0.419260
996 -0.548120 0.836433 1.609818 1.914834
997 0.517092 -1.443731 0.732647 -1.196046
998 0.445024 -0.697722 -0.483329 1.322649
999 1.738151 -0.393678 0.206340 -0.520771

1000 rows × 4 columns

data[(data.abs() > 3).any(axis=1)]
0 1 2 3
60 -1.181639 -0.889499 -1.779437 3.025088
62 3.113506 -0.373270 -0.306986 0.686003
68 1.158020 -0.024131 3.111495 -0.484787
630 1.067924 -1.611424 3.047750 1.521451
631 1.185161 0.222738 -3.039918 1.651147
645 -0.722068 2.187831 -0.620601 -3.071194
655 -0.135742 3.165692 -0.522130 -0.443482
762 -0.138561 -0.169544 0.221776 4.002613
774 3.430154 1.606393 1.160599 -2.437773
778 -1.561205 -0.036121 3.319209 0.235932
833 0.122729 1.523586 -3.516562 0.850086
920 0.024640 0.877622 1.145380 3.358830
# Sampling
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df
0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
2 14 15 16 17 18 19 20
3 21 22 23 24 25 26 27
4 28 29 30 31 32 33 34
# permutation(5) yields a random ordering of 0..4; take() then reorders
# the rows by integer position (sampling without replacement).
sampler = np.random.permutation(5)
print(sampler)
df.take(sampler)
[1 2 3 4 0]
0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
2 14 15 16 17 18 19 20
3 21 22 23 24 25 26 27
4 28 29 30 31 32 33 34
0 0 1 2 3 4 5 6
df.sample(10, replace=True)
0 1 2 3 4 5 6
4 28 29 30 31 32 33 34
3 21 22 23 24 25 26 27
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
2 14 15 16 17 18 19 20
2 14 15 16 17 18 19 20
# Dummy variables
pd.get_dummies(df[1])
1 8 15 22 29
0 1 0 0 0 0
1 0 1 0 0 0
2 0 0 1 0 0
3 0 0 0 1 0
4 0 0 0 0 1
# Extension types
pd.Series(["one", "two", None, "three"], dtype=pd.StringDtype())
0      one
1      two
2     <NA>
3    three
dtype: string
pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
0       1
1       2
2       3
3    <NA>
dtype: Int64
pd.Series([1, 2, 3, None]) # Normal
0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64
# String manipulation
# Building a Series from a dict: keys become the index labels;
# np.nan marks a missing entry for "Wes".
data = {
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
}
data = pd.Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.str.contains("gmail") # Would fail for normal .contains due to NaN
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
data.astype("string")
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string
data.str.count(pat="g")
Dave     2.0
Steve    1.0
Rob      1.0
Wes      NaN
dtype: float64

# Categorical data

Many data systems (for data warehousing, statistical computing, or other uses) have developed specialized approaches for representing data with repeated values for more efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables containing the distinct values and storing the primary observations as integer keys referencing the dimension table

This representation as integers is called the categorical or dictionary-encoded representation. The array of distinct values can be called the categories, dictionary, or levels of the data. The integer values that reference the categories are called the category codes or simply codes.

# Dictionary-encoded representation by hand: values holds integer codes,
# dim holds the distinct categories the codes reference.
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64
dim.take(values)
0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object
# Inspecting categorical internals: astype(pd.CategoricalDtype()) converts
# a Series to categorical, and Series.array exposes the underlying
# Categorical object, which carries .categories (the distinct values)
# and .codes (the integer keys referencing them).
# (The original sketch used df[''] / df.array, which raises on a
# DataFrame — .array is a Series accessor.)
fruit_cat = pd.Series(["apple", "orange", "apple", "apple"]).astype(pd.CategoricalDtype())
fruit_cat.array.categories
fruit_cat.array.codes
# astype('category') converts any Series to the categorical dtype;
# the distinct values become the categories automatically.
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
cat_s
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
# set_categories can widen the category set beyond the observed values;
# 'e' becomes a valid (but so far unused) category.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
cat_s.value_counts()
a    2
b    2
c    2
d    2
dtype: int64
cat_s2.cat.remove_unused_categories()
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s)
a b c d
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0
5 0 1 0 0
6 0 0 1 0
7 0 0 0 1