# Preprocessing

import pandas as pd
import numpy as np

# Missing data

  • All descriptive statistics ignore missing data by default
  • Values used to stand in for missing data are called sentinel values
  • pandas represents null values with the sentinel NaN (np.nan)
# Build a 7x3 frame of standard-normal draws, then punch NaN holes into it
# with positional assignment (iloc) to demonstrate missing-data handling.
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan  # first 4 rows of column 1
df.iloc[:2, :2] = np.nan  # first 2 rows of columns 0 and 1
df
0 1 2
0 NaN NaN 0.826938
1 NaN NaN 1.200128
2 -0.363270 NaN -0.624431
3 0.519002 NaN 0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606 1.505895 0.283625
6 -0.595061 -1.463362 1.033548
# Checking null
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data.isna()
0    False
1     True
2     True
3    False
dtype: bool
# Filtering
# dropna(how="all") only drops rows that are all NaN
# dropna(thresh=2) keeps only rows that have at least 2 non-NaN values
# Series.dropna() returns only the non-null entries; the original index
# labels are kept, so gaps (1 and 3 here) remain visible.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notna()]
0    1.0
2    3.5
4    7.0
dtype: float64
# Filling data
data.fillna(0)
0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64
df.fillna({0: 0.5, 1: 0})
0 1 2
0 0.500000 0.000000 0.826938
1 0.500000 0.000000 1.200128
2 -0.363270 0.000000 -0.624431
3 0.519002 0.000000 0.469106
4 -0.418187 -1.448240 -1.227023
5 -0.101606 1.505895 0.283625
6 -0.595061 -1.463362 1.033548
# Reverse the row order; note the stop bound 0 is exclusive, so row 0 is
# dropped (the pasted output below shows rows 6..1 only).
df = df.iloc[:0:-1, :]
# Forward fill: propagate the last valid observation downward.
# fillna(method="ffill") is deprecated in modern pandas; ffill() is the
# equivalent call. (This is carry-forward, not interpolation.)
df.ffill()
0 1 2
6 -0.595061 -1.463362 1.033548
5 -0.101606 1.505895 0.283625
4 -0.418187 -1.448240 -1.227023
3 0.519002 -1.448240 0.469106
2 -0.363270 -1.448240 -0.624431
1 -0.363270 -1.448240 1.200128

# Data transformation

# Removing duplicates
# k1 alternates one/two with a trailing "two", so only the final row
# repeats an earlier (k1, k2) combination.
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data["v1"] = range(7)
data.drop_duplicates(subset=["k1"])  # for a column
k1 k2 v1
0 one 1 0
1 two 1 1
# Mapping
# Series.map with a dict looks each value up in the dict and returns the
# mapped result; any value missing from the dict would become NaN.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

# Lookup table: food name -> source animal.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

# The mapped result aligns with data's index, so it can be assigned
# directly as a new column.
data["animal"] = data["food"].map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 pastrami 6.0 cow
4 corned beef 7.5 cow
5 bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# Replacing values
# replace() substitutes specific values; here -999 is a sentinel for
# missing data (note -1000 is untouched — only exact matches change).
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([1.0, 2.0], np.nan)
0       NaN
1    -999.0
2       NaN
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# Replacing index
data.index = data.index.map(lambda x: f"0{x}")
data
00       1.0
01    -999.0
02       2.0
03    -999.0
04   -1000.0
05       3.0
dtype: float64
data.rename(index={"00": "000"})
000       1.0
01     -999.0
02        2.0
03     -999.0
04    -1000.0
05        3.0
dtype: float64
# Discretization & Binning
# cut(right=False) makes the range [), default is (]
# pd.cut bins values into the given edges (0,25], (25,50], (50,75];
# labels= names the bins instead of showing interval notation.
# .codes are the integer bin indices backing the categorical.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = pd.cut(ages, [0, 25, 50, 75], labels=["small", "big", "biggest"])
print(bins)
print(bins.categories)
print(bins.codes)
['small', 'small', 'small', 'big', 'small', ..., 'big', 'biggest', 'big', 'big', 'big']
Length: 12
Categories (3, object): ['small' < 'big' < 'biggest']
Index(['small', 'big', 'biggest'], dtype='object')
[0 0 0 1 0 0 1 1 2 1 1 1]
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)
[(0.43, 0.64], (0.64, 0.85], (0.0073, 0.22], (0.0073, 0.22], (0.22, 0.43], ..., (0.0073, 0.22], (0.22, 0.43], (0.43, 0.64], (0.0073, 0.22], (0.64, 0.85]]
Length: 20
Categories (4, interval[float64, right]): [(0.0073, 0.22] < (0.22, 0.43] < (0.43, 0.64] < (0.64, 0.85]]
# qcut bins by sample quantiles rather than fixed edges, so each of the
# 4 bins holds (roughly) the same number of observations.
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2) # Uses quartiles
quartiles
[(-0.6, 0.06], (0.66, 3.29], (0.06, 0.66], (0.66, 3.29], (0.06, 0.66], ..., (0.06, 0.66], (-0.6, 0.06], (-3.51, -0.6], (-3.51, -0.6], (0.06, 0.66]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.51, -0.6] < (-0.6, 0.06] < (0.06, 0.66] < (0.66, 3.29]]
# Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data
0 1 2 3
0 0.447450 0.302007 -0.262682 0.467595
1 0.025539 -0.234731 -1.337733 -1.071232
2 -0.230109 0.579962 -0.870994 -1.803677
3 1.411035 0.383448 -0.732203 -0.879131
4 1.050857 0.522194 1.800851 0.117501
... ... ... ... ...
995 1.033074 1.869259 0.864272 0.419260
996 -0.548120 0.836433 1.609818 1.914834
997 0.517092 -1.443731 0.732647 -1.196046
998 0.445024 -0.697722 -0.483329 1.322649
999 1.738151 -0.393678 0.206340 -0.520771

1000 rows × 4 columns

data[(data.abs() > 3).any(axis=1)]
0 1 2 3
60 -1.181639 -0.889499 -1.779437 3.025088
62 3.113506 -0.373270 -0.306986 0.686003
68 1.158020 -0.024131 3.111495 -0.484787
630 1.067924 -1.611424 3.047750 1.521451
631 1.185161 0.222738 -3.039918 1.651147
645 -0.722068 2.187831 -0.620601 -3.071194
655 -0.135742 3.165692 -0.522130 -0.443482
762 -0.138561 -0.169544 0.221776 4.002613
774 3.430154 1.606393 1.160599 -2.437773
778 -1.561205 -0.036121 3.319209 0.235932
833 0.122729 1.523586 -3.516562 0.850086
920 0.024640 0.877622 1.145380 3.358830
# Sampling
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df
0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
2 14 15 16 17 18 19 20
3 21 22 23 24 25 26 27
4 28 29 30 31 32 33 34
# permutation(5) yields a random ordering of 0..4; take() then reorders
# the rows by integer position (sampling without replacement).
sampler = np.random.permutation(5)
print(sampler)
df.take(sampler)
[1 2 3 4 0]
0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
2 14 15 16 17 18 19 20
3 21 22 23 24 25 26 27
4 28 29 30 31 32 33 34
0 0 1 2 3 4 5 6
df.sample(10, replace=True)
0 1 2 3 4 5 6
4 28 29 30 31 32 33 34
3 21 22 23 24 25 26 27
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
1 7 8 9 10 11 12 13
0 0 1 2 3 4 5 6
0 0 1 2 3 4 5 6
2 14 15 16 17 18 19 20
2 14 15 16 17 18 19 20
# Dummy variables
pd.get_dummies(df[1])
1 8 15 22 29
0 1 0 0 0 0
1 0 1 0 0 0
2 0 0 1 0 0
3 0 0 0 1 0
4 0 0 0 0 1
# Extension types
pd.Series(["one", "two", None, "three"], dtype=pd.StringDtype())
0      one
1      two
2     <NA>
3    three
dtype: string
pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
0       1
1       2
2       3
3    <NA>
dtype: Int64
pd.Series([1, 2, 3, None]) # Normal
0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64
# String manipulation
# Building a Series from a dict: keys become the index labels;
# np.nan marks a missing entry for "Wes".
data = {
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
}
data = pd.Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.str.contains("gmail") # Would fail for normal .contains due to NaN
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
data.astype("string")
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string
data.str.count(pat="g")
Dave     2.0
Steve    1.0
Rob      1.0
Wes      NaN
dtype: float64

# Categorical data

Many data systems (for data warehousing, statistical computing, or other uses) have developed specialized approaches for representing data with repeated values for more efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables containing the distinct values and storing the primary observations as integer keys referencing the dimension table

This representation as integers is called the categorical or dictionary-encoded representation. The array of distinct values can be called the categories, dictionary, or levels of the data. The integer values that reference the categories are called the category codes or simply codes.

# Dictionary-encoded representation by hand: values holds integer codes,
# dim holds the distinct categories the codes reference.
values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64
dim.take(values)
0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object
# Inspecting categorical internals: astype(pd.CategoricalDtype()) converts
# a Series to categorical, and Series.array exposes the underlying
# Categorical object, which carries .categories (the distinct values)
# and .codes (the integer keys referencing them).
# (The original sketch used df[''] / df.array, which raises on a
# DataFrame — .array is a Series accessor.)
fruit_cat = pd.Series(["apple", "orange", "apple", "apple"]).astype(pd.CategoricalDtype())
fruit_cat.array.categories
fruit_cat.array.codes
# astype('category') converts any Series to the categorical dtype;
# the distinct values become the categories automatically.
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
cat_s
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
# set_categories can widen the category set beyond the observed values;
# 'e' becomes a valid (but so far unused) category.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
cat_s.value_counts()
a    2
b    2
c    2
d    2
dtype: int64
cat_s2.cat.remove_unused_categories()
0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s)
a b c d
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4 1 0 0 0
5 0 1 0 0
6 0 0 1 0
7 0 0 0 1