pandas
import numpy as np
import pandas as pd
# Used for heterogeneous data, whereas numpy is for homogeneous data
Series
Series - 1D array like obj with associated array of data labels called index
Can be thought of as a fixed length ordered dict
# A default integer index (0..4) is generated automatically
series = pd.Series([1, 2, 3, 4, 5])
series
0 1
1 2
2 3
3 4
4 5
dtype: int64
print(series.array)  # backing ExtensionArray (PandasArray); use .to_numpy() for a plain ndarray
<PandasArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int64
# Explicit string labels; element lookup then works by label
series2 = pd.Series([1], index=["a"])
print(series2)
print(series2.index)
series2["a"]
a 1
dtype: int64
Index(['a'], dtype='object')
1
# Building a Series from a dict: the keys become the index labels
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
ser = pd.Series(sdata)
ser
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
# Change index name
ser.index.name = "hello"  # the index name is shown as a header above the labels in the repr
ser
hello
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
# Check null/NaN
# Can use notna() too
pd.isna(ser)  # element-wise missing-value test; notna() is the inverse
hello
Ohio False
Texas False
Oregon False
Utah False
dtype: bool
DataFrame
Table of data that contains ordered, named collection of columns each of which can be of a different type
Has both row and column index
# Two integer columns built from ranges; rows get the default RangeIndex 0..4
df = pd.DataFrame({"x": list(range(1, 6)), "y": list(range(10, 51, 10))})
df
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
# Column-major dict of equal-length lists; columns= rearranges the column order
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data, columns=["year", "pop", "state"])
frame
|
year |
pop |
state |
0 |
2000 |
1.5 |
Ohio |
1 |
2001 |
1.7 |
Ohio |
2 |
2002 |
3.6 |
Ohio |
3 |
2001 |
2.4 |
Nevada |
4 |
2002 |
2.9 |
Nevada |
5 |
2003 |
3.2 |
Nevada |
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
0 1
1 2
2 3
3 4
4 5
Name: x, dtype: int64
# Locating
# loc is label-based, iloc is position-based
print(df.loc[0])  # row with index label 0 (returned as a Series)
print(df.loc[0, "x"])  # scalar at row label 0, column "x"
print(df.iloc[0])  # first row by position
print(df.iloc[0, 0])  # scalar at position (0, 0)
x 1
y 10
Name: 0, dtype: int64
1
x 1
y 10
Name: 0, dtype: int64
1
# Mutation
# Assigning to a new column label creates the column (length must match the rows)
df["z"] = np.arange(5)
df
|
x |
y |
z |
0 |
1 |
10 |
0 |
1 |
2 |
20 |
1 |
2 |
3 |
30 |
2 |
3 |
4 |
40 |
3 |
4 |
5 |
50 |
4 |
# Deletion
del df["z"]  # del removes a column in place, like deleting a dict key
df
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
|
0 |
1 |
2 |
3 |
4 |
x |
1 |
2 |
3 |
4 |
5 |
y |
10 |
20 |
30 |
40 |
50 |
array([[ 1, 10],
[ 2, 20],
[ 3, 30],
[ 4, 40],
[ 5, 50]], dtype=int64)
# Name both axes; the names appear in the repr header rows
df.index.name = "year"
df.columns.name = "state"
df
state |
x |
y |
year |
|
|
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
Index Objects
Index objects are immutable
Behaves like a fixed size set, has set methods but can contain duplicate labels
RangeIndex(start=0, stop=5, step=1, name='year')
Essential functionality
# Reindexing (can reindex both col, row)
# Sparse integer index; reindex below fills in the missing labels
obj = pd.Series([4.5, 7.2, -5.3], index=[0, 2, 4])
obj
0 4.5
2 7.2
4 -5.3
dtype: float64
obj = obj.reindex(np.arange(6), method="ffill")  # ffill forward-fills values for newly introduced labels
obj
0 4.5
1 4.5
2 7.2
3 7.2
4 -5.3
5 -5.3
dtype: float64
# Dropping from axis
# 4x4 block of the integers 0..15 with state row labels and named columns
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
new = data.drop("three", axis=1)  # axis=1 drops a column; returns a new frame
new
|
one |
two |
four |
Ohio |
0 |
1 |
3 |
Colorado |
4 |
5 |
7 |
Utah |
8 |
9 |
11 |
New York |
12 |
13 |
15 |
new = data.drop("Ohio", axis=0)  # axis=0 (the default) drops a row label
new
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Index selection & filtering
# Series indexing is like np array indexing
# Preference always given to row (if only one index given)
# loc is better because
# it treats integers differently - when passed integers, it fails
# if the index of the DataFrame is not an integer
# regular indexing behaviour differs per datatype
# -1 indices work whereas they result in weird behaviour with normal indices
# helps in chained indexing (a.loc[a.b == 2, "c"])
# loc with a list of labels returns them in the given order
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.loc[["b", "a", "d"]]
b 1.0
a 0.0
d 3.0
dtype: float64
# iloc exists to work exclusively with integers
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.iloc[[1, 0, 3]]  # positions, not labels
b 1.0
a 0.0
d 3.0
dtype: float64
b 1.0
a 0.0
d 3.0
dtype: float64
# slicing works but endpoint is inclusive
obj.loc["a":"c"]  # unlike Python slicing, the end label "c" is included
a 0.0
b 1.0
c 2.0
dtype: float64
# Rebuild the same 4x4 frame for the selection examples below
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int32
|
two |
Ohio |
1 |
Colorado |
5 |
Utah |
9 |
New York |
13 |
# iloc accepts boolean arrays as positional masks, but not label-indexed boolean Series (use loc for those)
data.iloc[1:2, 2:3]  # positional slices are endpoint-exclusive: row 1, column 2 only — NOTE(review): the captured output below shows a larger table and does not match this slice
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Data alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
s1 + s2  # aligns on the union of labels; labels present in only one operand yield NaN
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
a 5.2
c 1.1
d 4.4
e 0.0
f 5.0
g 4.1
dtype: float64
# Function application and mapping
np.abs(data)  # NumPy ufuncs apply element-wise and preserve the labels
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.apply(lambda x: x.max() - x.min())  # called once per column (axis=0 is the default)
one 12
two 12
three 12
four 12
dtype: int64
data.apply(lambda x: x.max() - x.min(), axis=1)  # axis=1 calls the function once per row
Ohio 3
Colorado 3
Utah 3
New York 3
dtype: int64
# Sorting
data.sort_index()  # sort rows by index label; returns a new frame
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
New York |
12 |
13 |
14 |
15 |
Ohio |
0 |
1 |
2 |
3 |
Utah |
8 |
9 |
10 |
11 |
data.sort_values(by=["one"])  # sort rows by the values in column "one"
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Ranking
s1.rank(method="first")  # ties broken by order of appearance in the data
a 4.0
c 1.0
d 3.0
e 2.0
dtype: float64
Desc stats
# Frame with missing values for the reduction/statistics examples
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
df
|
one |
two |
a |
1.40 |
NaN |
b |
7.10 |
-4.5 |
c |
NaN |
NaN |
d |
0.75 |
-1.3 |
one 9.25
two -5.80
dtype: float64
one 3.083333
two -2.900000
dtype: float64
# returns index of max value
df.idxmax()  # label of the maximum per column; NaNs are skipped
one b
two d
dtype: object
|
one |
two |
a |
1.40 |
NaN |
b |
8.50 |
-4.5 |
c |
NaN |
NaN |
d |
9.25 |
-5.8 |
|
one |
two |
count |
3.000000 |
2.000000 |
mean |
3.083333 |
-2.900000 |
std |
3.493685 |
2.262742 |
min |
0.750000 |
-4.500000 |
25% |
1.075000 |
-3.700000 |
50% |
1.400000 |
-2.900000 |
75% |
4.250000 |
-2.100000 |
max |
7.100000 |
-1.300000 |
|
one |
two |
one |
12.205833 |
-10.16 |
two |
-10.160000 |
5.12 |
|
one |
two |
one |
1.0 |
-1.0 |
two |
-1.0 |
1.0 |
# Membership
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj.value_counts()  # frequency of each distinct value, sorted descending
c 3
a 3
b 2
d 1
dtype: int64
Loading data
# index_col can be multiple cols to make grouped hierarchical indices
# sep is to separate contiguous data into cols
# skiprows skips lines in the data
# na_values defines which strings are to be treated as null
# nrows reads only n rows
# chunksize returns an iterator that yields the file in chunks of n rows
# sep may be a regular expression; use a raw string so "\s" is not parsed
# as an (invalid) string escape — plain "\s+" raises SyntaxWarning on 3.12+
pd.read_csv(
    "",
    names=["a", "b"],
    index_col="a",
    sep=r"\s+",
    skiprows=[1, 2, 4],
    na_values=["NA"],
    nrows=10,
)
# na_rep replaces missing values (NaN), which would otherwise be written
# as empty fields, with the given string
df.to_csv("", na_rep="NULL VALUE")