# pandas

import numpy as np
import pandas as pd

# Used for heterogeneous (mixed-type) data, whereas NumPy is for homogeneous data

# Series

Series - a 1D array-like object with an associated array of data labels, called its index
Can be thought of as a fixed-length, ordered dict mapping index labels to values

series = pd.Series([1, 2, 3, 4, 5])
series
0    1
1    2
2    3
3    4
4    5
dtype: int64
print(series.array)  # .array returns a PandasArray (extension array) wrapping the values, not a plain np array
<PandasArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int64
series2 = pd.Series([1], index=["a"])
print(series2)
print(series2.index)
series2["a"]
a    1
dtype: int64
Index(['a'], dtype='object')





1
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
ser = pd.Series(sdata)
ser
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
ser.to_dict()
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
# Change index name
ser.index.name = "hello"
ser
hello
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
# Check null/NaN
# Can use notna() too
pd.isna(ser)
hello
Ohio      False
Texas     False
Oregon    False
Utah      False
dtype: bool

# DataFrame

A table of data containing an ordered, named collection of columns, each of which can be of a different type
Has both a row index and a column index

df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [10, 20, 30, 40, 50]})
df
x y
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data, columns=["year", "pop", "state"])  # Rearrange
frame
year pop state
0 2000 1.5 Ohio
1 2001 1.7 Ohio
2 2002 3.6 Ohio
3 2001 2.4 Nevada
4 2002 2.9 Nevada
5 2003 3.2 Nevada
df.head()
df.tail()
x y
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50
# Retrieval
df["x"]
df.x
0    1
1    2
2    3
3    4
4    5
Name: x, dtype: int64
# Locating
print(df.loc[0])
print(df.loc[0, "x"])
print(df.iloc[0])
print(df.iloc[0, 0])
x     1
y    10
Name: 0, dtype: int64
1
x     1
y    10
Name: 0, dtype: int64
1
# Mutation
df["z"] = np.arange(5)
df
x y z
0 1 10 0
1 2 20 1
2 3 30 2
3 4 40 3
4 5 50 4
# Deletion
del df["z"]
df
x y
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50
# Transpose
df.T
0 1 2 3 4
x 1 2 3 4 5
y 10 20 30 40 50
df.to_numpy()
array([[ 1, 10],
       [ 2, 20],
       [ 3, 30],
       [ 4, 40],
       [ 5, 50]], dtype=int64)
df.index.name = "year"
df.columns.name = "state"
df
state x y
year
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50

# Index Objects

Index objects are immutable
Behaves like a fixed-size set and supports set operations, but unlike a set it can contain duplicate labels

df.index
RangeIndex(start=0, stop=5, step=1, name='year')

# Essential functionality

# Reindexing (can reindex both col, row)
obj = pd.Series([4.5, 7.2, -5.3], index=[0, 2, 4])
obj
0    4.5
2    7.2
4   -5.3
dtype: float64
obj = obj.reindex(np.arange(6), method="ffill")  # ffill forward-fills the new labels from the last valid value
obj
0    4.5
1    4.5
2    7.2
3    7.2
4   -5.3
5   -5.3
dtype: float64
# Dropping from axis
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
new = data.drop("three", axis=1)
new
one two four
Ohio 0 1 3
Colorado 4 5 7
Utah 8 9 11
New York 12 13 15
new = data.drop("Ohio", axis=0)
new
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
# Index selection & filtering
# Series indexing is like np array indexing

# Note: plain df[key] with a single label selects a COLUMN; a single label in df.loc[key] selects a row

# loc is better because
#   it treats integers consistently - when passed integers, it fails loudly
#       if the index of the DataFrame is not an integer,
#       whereas plain [] indexing behaviour differs per index datatype
#   negative positions like -1 only work via iloc; with plain [] they give
#       confusing behaviour depending on the index type
#   helps in chained selection/assignment (a.loc[a.b == 2, "c"])
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.loc[["b", "a", "d"]]
b    1.0
a    0.0
d    3.0
dtype: float64
# iloc exists to work exclusively with integers
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.iloc[[1, 0, 3]]
b    1.0
a    0.0
d    3.0
dtype: float64
obj[[1, 0, 3]]  # NOTE: positional [] indexing on a labelled Series is deprecated in newer pandas - prefer iloc
b    1.0
a    0.0
d    3.0
dtype: float64
# slicing works but endpoint is inclusive
obj.loc["a":"c"]
a    0.0
b    1.0
c    2.0
dtype: float64
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data["two"]
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
data.loc[:, ["two"]]
two
Ohio 1
Colorado 5
Utah 9
New York 13
# iloc rejects a boolean Series (no index alignment); modern pandas does accept plain boolean numpy arrays in iloc
data.iloc[1:2, 2:3]
three
Colorado 6
data[data["three"] > 5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
# Data alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
s1 + s2
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
s1.add(s2, fill_value=1)
a    5.2
c    1.1
d    4.4
e    0.0
f    5.0
g    4.1
dtype: float64
# Function application and mapping
np.abs(data)
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data.apply(lambda x: x.max() - x.min())
one      12
two      12
three    12
four     12
dtype: int64
data.apply(lambda x: x.max() - x.min(), axis=1)
Ohio        3
Colorado    3
Utah        3
New York    3
dtype: int64
# Sorting
data.sort_index()
one two three four
Colorado 4 5 6 7
New York 12 13 14 15
Ohio 0 1 2 3
Utah 8 9 10 11
data.sort_values(by=["one"])
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
# Ranking
s1.rank(method="first")
a    4.0
c    1.0
d    3.0
e    2.0
dtype: float64

# Desc stats

df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum()
one    9.25
two   -5.80
dtype: float64
df.mean()
one    3.083333
two   -2.900000
dtype: float64
# idxmax returns the index label of the (first) maximum value in each column, skipping NaN
df.idxmax()
one    b
two    d
dtype: object
df.cumsum()
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
df.describe()
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
df.cov()
one two
one 12.205833 -10.16
two -10.160000 5.12
df.corr()
one two
one 1.0 -1.0
two -1.0 1.0
# Membership
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj.value_counts()
c    3
a    3
b    2
d    1
dtype: int64
obj[obj.isin(["b"])]
5    b
6    b
dtype: object

# Loading data

# index_col can be a list of columns to build a grouped hierarchical (MultiIndex) index
# sep is the field delimiter used to split each line into columns; can be a regex such as r"\s+"
# skiprows skips the given line numbers in the file
# na_values lists extra strings to be treated as NaN/null
# nrows reads only the first n rows of the file
# chunksize reads the file in pieces of n rows, returning an iterator of DataFrames
pd.read_csv(
    "", names=["a", "b"], index_col="a", sep="\s+", skiprows=[1, 2, 4], na_values=["NA"], nrows=10
)
# na_rep is the string written in place of missing (NaN) values
df.to_csv("", na_rep="NULL VALUE")