pandas
import numpy as np
import pandas as pd
# Used for heterogeneous data, whereas numpy is for homogeneous data
Series
Series - 1D array like obj with associated array of data labels called index
Can be thought of as a fixed length ordered dict
# A default integer index (0..4) is generated automatically
series = pd.Series([1, 2, 3, 4, 5])
series
0 1
1 2
2 3
3 4
4 5
dtype: int64
print(series.array)  # backing ExtensionArray (PandasArray); use .to_numpy() for a plain ndarray
<PandasArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int64
# Explicit string labels; element lookup then works by label
series2 = pd.Series([1], index=["a"])
print(series2)
print(series2.index)
series2["a"]
a 1
dtype: int64
Index(['a'], dtype='object')
1
# Building a Series from a dict: the keys become the index labels
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
ser = pd.Series(sdata)
ser
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
# Change index name
ser.index.name = "hello"  # the index name is shown as a header above the labels in the repr
ser
hello
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
# Check null/NaN
# Can use notna() too
pd.isna(ser)  # element-wise missing-value test; notna() is the inverse
hello
Ohio False
Texas False
Oregon False
Utah False
dtype: bool
DataFrame
Table of data that contains ordered, named collection of columns each of which can be of a different type
Has both row and column index
# Two integer columns built from ranges; rows get the default RangeIndex 0..4
df = pd.DataFrame({"x": list(range(1, 6)), "y": list(range(10, 51, 10))})
df
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
# Column-major dict of equal-length lists; columns= rearranges the column order
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data, columns=["year", "pop", "state"])
frame
|
year |
pop |
state |
0 |
2000 |
1.5 |
Ohio |
1 |
2001 |
1.7 |
Ohio |
2 |
2002 |
3.6 |
Ohio |
3 |
2001 |
2.4 |
Nevada |
4 |
2002 |
2.9 |
Nevada |
5 |
2003 |
3.2 |
Nevada |
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
0 1
1 2
2 3
3 4
4 5
Name: x, dtype: int64
# Locating
# loc is label-based, iloc is position-based
print(df.loc[0])  # row with index label 0 (returned as a Series)
print(df.loc[0, "x"])  # scalar at row label 0, column "x"
print(df.iloc[0])  # first row by position
print(df.iloc[0, 0])  # scalar at position (0, 0)
x 1
y 10
Name: 0, dtype: int64
1
x 1
y 10
Name: 0, dtype: int64
1
# Mutation
# Assigning to a new column label creates the column (length must match the rows)
df["z"] = np.arange(5)
df
|
x |
y |
z |
0 |
1 |
10 |
0 |
1 |
2 |
20 |
1 |
2 |
3 |
30 |
2 |
3 |
4 |
40 |
3 |
4 |
5 |
50 |
4 |
# Deletion
del df["z"]  # del removes a column in place, like deleting a dict key
df
|
x |
y |
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
|
0 |
1 |
2 |
3 |
4 |
x |
1 |
2 |
3 |
4 |
5 |
y |
10 |
20 |
30 |
40 |
50 |
array([[ 1, 10],
[ 2, 20],
[ 3, 30],
[ 4, 40],
[ 5, 50]], dtype=int64)
# Name both axes; the names appear in the repr header rows
df.index.name = "year"
df.columns.name = "state"
df
state |
x |
y |
year |
|
|
0 |
1 |
10 |
1 |
2 |
20 |
2 |
3 |
30 |
3 |
4 |
40 |
4 |
5 |
50 |
Index Objects
Index objects are immutable
Behaves like a fixed size set, has set methods but can contain duplicate labels
RangeIndex(start=0, stop=5, step=1, name='year')
Essential functionality
# Reindexing (can reindex both col, row)
# Sparse integer index; reindex below fills in the missing labels
obj = pd.Series([4.5, 7.2, -5.3], index=[0, 2, 4])
obj
0 4.5
2 7.2
4 -5.3
dtype: float64
obj = obj.reindex(np.arange(6), method="ffill")  # ffill forward-fills values for newly introduced labels
obj
0 4.5
1 4.5
2 7.2
3 7.2
4 -5.3
5 -5.3
dtype: float64
# Dropping from axis
# 4x4 block of the integers 0..15 with state row labels and named columns
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
new = data.drop("three", axis=1)  # axis=1 drops a column; returns a new frame
new
|
one |
two |
four |
Ohio |
0 |
1 |
3 |
Colorado |
4 |
5 |
7 |
Utah |
8 |
9 |
11 |
New York |
12 |
13 |
15 |
new = data.drop("Ohio", axis=0)  # axis=0 (the default) drops a row label
new
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Index selection & filtering
# Series indexing is like np array indexing
# Preference always given to row (if only one index given)
# loc is better because
# it treats integers differently - when passed integers, it fails
# if the index of the DataFrame is not an integer
# regular indexing behaviour differs per datatype
# -1 indices work whereas they result in weird behaviour with normal indices
# helps in chained indexing (a.loc[a.b == 2, "c"])
# loc with a list of labels returns them in the given order
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.loc[["b", "a", "d"]]
b 1.0
a 0.0
d 3.0
dtype: float64
# iloc exists to work exclusively with integers
obj = pd.Series(np.arange(4.0), index=["a", "b", "c", "d"])
obj.iloc[[1, 0, 3]]  # positions, not labels
b 1.0
a 0.0
d 3.0
dtype: float64
b 1.0
a 0.0
d 3.0
dtype: float64
# slicing works but endpoint is inclusive
obj.loc["a":"c"]  # unlike Python slicing, the end label "c" is included
a 0.0
b 1.0
c 2.0
dtype: float64
# Rebuild the same 4x4 frame for the selection examples below
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int32
|
two |
Ohio |
1 |
Colorado |
5 |
Utah |
9 |
New York |
13 |
# iloc accepts boolean arrays as positional masks, but not label-indexed boolean Series (use loc for those)
data.iloc[1:2, 2:3]  # positional slices are endpoint-exclusive: row 1, column 2 only — NOTE(review): the captured output below shows a larger table and does not match this slice
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Data alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
s1 + s2  # aligns on the union of labels; labels present in only one operand yield NaN
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
a 5.2
c 1.1
d 4.4
e 0.0
f 5.0
g 4.1
dtype: float64
# Function application and mapping
np.abs(data)  # NumPy ufuncs apply element-wise and preserve the labels
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.apply(lambda x: x.max() - x.min())  # called once per column (axis=0 is the default)
one 12
two 12
three 12
four 12
dtype: int64
data.apply(lambda x: x.max() - x.min(), axis=1)  # axis=1 calls the function once per row
Ohio 3
Colorado 3
Utah 3
New York 3
dtype: int64
# Sorting
data.sort_index()  # sort rows by index label; returns a new frame
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
New York |
12 |
13 |
14 |
15 |
Ohio |
0 |
1 |
2 |
3 |
Utah |
8 |
9 |
10 |
11 |
data.sort_values(by=["one"])  # sort rows by the values in column "one"
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
# Ranking
s1.rank(method="first")  # ties broken by order of appearance in the data
a 4.0
c 1.0
d 3.0
e 2.0
dtype: float64
Desc stats
# Frame with missing values for the reduction/statistics examples
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
df
|
one |
two |
a |
1.40 |
NaN |
b |
7.10 |
-4.5 |
c |
NaN |
NaN |
d |
0.75 |
-1.3 |
one 9.25
two -5.80
dtype: float64
one 3.083333
two -2.900000
dtype: float64
# returns index of max value
df.idxmax()  # label of the maximum per column; NaNs are skipped
one b
two d
dtype: object
|
one |
two |
a |
1.40 |
NaN |
b |
8.50 |
-4.5 |
c |
NaN |
NaN |
d |
9.25 |
-5.8 |
|
one |
two |
count |
3.000000 |
2.000000 |
mean |
3.083333 |
-2.900000 |
std |
3.493685 |
2.262742 |
min |
0.750000 |
-4.500000 |
25% |
1.075000 |
-3.700000 |
50% |
1.400000 |
-2.900000 |
75% |
4.250000 |
-2.100000 |
max |
7.100000 |
-1.300000 |
|
one |
two |
one |
12.205833 |
-10.16 |
two |
-10.160000 |
5.12 |
|
one |
two |
one |
1.0 |
-1.0 |
two |
-1.0 |
1.0 |
# Membership
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
obj.value_counts()  # frequency of each distinct value, sorted descending
c 3
a 3
b 2
d 1
dtype: int64
Loading data
# index_col can be multiple cols to make grouped hierarchical indices
# sep is to separate contiguous data into cols
# skiprows skips lines in the data
# na_values defines which strings are to be treated as null
# nrows reads only n rows
# chunksize returns an iterator that yields the file in chunks of n rows
# sep may be a regular expression; use a raw string so "\s" is not parsed
# as an (invalid) string escape — plain "\s+" raises SyntaxWarning on 3.12+
pd.read_csv(
    "",
    names=["a", "b"],
    index_col="a",
    sep=r"\s+",
    skiprows=[1, 2, 4],
    na_values=["NA"],
    nrows=10,
)
# na_rep replaces missing values (NaN), which would otherwise be written
# as empty fields, with the given string
df.to_csv("", na_rep="NULL VALUE")