Pandas

From Torben's Wiki

Getting started

import pandas as pd

Docu

nice cheat sheets can be found here

Create DF

import pandas as pd

# empty
df = pd.DataFrame()

# from 1 dim list
df = pd.DataFrame(data={"Column Name": lst})
# from N dim list, and set name of columns
df = pd.DataFrame(
    data=lst, columns=("num_songs_played", "pct_all_played", "pct_80pct_played")
)
# from multiple lists
data = zip(
    l_days,
    l_2016,
    l_2017,
    l_2018,
    l_2019,
    l_2020,
    strict=True,
)
...

from csv

see [1]

df = pd.read_csv("data.tsv", sep="\t")
# only selected columns and parse date
df = pd.read_csv(
    "data.tsv",
    sep="\t",
    decimal=",",
    usecols=[
        "Date",
        "Deaths_New",
    ],  # only load these columns
    parse_dates=[
        "Date",
    ],  # convert to date object if format is yyyy-mm-dd or dd.mm.yyyy
    index_col="Date",  # choose this column as index
    na_values=["<4"],  # values to treat as NA
)
df = df.rename(columns={"Deaths_New": "Deaths_Covid"})

# convert data upon loading
# read only first 10 chars from 2021-04-29T12:15:00+02:00 -> 2021-04-29
pd_date_converter = lambda x: (x[0:10])  # noqa: E731
df = pd.read_csv(
    "data.tsv",
    sep=",",
    converters={"Datum": pd_date_converter},
    parse_dates=[
        "Datum",  # convert to date object if format is yyyy-mm-dd
    ],
)
# convert datetime to date after loading
for c in ("Buchungstag", "Valutadatum"):
    df2[c] = pd.to_datetime(df2[c], format="%d.%m.%Y").dt.date

from list[str] via read_csv

from io import StringIO

csv_string_io = StringIO("\n".join(lines))
df = pd.read_csv(
    csv_string_io,
    sep=";",
)

from Excel

see [2]

import openpyxl  # pip install openpyxl

df = pd.read_excel("file.xlsx", sheet_name="Sheet1", engine="openpyxl")

df = pd.read_excel(
    "file.xlsx",
    skiprows=4,
    usecols=[
        "ArbPlatz",
        "BezArt",
    ],
)

from dict

see [3]

# keys of d become the index, the values are spread over the named columns
df = pd.DataFrame.from_dict(d, orient="index", columns=["Col1", "Col2"])

flatten dict in list of dict

# l1 is a list of dicts; each dict has a sub-dict called "metadata"
# whose entries are lifted to the top level of the flattened dict
l2 = []
for item in l1:
    flat = item.copy()  # shallow copy, the dicts in l1 are not modified
    # pop from the copy: popping from item would leave "metadata" inside flat
    flat.update(flat.pop("metadata"))
    l2.append(flat)
del l1

df = pd.DataFrame.from_dict(l2)

from array/record/list

see [4]

data = [ {"id":1, "name":"asdf"}, {"id":2, "name":"qwertz"}, ]
df = pd.DataFrame.from_records(data)

from JSON

see [5]

df = pd.read_json("file.json")

from SQLite

import sqlite3
# "?" is a placeholder that is bound to the value passed via params
sql = "SELECT * FROM tab WHERE col2 = ? ORDER BY id"
df = pd.read_sql_query(sql, con, params=(MY_FILTER_VALUE,))

Export Data

to csv

df.to_csv(
    "data.tsv",
    sep="\t",
    lineterminator="\n",
)
# selected columns only
df[ ["Col1", "Col2"] ].to_csv(
    "data.tsv",
    sep="\t",
    lineterminator="\n",
)

to Excel

see [6]

df.to_excel("data.xlsx", index=False)

Export 2 df as sheets in 1 file

# the context manager saves and closes the file on exit,
# so no explicit writer.close() is needed afterwards
with pd.ExcelWriter(path=file_in.with_suffix(".xlsx")) as writer:
    df1.to_excel(writer, sheet_name="Sheet_name_1")
    df2.to_excel(writer, sheet_name="Sheet_name_2")

to HTML

see [7]

df.to_html(
    "out.html", index=False, render_links=False, escape=False, justify="center"
    )
# html encoding of column name only
df["name"] = df["name"].str.encode("ascii", "xmlcharrefreplace").str.decode("utf-8")
# add link to name
df["name"] = "<a href='" + df["url"] + "' target='_blank' >" + df["name"] + "</a>"
# export to html
df[ ["name", "due", "overdue", "priority", "overdue priority"] ].to_html(
    "out.html", index=False, render_links=False, escape=False, justify="center"
)

to JSON

json_data = df.values.tolist()

with Path(pathStatsExport / "ts_types_month.json").open(
    "w", encoding="UTF-8"
) as fh:
    json.dump(
        json_data,
        fp=fh,
        ensure_ascii=False,
        sort_keys=False,
        indent=2,
    )

Modification

Add Row

add row from list to end

idx = df.index[-1] + 1
list_of_values = (...)
df.loc[idx] = list_of_values

add dummy row for missing 1.1.2020 and reindex

df.loc[-1] = "2020-01-01", 0
df.index = df.index + 1  # shifting index
df = df.sort_index()  # sorting by index


Replace na Values

df = df.dropna()
# only in certain columns
df = df.dropna(subset=["power", "heart_rate", "cadence"])
df = df[df["value"].notna()]

Interpolate missing values

# interpolate missing data, e.g. after df.groupby([pd.Grouper(key="time", freq="1min")])
df["kWh_total_in"] = df["kWh_total_in"].interpolate(method="linear")

Calc Diff / Delta of rows

df["delta_prev"] = df["kWh"].diff()
df["delta_next"] = df["kWh"].shift(-1) - df["kWh"]

Read-out Data

Columns

df2 = df [ [ "col1", "col2" ] ]

Rows

for index, row in df.iterrows():
    ...  # placeholder: access the cells of the current row via row["col1"]

Aggregation

Sum Column or Row

# sum of 1 column
sum_cases = df["Cases"].sum()
# sum per column
df_sums = df.sum(axis="columns")
# sum per row / index
df_sums = df.sum(axis="index")

Filtering

Basics

# new notation
df = df.query("power >= 80 & cadence >= 40")
# old notation
df = df[ df["power"] >= 80 ]
df = df[ (df["power"] >= 80) & (df["cadence"] >= 40) ]
# ~ inverts
# list of multiple values
df = df[ ~df["col1"].isin(("A", "B", "C")) ]

filter on index

df = df[df.index >= start_yearweek]

df = df.drop("Summe")

Filtering Dates

# filter a date column via year
df = df[df["Date"].dt.year == 2020]
# filter a date via str
df = df[df["Date"] == "2021-11-13"]
# filter on date via dt
df = df[
    df["RECEIVE_DATE"].dt.date # convert pandas datetime64[ns] to date
    >= (dt.date.today() - dt.timedelta(days=30))
]

Drop Duplicate Rows

df = df.drop_duplicates()

Date Handling

Dates

Convert Column to Date

# let pandas infer the format
df["Date"] = pd.to_datetime(df["Date"])
# or state the format explicitly
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
df["Year"] = df["Date"].dt.year

Week num to Date

# V2
def get_date_of_week_start(year, week_number) -> dt.date:
    """Return the first day (ISO weekday 1) of the given ISO calendar week.

    Both arguments are passed through int(), so numeric strings work as well.
    """
    iso_year, iso_week = int(year), int(week_number)
    return dt.date.fromisocalendar(iso_year, iso_week, 1)
df["date"] = df.apply(
    lambda row: get_date_of_week_start(row["year"], row["week"]),
    axis=1,
)

# V1
# split "week/year" to date
# 51/2021 -> date of Monday (ISO weekday 1) of that week
def convert2date(s: str) -> dt.date:
    """Convert a "week/year" string to the date of ISO weekday 1 of that week."""
    week_part, year_part = s.split("/")
    return dt.date.fromisocalendar(int(year_part), int(week_part), 1)
df["dt"] = df["Kalenderwoche"].apply(lambda x: convert2date(x))

Date to Start of Week

df["week_start"] = df["date"].dt.to_period("W").apply(lambda r: r.start_time)

DateTime

DateTime: remove milliseconds

df["DateTime"] = df["DateTime"].dt.ceil(freq="s")

Remove timezone offset info

# V1: proper approach using tz_convert(None)
df[col] = pd.to_datetime(df[col], format="%Y-%m-%dT%H:%M:%S%z")
df[col] = df[col].dt.tz_convert(None)
# if it throws:
#  AttributeError: Can only use .dt accessor with datetimelike values. Did you mean: 'at'?
# see V2

# V2: simple removing the timezone offset
# Remove the timezone information by replacing the "+01:00", "+02:00", "-02:00",
#  etc. with an empty string
df[col] = df[col].str.replace(r"[+\-]\d{2}:\d{2}.*$", "", regex=True)

# Parse the datetime column without timezone information
df[col] = pd.to_datetime(df[col], format="%Y-%m-%dT%H:%M:%S")
# note: now without the timezone %z info: format="%Y-%m-%dT%H:%M:%S%z"

TimeZone conversion

# convert UTC to local time
df["DateTime"] = (
    df["DateTime"]
    .dt.tz_localize("utc")
    .dt.tz_convert(tz="Europe/Berlin")
    # drop timezone info, since Excel can not handle it
    .dt.tz_localize(None)  
)

TimeStamps

Convert Timestamp to Datetime and use as Index

df["datetime"] = pd.to_datetime(  # convert to datetime
    df["start_ts"],
    unit="s",  # timestamp is in seconds
    utc=True,  # timestamp is in UTC
).dt.tz_convert(  # convert to local TZ
    "Europe/Berlin"
)
df = df.set_index("datetime")

Convert Datetime to Timestamp

df["timestamp"] = df["datetime"].values.astype(np.int64) // 10**9
# or
df["timestamp"] = df["datetime"].astype(int) // 10**9
# // is integer division

Timestamp: subtract first

# calc elapsed time
df["seconds"] = df["timestamp"] - df["timestamp"].iloc[0]

String Modifications

Replace

df["name"] = df["name"].str.strip() # trim whitespaces
df["time"] = df["time"].str.replace(r"\..*$", "Z", regex=True)
# remove word "Probable"
df["col"] = df["col"].replace(
    to_replace=r"^Probable ", value="", regex=True
)
df["Text"] = df["Text"].str.replace(
    r"^Menke, *Torben *",
    "Torben Menke",
    regex=True,
)

split string

df [ [ "week", "year" ] ] = df["Kalenderwoche"].str.split("/", expand=True)

string <-> number

string to float: 1.234,56 -> 1234.56

df["Euro"] = (
    df["Euro"]
    # regex=False: "." has to be taken literally, as a regex it matches any character
    .str.replace(".", "", regex=False)  # drop the thousands separator
    .str.replace(",", ".", regex=False)  # decimal comma -> decimal point
    .astype(float)
)

convert int to str adding leading zeros

df["Sterbewoche"].astype(str).str.zfill(2)

capitalization

df["Wer"] = df["Wer"].apply(lambda x: string.capwords(x))

trim spaces

df["Wer"] = df["Wer"].str.strip()

count word per row and check if more than one

str.count()

df["cnt_Buchungstext"] = df["Text"].str.count("word")
df_search = df[df["cnt_Buchungstext"] != 1]
if len(df_search) > 0:
    print(df_search)
del df_search
df = df.drop(columns=["cnt_Buchungstext"])

Overwrite Data

# overwrite values > 123 by 123
df["col"] = df["col"].clip(upper=123)
 
# negative -> 0
df[df < 0] = 0

# overwrite column data of last 3 weeks by None
df["DateAsDate"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df["DateAsDate"].dt.date >= date_3w, "MyColumn"] = None
 
 # rolling takes NAN values into account, so I need to overwrite them as well
# keep NaN in the rolling column wherever the source column is NaN
df3["Deaths_Covid_roll"] = np.where(
    df3["Deaths_Covid"].isnull(), np.nan, df3["Deaths_Covid_roll"]
)

df.loc[(df["Kat2"] == "") & (df["IBAN"] == "DE02100100100152517108"), "Kat2"] = "Bahn"
df.loc[(df["Kat1"] == "") & (df["Kat2"] == "Bahn"), "Kat1"] = "Mobilität"

# assigning multiple values: one value per listed column, in the same order
df.loc[
    (df["Kat1"] == "") & (df["IBAN"] == "DE02100100100152517108"),
    ["Kat1", "Kat2", "Kat3"],
] = ["Mobilität", "Reisen", "Bahn"]

# str endswith
df.loc[
    (df["RespTime"] == 0.0)
    & (df["Text"].str.endswith("(read timeout=30)")),
    "RespTime",
] = 30.0

Sorting / order by

# sort by column
df = df.sort_values(by=['betten_belegt'], ascending=False)
df = df.sort_values(by=["Buchungstag", "Text"], ignore_index=True)

Top10

df_top_ten = (
    df.sort_values(by="count", ascending=False)
    .head(10)
)


Group By / Top10

df2 = (
    df[ [ "name", "amount" ] ]
    .groupby("name")
    .agg(amountSum=("amount", "sum"), count=("name", "count"))
    .sort_values(by=["amountSum", "count"], ascending=False)
)
print(df2.head(10)) # top10

# resort by count
print(df2.sort_values(by="count", ascending=False).head(10))

Group by Type and Date

# group by hour
# truncate min and sec data
# lowercase "h": the uppercase alias "H" is deprecated in recent pandas versions
df["hour"] = pd.to_datetime(df["time"]).dt.floor("h")
# sum per hour
df2 = df[ ["hour", "amount"] ].groupby("hour").agg(amountSum=("amount", "sum"))
df2.plot()
plt.show()
# Datetime Index Grouping
df = df.groupby(pd.Grouper(freq="5min", offset="00h00min")).max()

# Group by type and date of month start
df_month = df.groupby(["type", pd.Grouper(key="date", freq="MS")]).agg(
    {"id": "count", "minutes": "sum"}
)

# generate index of the desired month-freq:
idx = pd.date_range(
    start=df["date"].min().replace(day=1),
    end=df["date"].max().replace(day=1),
    freq="MS",  # MS = Month Start
)

# add missing months
df_month = df_month.reindex(
    pd.MultiIndex.from_product(
        # unique(): each type must appear only once, otherwise the product repeats rows
        [df_month.index.get_level_values("type").unique(), idx],
        names=["type", "date"],
    )
)

# fill missing by 0 and convert the count back to type int
# (agg() keeps the source column name, so the count is stored in column "id")
df_month = df_month.fillna(0).astype({"id": int})

Group Datetime by Time of Day only

df["Time"] = df["Date"].dt.round("5min").dt.time

df_grouped = df [ ["RespTime", "Time" ] ].groupby("Time").mean()

Group and calculate min,avg,max

df_grouped = (
    df[ ["Time", "RespTime"] ]
    .groupby("Time")
    .agg(
        max=pd.NamedAgg(column="RespTime", aggfunc="max"),
        avg=pd.NamedAgg(column="RespTime", aggfunc="mean"),
        min=pd.NamedAgg(column="RespTime", aggfunc="min"),
    )
)

Column Handling

Renaming

df = df.rename(
     columns = {"invasiv_beatmet": "beatmet",}, 
     errors="raise",
     )

rename column headers by extracting some int values from a string

l2 = []
for col in df.columns:
    year = int(col[0:4])
    week = int(col[5:7])
    l2.append(year * 100 + week)
df.columns = l2

Dropping

drop columns

df = df.drop(columns=["Sterbejahr", "Sterbewoche"])

Change Column Order

df = df.reindex(
    [
        "Wann",
        "Art",
        "Wer",
    ],
    axis="columns",
)
# move certain columns to front, leaving the remaining unchanged
first_columns = ("description", "isin", "side", "quantity", "Price", "amount")
for position, col in enumerate(first_columns):
    # pop removes the column, insert puts it back at its target position
    df.insert(position, col, df.pop(col))

Index Handling

select column as index

df = df.set_index("Date")

move date from index back to column

df = df.reset_index()

rename index

df.index.name = "Date"

reset index to start at 0

df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)

extract cell based on Index

col1_first = df2["col1"].iloc[0]

de_sum = df["col"].loc["Summe"]

text indexes

df = df.set_index("Altersgruppe")

datetime indexes

# select "Date" column as index
df = df.set_index(["Date"])

# convert index to datetime 
df.index = pd.to_datetime(df.index)

# add missing dates
df = df.reindex(
    pd.date_range(df.index.min(), df.index.max(), freq="D"), fill_value=0
)
df.index.name = "date"

# remove timezone offset (for Excel can not handle this)
df.index = df.index.tz_localize(None)
 
# reindex from a fixed start date up to the last date in the index
# and fill missing with 0
date_last = pd.to_datetime(df.index[-1]).date()
idx = pd.date_range("2020-01-01", date_last)
df = df.reindex(idx, fill_value=0)

# add missing dates
df = df.asfreq('D', fill_value=0)
df = df.asfreq(freq="5M")
df = df.sort_index() # needed?

# drop values of column for last 3 weeks
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df.index.date >= date_3w, "Cases"] = None
# or compare the datetime index itself with a Timestamp
# (df.index.date holds datetime.date objects, which do not compare with a Timestamp)
df.loc[df.index < pd.to_datetime("2020-03-01"), "Value"] = None

filter on date index

# drop data prior to 2020
df = df.loc['2020-01-01':]
# alternative:
df = df[df.index >= "2021-01-10"]

Pivot, UnPivot/Melt

TODO: Pivot

melt/unpivot wide table to long table format

df2 = pd.melt(df, id_vars="Date", value_vars=df.columns[1:])  #
df2 = df2.rename(columns={"variable": "Machine", "value": "Status"})

# not working, using melt instead
# df3 = pd.wide_to_long(df, stubnames="Date", i=df.columns[1:], j="Status")

Merge/Concat and Append DFs

Merge/Concat

df_sum = pd.DataFrame()
df_sum = pd.concat(
    [df_sum, df],
    ignore_index=True,
)

Append

# join series of 2 df
# Series.append() was removed in pandas 2.0, pd.concat() is the replacement
df_covid_2020 = pd.DataFrame()
df_covid_2020["Deaths_Covid_2020"] = pd.concat(
    [df1["Deaths_Covid_2020"], df2["Deaths_Covid_2020"]],
    ignore_index=True,
)
# join the columns of 2 df side by side
df = pd.concat([df, df2], axis=1)

# note: this would require all index of df in df2 present:
# df[machine] = df2[machine]


Rolling Average, mean of columns, min, max

df['2016_roll'] = df['2016'].rolling(window=7, min_periods=1).mean().round(1)
df['2017_roll'] = df['2017'].rolling(window=7, min_periods=1).mean().round(1)
df['2018_roll'] = df['2018'].rolling(window=7, min_periods=1).mean().round(1)
df['2019_roll'] = df['2019'].rolling(window=7, min_periods=1).mean().round(1)
df['2020_roll'] = df['2020'].rolling(window=7, min_periods=1).mean().round(1)
# mean value of 4 columns
df['2016_2019_mean'] = df.iloc[:, [1, 2, 3, 4]
                               ].mean(axis=1)  # not column 0 = day
df['2016_2019_mean_roll'] = df['2016_2019_mean'].rolling(
    window=7, min_periods=1).mean().round(1)

df['2016_2019_roll_max'] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)
df['2016_2019_roll_min'] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)

Helpers

def pandas_set_date_index(df, date_column: str, date_format: str = "%Y-%m-%d"):
    """Return a new DataFrame that uses the parsed date column as index.

    The passed DataFrame is left unchanged.

    Args:
        df: DataFrame holding the date column.
        date_column: name of the column to parse and use as index.
        date_format: strptime format of the values in date_column.

    Returns:
        DataFrame indexed by the datetime values; date_column is no longer
        a regular column.
    """
    parsed = pd.to_datetime(df[date_column], format=date_format)
    # assign() works on a copy, so the caller's frame keeps its original column
    return df.assign(**{date_column: parsed}).set_index([date_column])

def pandas_calc_roll_av(df, column: str, days: int = 7):
    """Add a "<column>_roll_av" column holding the rolling mean of column.

    The mean is taken over a window of `days` rows (fewer at the start,
    as min_periods=1) and rounded to one decimal. The column is added to
    the passed DataFrame, which is also returned.
    """
    target = column + "_roll_av"
    rolling_mean = df[column].rolling(window=days, min_periods=1).mean()
    df[target] = rolling_mean.round(1)
    return df

# custom rounding
def custom_round(x: float, base: int = 5) -> int:
    """Round x to a multiple of base using the built-in round()."""
    # from https://stackoverflow.com/questions/40372030/pandas-round-to-the-nearest-n
    steps = round(float(x) / base)  # number of whole base-steps
    return int(steps * base)

df["power_rounded"] = df["power"].apply( lambda x: custom_round(x, base=20) )

Transpose

df = df.transpose()

Leftovers

# when in a function one might get the SettingWithCopyWarning, fix via
df = df.copy()
# copy
df2['Date'] = df0['Date']

# drop 2 rows from the beginning
df = df2.drop([0, 1])

append today using yesterdays value

str_today = dt.datetime.today().strftime("%Y-%m-%d")
ts_today = pd.Timestamp(str_today)
if df.index[-1] != ts_today:
    df.loc[ts_today] = df.loc[df.index[-1], "Count"]

Plotting via Matplotlib

see Matplotlib