Pandas

From Torben's Wiki

Pandas

import pandas as pd

create

# empty
df = pd.DataFrame()

# from 1 dim list
df = pd.DataFrame(data={'Deaths_Covid_2020': l})
# from N dim list, and set name of columns
df = pd.DataFrame(data=l_results, columns=('num_songs_played', 'pct_all_played', 'pct_80pct_played'))
# from multiple lists
data = zip(l_dates, l_deaths2016, l_deaths2017,
           l_deaths2018, l_deaths2019, l_deaths2020)
df = pd.DataFrame(data, columns=['Day', '2016', '2017',
                                 '2018', '2019', '2020'])

from csv

see [1]

df = pd.read_csv('data/de-states/de-state-DE-total.tsv', sep="\t")
# only selected columns and parse date 
df = pd.read_csv('data/de-states/de-state-DE-total.tsv', 
     sep="\t", 
     decimal=",",
     usecols=["Date", "Deaths_New",], # only load these columns
     parse_dates=["Date",],           # convert to date object if format is yyyy-mm-dd or dd.mm.yyyy
     index_col="Date",                # choose this column as index
     # na_values=["<4",]              # values to treat as NA
     )
df.rename(columns={"Deaths_New": "Deaths_Covid"}, inplace = True)

# convert data upon loading
# read only first 10 chars from 2021-04-29T12:15:00+02:00 -> 2021-04-29
def pd_date_converter(x: str) -> str:
    """Keep only the leading 'yyyy-mm-dd' part of an ISO timestamp string."""
    # PEP 8 (E731): prefer a def over assigning a lambda to a name
    return x[0:10]
df = pd.read_csv(
    filename,
    sep=",",
    converters={"Datum": pd_date_converter},
    parse_dates=[
        "Datum", # # convert to date object if format is yyyy-mm-dd
    ],
)

# convert datetime to date after loading
for c in ("Buchungstag", "Valutadatum"):
    df2[c] = pd.to_datetime(df2[c], format="%d.%m.%Y").dt.date

from Excel

see [2]

import openpyxl # pip install openpyxl

df = pd.read_excel(
        excelFile, sheet_name="COVID_Todesfälle_KW_AG10", engine="openpyxl"
    )

df = pd.read_excel(
    "in/PP_APLATZ.xlsx",
    skiprows=4,
    usecols=[
        "ArbPlatz",
        "BezArt",
    ],
)

from dict

see [3]

d = {}
for col in df_rki.columns:
    d[col] = df_rki[col].sum()
df = pd.DataFrame.from_dict(d, orient="index", columns=["Covid_Tote"])

# deep copy 
df2 = df.copy()

from array/record/list

see [4]

read JSON

see [5]

export

to csv

df.to_csv(
    "data/de-mortality-timeseries.tsv",
    sep="\t",
    line_terminator="\n",
)
# selected columns only
df[ ["Deaths", "Deaths_roll_av"] ].to_csv(
    "data/de-mortality-timeseries.tsv",
    sep="\t",
    line_terminator="\n",
)

to Excel

see [6]

df.to_excel("data.xlsx", index=False)

Export 2 df as sheets in 1 file

with pd.ExcelWriter('output.xlsx') as writer:  
    df1.to_excel(writer, sheet_name='Sheet_name_1')
    df2.to_excel(writer, sheet_name='Sheet_name_2')

to HTML

see [7]

Add row

# add dummy row for missing 1.1.2020
df.loc[-1] = "2020-01-01", 0
df.index = df.index + 1  # shifting index
df = df.sort_index()  # sorting by index

Loop over rows

for index, row in df.iterrows():

selecting

# select columns
df = df [ list_of_column_names ]
df = df [ [ "date", "gemeindeschluessel", "betten_frei", "betten_belegt" ] ]

substring

df["Was"] = df["Was"].str.slice(0, 4)

sum

# calc sum of row
sum_cases = df["Cases"].sum()
# sum per index
df_sums = df.sum(axis="index")
# sum per column
df_sums = df.sum(axis="columns")

filter

# filter on column value via str
df = df[df["Date"] == "2021-11-13"]
df = df[~(df["Date"] == "2021-11-13")] # ~ -> invert

# filter on date column via year
df["Date"].dt.year == 2020

# multiple
df = df[~df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]
# filter on date
df = df[
    df["RECEIVE_DATE"].dt.date # convert pandas datetime64[ns] to date
    >= (dt.date.today() - dt.timedelta(days=30))
]

# filter on 2 columns
df2 = df[(df["type"] == "SleepAnalysis") & (df["sourceName"] == "T-iP8")]

# filter column value based on list
df = df[~((df["Sg"] == 19) & (df["Usg"].isin([0, 1, 2, 3])))]
df = df[df["gemeindeschluessel"].isin(l_lkids)]
df = df[df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]
# is not in
df = df[~df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]

# remove data from latest week, as it might not be complete
df = df[df["YearWeek"] < df["YearWeek"].max()]

# drop value=na rows
df = df[df["value"].notna()]

Duplicates

df = df.drop_duplicates()

convert to date

# convert date_str to date
for c in ("DATE_DRAW", "RECEIVE_DATE", "PROCESSING_DATE"):
    df[c] = pd.to_datetime(df[c], format="%Y-%m-%d")

# split "week/year" to date
# 51/2021 -> date of sunday
def convert2date(s: str) -> dt.date:
    """Convert a 'week/year' string (e.g. '51/2021') to the Sunday of that ISO week.

    Note: the original code bound the week to a variable named `year` and vice
    versa, then passed them in swapped order, which was accidentally correct.
    Names are fixed here; behavior is unchanged.
    """
    week, year = s.split("/")
    # fromisocalendar signature is (year, week, weekday); 7 = Sunday
    date = dt.date.fromisocalendar(int(year), int(week), 7)
    return date

df["dt"] = df["Kalenderwoche"].apply(lambda x: convert2date(x))

df["Year"] = df["Date"].dt.year

# Remove timezone offset info 
df["Date"] = df["Date"].dt.tz_localize(tz=None)

convert Pandas Timestamp to Python datetime

def convert_pd_timestamp_to_dt(
    pd_timestamp: pd.Timestamp,
) -> dt.datetime:
    """Turn a pandas Timestamp into a plain Python datetime object."""
    return pd_timestamp.to_pydatetime()

df["Date"] = df["Timestamp"].apply(convert_pd_timestamp_to_dt)

convert date to timestamp

df["timestamp"] = df["date"].values.astype(np.int64) // 10**9

TimeZone conversion

# convert UTC to local time
df["Date"] = (
    df["Date"]
    .dt.tz_localize("utc")
    .dt.tz_convert(tz="Europe/Berlin")
    .dt.tz_localize(None)  # drop timezone info, since Excel can not handle it
)

or

# convert UTC to local time
from zoneinfo import ZoneInfo
TZ_ES = ZoneInfo("Europe/Madrid")
df["Date"] = df["Date"].tz_convert(tz=TZ_ES)

overwrite data

# overwrite large values
df["time to fall asleep"].where(df["time to fall asleep"] <= 100, 100, inplace=True)
# overwrite column data of last 3 weeks by None
df["DateAsDate"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df["DateAsDate"].dt.date >= date_3w, "MyColumn"] = None

# overwrite column value based on date index
df.loc[df.index < pd.Timestamp("2020-03-01"), "InzidenzChange"] = None

 # rolling takes NAN values into account, so I need to overwrite them as well
df3["Deaths_Covid_roll"] = np.where(
    df3["Deaths_Covid"].isnull(), np.nan, df3["Deaths_Covid_roll"]
)

# replace na by 0
df = df.fillna(0)

# remove word "Probable" from scorpio_call for better clustering
df["scorpio_call"] = df["scorpio_call"].replace(
    to_replace=r"^Probable ", value="", regex=True
)

change value based on condition

df.loc[(df["Kat2"] == "") & (df["IBAN"] == "DE02100100100152517108"), "Kat2"] = "Bahn"
df.loc[(df["Kat1"] == "") & (df["Kat2"] == "Bahn"), "Kat1"] = "Mobilität"

# assigning multiple values
df.loc[
    (df["Kat1"] == "") & (df["IBAN"] == "DE02100100100152517108"),
    ["Kat1", "Kat2", "Kat3"],
] = ["Mobilität", "Reisen", "Bahn"]

split str date

df [ [ "week", "year" ] ] = df["Kalenderwoche"].str.split("/", expand=True)

Sorting / order by

# sort by column
df = df.sort_values(by=['betten_belegt'], ascending=False)

Group By

# group and count
df_lineages = (
    df_all_data.groupby(["lineage", "RECEIVE_DATE"]).size().reset_index(name="count")
)

# group and sum all other columns
df = df.groupby(["YearWeek"]).sum()
df = df.groupby(['Impfdatum'])['Anzahl'].sum().reset_index()

df = df.groupby(['date', 'bundesland']).agg(
  {'faelle_covid_aktuell_invasiv_beatmet': 'sum',
   'betten_ges': 'sum'}
)

df_top_ten = (
    df.groupby("myColumn")
    .sum() # sums over remaining column "count"
    .sort_values(by="count", ascending=False)
    .head(10)
)

Datetime Index Grouping

df = df.groupby(pd.Grouper(freq="5min", offset="00h00min")).max()

Column Handling

df = df.rename(
     columns = {"invasiv_beatmet": "beatmet",}, 
     errors="raise",
     )

# convert int to str adding leading zeros
df["Sterbewoche"].astype(str).str.zfill(2)

drop columns

df.drop(columns=["Sterbejahr", "Sterbewoche"], inplace=True)

sum up over all columns

sum_cases = 0
for col in df.columns:
    sum_cases += df[col].sum()

Column Header Handling

# rename column headers by extracting some int values from a string
l2 = []
for col in df.columns:
    year = int(col[0:4])
    week = int(col[5:7])
    l2.append(year * 100 + week)
df.columns = l2

Change Column Order

df = df.reindex(
    [
        "Wann",
        "Art",
        "Wer",
    ],
    axis="columns",
)

Row Handling

add row from list and index

idx = df.index[-1] + 1
list_of_values = (...)
df.loc[idx] = list_of_values

Index Handling

# set index name
df.index.name = "YearWeek"

int indexes

select column as index
df.set_index("Altersgruppe", inplace=True)

# reset index to start at 0
df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)

# filter on index
df = df[df.index >= start_yearweek]

datetime indexes

# select "Date" column as index
df.set_index(["Date"], inplace=True)

# move date from index back to column
df.reset_index(inplace=True)
assert df.columns[0] == "Date"

# convert index to datetime 
df.index = pd.to_datetime(df.index)

# filter on date
# drop data prior to 2020
df = df.loc['2020-01-01':]
# alternative:
df = df[df.index >= "2021-01-10"]

date_last = pd.to_datetime(df.index[-1]).date()

# reindex and fill missing with 0
date_last = pd.to_datetime(df.index[-1]).date()
idx = pd.date_range('2020-01-01', date_last)
df = df.reindex(idx, fill_value=0)

# add missing dates
df = df.asfreq('D', fill_value=0)
df = df.asfreq(freq="5M")
df.sort_index(inplace=True) # needed?

# drop values of column for last 3 weeks
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df.index.date >= date_3w, "Cases"] = None
# or
df.loc[df.index.date < pd.to_datetime("2020-03-01"), "Value"] = None

text indexes

df.set_index("Altersgruppe", inplace=True)

extract cell

# select column for index
betten_ges = df2["betten_ges"].iloc[0]
# extract cell
de_sum = df["Personen"].loc["Summe"]
# drop row based on index
df.drop("Summe", inplace=True)

melt/unpivot wide table to long table format

df2 = pd.melt(df, id_vars="Date", value_vars=df.columns[1:])  #
df2.rename(columns={"variable": "Machine", "value": "Status"}, inplace=True)

# not working, using melt instead
# df3 = pd.wide_to_long(df, stubnames="Date", i=df.columns[1:], j="Status")

merge/concat multiple dataframes

df_sum = pd.DataFrame()
df_sum = pd.concat(
    [df_sum, df],
    ignore_index=True,
)

# join series of 2 df
df_covid_2020 = pd.DataFrame()
df_covid_2020['Deaths_Covid_2020'] = df1['Deaths_Covid_2020'].append(
    df2['Deaths_Covid_2020'], ignore_index=True)

add rows

df = pd.concat([df, df2], axis=1)

# note: this would require all index of df in df2 present:
# df[machine] = df2[machine]

more

df['2016_roll'] = df['2016'].rolling(window=7, min_periods=1).mean().round(1)
df['2017_roll'] = df['2017'].rolling(window=7, min_periods=1).mean().round(1)
df['2018_roll'] = df['2018'].rolling(window=7, min_periods=1).mean().round(1)
df['2019_roll'] = df['2019'].rolling(window=7, min_periods=1).mean().round(1)
df['2020_roll'] = df['2020'].rolling(window=7, min_periods=1).mean().round(1)
# mean value of 4 columns
df['2016_2019_mean'] = df.iloc[:, [1, 2, 3, 4]
                               ].mean(axis=1)  # not column 0 = day
df['2016_2019_mean_roll'] = df['2016_2019_mean'].rolling(
    window=7, min_periods=1).mean().round(1)

df['2016_2019_roll_max'] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)
df['2016_2019_roll_min'] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)

# prepend empty values to df
# Jan und Feb values are missing for Covid Deaths series, so I need a couple of empty rows
l = [None] * 59
df1 = pd.DataFrame(data={'Deaths_Covid_2020': l})

# ensure first row is from 28.2
assert (df2.iloc[0]['Date'] ==
        '2020-02-28'), "Error of start date, expecting 2020-02-28"

# copy
df2['Date'] = df0['Date']

# drop 2 rows from the beginning
df2.drop([0, 1], inplace=True)

# negative -> 0
df[df < 0] = 0

Helpers

def pandas_set_date_index(df, date_column: str):
    """Parse date_column ('yyyy-mm-dd' strings) to datetimes and use it as index.

    Mutates df in place and returns it for chaining.
    """
    parsed = pd.to_datetime(df[date_column], format='%Y-%m-%d')
    df[date_column] = parsed
    df.set_index([date_column], inplace=True)
    return df

def pandas_calc_roll_av(df, column: str, days: int = 7):
    """Add a '<column>_roll_av' column: rolling mean over `days`, rounded to 1 decimal.

    min_periods=1 means the first rows average over however many values exist
    so far instead of producing NaN. Mutates df in place and returns it.
    """
    rolling_mean = df[column].rolling(window=days, min_periods=1).mean()
    df[column + '_roll_av'] = rolling_mean.round(1)
    return df

Transpose

# transpose to have yearweek as index
df = df.transpose()

Plotting via Matplotlib

see Matplotlib