<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://entorb.net//wiki/index.php?action=history&amp;feed=atom&amp;title=Pandas</id>
	<title>Pandas - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://entorb.net//wiki/index.php?action=history&amp;feed=atom&amp;title=Pandas"/>
	<link rel="alternate" type="text/html" href="https://entorb.net//wiki/index.php?title=Pandas&amp;action=history"/>
	<updated>2026-05-06T10:30:00Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.1</generator>
	<entry>
		<id>https://entorb.net//wiki/index.php?title=Pandas&amp;diff=5400&amp;oldid=prev</id>
		<title>Torben: /* Basics */</title>
		<link rel="alternate" type="text/html" href="https://entorb.net//wiki/index.php?title=Pandas&amp;diff=5400&amp;oldid=prev"/>
		<updated>2026-03-08T10:30:38Z</updated>

		<summary type="html">&lt;p&gt;&lt;span class=&quot;autocomment&quot;&gt;Basics&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;[[Category:Coding]][[Category:Python]]&lt;br /&gt;
==Getting started==&lt;br /&gt;
 import pandas as pd&lt;br /&gt;
&lt;br /&gt;
===Docu===&lt;br /&gt;
nice cheat sheets can be found here&lt;br /&gt;
* https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-for-data-science-in-python&lt;br /&gt;
* https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-data-wrangling-in-python&lt;br /&gt;
&lt;br /&gt;
==Create DF==&lt;br /&gt;
 import pandas as pd&lt;br /&gt;
 # empty&lt;br /&gt;
 df = pd.DataFrame()&lt;br /&gt;
===random data===&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;x&amp;quot;: range(100)})&lt;br /&gt;
 df[&amp;quot;y&amp;quot;] = np.random.rand(len(df)) * 10&lt;br /&gt;
 &lt;br /&gt;
 # y = sin(x) + noise&lt;br /&gt;
 noise = np.random.rand(len(df)) * 0.1&lt;br /&gt;
 noise = np.random.default_rng().random(10) # newer syntax&lt;br /&gt;
 df[&amp;quot;y&amp;quot;] = np.sin(df[&amp;quot;x&amp;quot;] / 25 * np.pi) + noise&lt;br /&gt;
  &lt;br /&gt;
 # 20 rows and 3 columns of random values&lt;br /&gt;
 df = pd.DataFrame(np.random.rand(20, 3), columns=[&amp;quot;a&amp;quot;, &amp;quot;b&amp;quot;, &amp;quot;c&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
===from Lists===&lt;br /&gt;
 &lt;br /&gt;
 # from 1 dim list&lt;br /&gt;
 df = pd.DataFrame(data={&amp;quot;Column Name&amp;quot;: lst})&lt;br /&gt;
 # per column&lt;br /&gt;
 df = pd.DataFrame(&lt;br /&gt;
     data={&amp;quot;col1&amp;quot;:lst1, &amp;quot;col2&amp;quot;:lst2, &amp;quot;col3&amp;quot;:lst3}&lt;br /&gt;
 )&lt;br /&gt;
 # from N dim list, and set name of columns&lt;br /&gt;
 df = pd.DataFrame(&lt;br /&gt;
     data=lst, columns=(&amp;quot;col1&amp;quot;, &amp;quot;col2&amp;quot;, &amp;quot;col3&amp;quot;)&lt;br /&gt;
 )&lt;br /&gt;
 # from multiple lists&lt;br /&gt;
 data = zip(&lt;br /&gt;
     l_days,&lt;br /&gt;
     l_2016,&lt;br /&gt;
     l_2017,&lt;br /&gt;
     l_2018,&lt;br /&gt;
     l_2019,&lt;br /&gt;
     l_2020,&lt;br /&gt;
     strict=True,&lt;br /&gt;
 )&lt;br /&gt;
 ...&lt;br /&gt;
&lt;br /&gt;
===from csv===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html]&lt;br /&gt;
 df = pd.read_csv(&amp;quot;data.tsv&amp;quot;, sep=&amp;quot;\t&amp;quot;)&lt;br /&gt;
 # only selected columns and parse date&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     decimal=&amp;quot;,&amp;quot;,&lt;br /&gt;
     usecols=[&lt;br /&gt;
         &amp;quot;Date&amp;quot;,&lt;br /&gt;
         &amp;quot;Deaths_New&amp;quot;,&lt;br /&gt;
     ],  # only load these columns&lt;br /&gt;
     parse_dates=[&lt;br /&gt;
         &amp;quot;Date&amp;quot;,&lt;br /&gt;
     ],  # convert to date object if format is yyyy-mm-dd or dd.mm.yyyy&lt;br /&gt;
     index_col=&amp;quot;Date&amp;quot;,  # choose this column as index&lt;br /&gt;
     na_values=[&amp;quot;&amp;lt;4&amp;quot;],  # values to treat as NA&lt;br /&gt;
 )&lt;br /&gt;
 df = df.rename(columns={&amp;quot;Deaths_New&amp;quot;: &amp;quot;Deaths_Covid&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # convert data upon loading&lt;br /&gt;
 # read only first 10 chars from 2021-04-29T12:15:00+02:00 -&amp;gt; 2021-04-29&lt;br /&gt;
 pd_date_converter = lambda x: (x[0:10])  # noqa: E731&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;,&amp;quot;,&lt;br /&gt;
     converters={&amp;quot;Datum&amp;quot;: pd_date_converter},&lt;br /&gt;
     parse_dates=[&lt;br /&gt;
         &amp;quot;Datum&amp;quot;,  # # convert to date object if format is yyyy-mm-dd&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
 # convert datetime to date after loading&lt;br /&gt;
 for c in (&amp;quot;Buchungstag&amp;quot;, &amp;quot;Valutadatum&amp;quot;):&lt;br /&gt;
     df2[c] = pd.to_datetime(df2[c], format=&amp;quot;%d.%m.%Y&amp;quot;).dt.date&lt;br /&gt;
&lt;br /&gt;
===from list[str] via read_csv===&lt;br /&gt;
 from io import StringIO&lt;br /&gt;
 &lt;br /&gt;
 csv_string_io = StringIO(&amp;quot;\n&amp;quot;.join(lines))&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     csv_string_io,&lt;br /&gt;
     sep=&amp;quot;;&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 data_str = &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
 datetime_de	value&lt;br /&gt;
 30.09.2022 10:00	214516.10&lt;br /&gt;
 13.10.2022 00:20	214563.85&lt;br /&gt;
 &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
 df = pd.read_csv(StringIO(data_str), sep=&amp;quot;\t&amp;quot;, lineterminator=&amp;quot;\n&amp;quot;)&lt;br /&gt;
 df[&amp;quot;value&amp;quot;] = df[&amp;quot;value&amp;quot;].astype(float)&lt;br /&gt;
&lt;br /&gt;
===from Excel===&lt;br /&gt;
There are 2 libs: openpyxl (for read and write) and XlsxWriter (write)&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html]&lt;br /&gt;
 import openpyxl  # pip install openpyxl&lt;br /&gt;
 &lt;br /&gt;
 df = pd.read_excel(&amp;quot;file.xlsx&amp;quot;, sheet_name=&amp;quot;Sheet1&amp;quot;, engine=&amp;quot;openpyxl&amp;quot;)&lt;br /&gt;
 &lt;br /&gt;
 df = pd.read_excel(&lt;br /&gt;
     &amp;quot;file.xlsx&amp;quot;,&lt;br /&gt;
     skiprows=4,&lt;br /&gt;
     usecols=[&lt;br /&gt;
         &amp;quot;ArbPlatz&amp;quot;,&lt;br /&gt;
         &amp;quot;BezArt&amp;quot;,&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===from dict===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html]&lt;br /&gt;
 df = pd.DataFrame.from_dict(d, orient=&amp;quot;index&amp;quot;, columns=[&amp;quot;Col1&amp;quot;, &amp;quot;Col2&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
flatten dict in list of dict&lt;br /&gt;
 # l1 is list of dict, each dict has a sub-dict called metadata&lt;br /&gt;
 l2 = []&lt;br /&gt;
 for item in l1:&lt;br /&gt;
     flat = item.copy()&lt;br /&gt;
     meta = flat.pop(&amp;quot;metadata&amp;quot;)  # pop from the copy, so &amp;quot;metadata&amp;quot; is removed from flat&lt;br /&gt;
     flat.update(meta)&lt;br /&gt;
     l2.append(flat)&lt;br /&gt;
 del l1&lt;br /&gt;
 &lt;br /&gt;
 df = pd.DataFrame.from_dict(l2)&lt;br /&gt;
&lt;br /&gt;
===from array/record/list===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html]&lt;br /&gt;
 data = [ {&amp;quot;id&amp;quot;:1, &amp;quot;name&amp;quot;:&amp;quot;asdf&amp;quot;}, {&amp;quot;id&amp;quot;:2, &amp;quot;name&amp;quot;:&amp;quot;qwertz&amp;quot;}, ]&lt;br /&gt;
 df = pd.DataFrame.from_records(data)&lt;br /&gt;
&lt;br /&gt;
===from JSON===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_json.html]&lt;br /&gt;
 df = pd.read_json(&amp;quot;file.json&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===from SQLite===&lt;br /&gt;
 import sqlite3&lt;br /&gt;
 sql = &amp;quot;SELECT * FROM tab WHERE col2 = ? ORDER BY id&amp;quot;;&lt;br /&gt;
 df = pd.read_sql_query(sql, con, params=(MY_FILTER_VALUE,))&lt;br /&gt;
&lt;br /&gt;
==Export Data==&lt;br /&gt;
===to csv===&lt;br /&gt;
 df.to_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     lineterminator=&amp;quot;\n&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 # selected columns only&lt;br /&gt;
 df[ [&amp;quot;Col1&amp;quot;, &amp;quot;Col2&amp;quot;] ].to_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     lineterminator=&amp;quot;\n&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
to string in CSV format&lt;br /&gt;
 l_csv = (&lt;br /&gt;
     df.drop(columns=[&amp;quot;hour&amp;quot;]).to_csv(&lt;br /&gt;
         index=False, sep=&amp;quot;\t&amp;quot;, lineterminator=&amp;quot;\n&amp;quot;, encoding=&amp;quot;utf-8&amp;quot;&lt;br /&gt;
     ),&lt;br /&gt;
 )&lt;br /&gt;
 s_csv = &amp;quot;&amp;quot;.join(l_csv)&lt;br /&gt;
&lt;br /&gt;
===to Excel===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html]&lt;br /&gt;
 df.to_excel(&amp;quot;data.xlsx&amp;quot;, index=False)&lt;br /&gt;
&lt;br /&gt;
Export 2 df as sheets in 1 file&lt;br /&gt;
 with pd.ExcelWriter(path=file_in.with_suffix(&amp;quot;.xlsx&amp;quot;)) as writer:  &lt;br /&gt;
     df1.to_excel(writer, sheet_name=&amp;#039;Sheet_name_1&amp;#039;)&lt;br /&gt;
     df2.to_excel(writer, sheet_name=&amp;#039;Sheet_name_2&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
===to HTML===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html]&lt;br /&gt;
 df.to_html(&lt;br /&gt;
     &amp;quot;out.html&amp;quot;, index=False, render_links=False, escape=False, justify=&amp;quot;center&amp;quot;&lt;br /&gt;
     )&lt;br /&gt;
&lt;br /&gt;
 # html encoding of column name only&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = df[&amp;quot;name&amp;quot;].str.encode(&amp;quot;ascii&amp;quot;, &amp;quot;xmlcharrefreplace&amp;quot;).str.decode(&amp;quot;utf-8&amp;quot;)&lt;br /&gt;
 # add link to name&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = &amp;quot;&amp;lt;a href=&amp;#039;&amp;quot; + df[&amp;quot;url&amp;quot;] + &amp;quot;&amp;#039; target=&amp;#039;_blank&amp;#039; &amp;gt;&amp;quot; + df[&amp;quot;name&amp;quot;] + &amp;quot;&amp;lt;/a&amp;gt;&amp;quot;&lt;br /&gt;
 # export to html&lt;br /&gt;
 df[ [&amp;quot;name&amp;quot;, &amp;quot;due&amp;quot;, &amp;quot;overdue&amp;quot;, &amp;quot;priority&amp;quot;, &amp;quot;overdue priority&amp;quot;] ].to_html(&lt;br /&gt;
     &amp;quot;out.html&amp;quot;, index=False, render_links=False, escape=False, justify=&amp;quot;center&amp;quot;&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===to JSON===&lt;br /&gt;
 json_data = df.values.tolist()&lt;br /&gt;
 &lt;br /&gt;
 with Path(pathStatsExport / &amp;quot;ts_types_month.json&amp;quot;).open(&lt;br /&gt;
     &amp;quot;w&amp;quot;, encoding=&amp;quot;UTF-8&amp;quot;&lt;br /&gt;
 ) as fh:&lt;br /&gt;
     json.dump(&lt;br /&gt;
         json_data,&lt;br /&gt;
         fp=fh,&lt;br /&gt;
         ensure_ascii=False,&lt;br /&gt;
         sort_keys=False,&lt;br /&gt;
         indent=2,&lt;br /&gt;
     )&lt;br /&gt;
&lt;br /&gt;
==Modification==&lt;br /&gt;
===Add Row===&lt;br /&gt;
add row from list to end&lt;br /&gt;
 idx = df.index[-1] + 1&lt;br /&gt;
 list_of_values = (...)&lt;br /&gt;
 df.loc[idx] = list_of_values&lt;br /&gt;
&lt;br /&gt;
add dummy row for missing 1.1.2020 and reindex&lt;br /&gt;
 df.loc[-1] = &amp;quot;2020-01-01&amp;quot;, 0&lt;br /&gt;
 df.index = df.index + 1  # shifting index&lt;br /&gt;
 df = df.sort_index()  # sorting by index&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
===Add column based on other columns===&lt;br /&gt;
 df[&amp;quot;URL&amp;quot;] = df.apply(&lt;br /&gt;
     lambda row: f&amp;quot;https://www.openstreetmap.org/?mlat={row[&amp;#039;Lat&amp;#039;]}&amp;amp;mlon={row[&amp;#039;Lng&amp;#039;]}#map={zoom}/{row[&amp;#039;Lat&amp;#039;]}/{row[&amp;#039;Lng&amp;#039;]}&amp;quot;,&lt;br /&gt;
     axis=1,&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===Apply map/dict id-&amp;gt;name===&lt;br /&gt;
 df[&amp;quot;x_gear_name&amp;quot;] = df[&amp;quot;gear_id&amp;quot;].map(d_id_name)&lt;br /&gt;
&lt;br /&gt;
===Replace na Values===&lt;br /&gt;
 df = df.dropna() # removes missing values&lt;br /&gt;
 # only in certain columns&lt;br /&gt;
 df = df.dropna(subset=[&amp;quot;power&amp;quot;, &amp;quot;heart_rate&amp;quot;, &amp;quot;cadence&amp;quot;])&lt;br /&gt;
 df = df[df[&amp;quot;value&amp;quot;].notna()]&lt;br /&gt;
&lt;br /&gt;
===Interpolate missing values===&lt;br /&gt;
 # interpolate missing data, e.g. after df.groupby([pd.Grouper(key=&amp;quot;time&amp;quot;, freq=&amp;quot;1min&amp;quot;)])&lt;br /&gt;
 df[&amp;quot;kWh_total_in&amp;quot;] = df[&amp;quot;kWh_total_in&amp;quot;].interpolate(method=&amp;quot;linear&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===Calc Diff / Delta of rows===&lt;br /&gt;
 df[&amp;quot;delta_prev&amp;quot;] = df[&amp;quot;kWh&amp;quot;].diff()&lt;br /&gt;
 df[&amp;quot;delta_next&amp;quot;] = df[&amp;quot;kWh&amp;quot;].shift(-1) - df[&amp;quot;kWh&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
===Overwrite Data===&lt;br /&gt;
 # overwrite values &amp;gt; 123 by 123 etc.&lt;br /&gt;
 df[&amp;quot;Lat&amp;quot;] = df[&amp;quot;Lat&amp;quot;].clip(lower=-90, upper=90)&lt;br /&gt;
 df[&amp;quot;Lng&amp;quot;] = df[&amp;quot;Lng&amp;quot;].clip(lower=-180, upper=180)&lt;br /&gt;
 &lt;br /&gt;
 # set value of columns based on condition&lt;br /&gt;
 # set &amp;quot;type&amp;quot; = &amp;quot;Ride&amp;quot; where &amp;quot;type&amp;quot; == &amp;quot;VirtualRide&amp;quot;&lt;br /&gt;
 df.loc[df[&amp;quot;type&amp;quot;] == &amp;quot;VirtualRide&amp;quot;, &amp;quot;type&amp;quot;] = &amp;quot;Ride&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # negative -&amp;gt; 0&lt;br /&gt;
 df[df &amp;lt; 0] = 0&lt;br /&gt;
 &lt;br /&gt;
 # overwrite column data of last 3 weeks by None&lt;br /&gt;
 df[&amp;quot;DateAsDate&amp;quot;] = pd.to_datetime(df[&amp;quot;Date&amp;quot;], format=&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 date_3w = dt.date.today() - dt.timedelta(weeks=3)&lt;br /&gt;
 df.loc[df[&amp;quot;DateAsDate&amp;quot;].dt.date &amp;gt;= date_3w, &amp;quot;MyColumn&amp;quot;] = None&lt;br /&gt;
  &lt;br /&gt;
  # rolling takes NAN values into account, so I need to overwrite them as well&lt;br /&gt;
 df3[&amp;quot;Deaths_Covid_roll&amp;quot;] = np.where(&lt;br /&gt;
     df3[&amp;quot;Deaths_Covid&amp;quot;].isnull(), np.nan, df3[&amp;quot;Deaths_Covid_roll&amp;quot;]&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 df.loc[(df[&amp;quot;Kat2&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;IBAN&amp;quot;] == &amp;quot;DE02100100100152517108&amp;quot;), &amp;quot;Kat2&amp;quot;] = &amp;quot;Bahn&amp;quot;&lt;br /&gt;
 df.loc[(df[&amp;quot;Kat1&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;Kat2&amp;quot;] == &amp;quot;Bahn&amp;quot;), &amp;quot;Kat1&amp;quot;] = &amp;quot;Mobilität&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # assigning multiple values&lt;br /&gt;
 df.loc[&lt;br /&gt;
     (df[&amp;quot;Kat1&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;IBAN&amp;quot;] == &amp;quot;DE02100100100152517108&amp;quot;),&lt;br /&gt;
     [&amp;quot;Kat1&amp;quot;, &amp;quot;Kat2&amp;quot;, &amp;quot;Kat3&amp;quot;],&lt;br /&gt;
 ] = [&amp;quot;Mobilität&amp;quot;, &amp;quot;Reisen&amp;quot;, &amp;quot;Bahn&amp;quot;]&lt;br /&gt;
 &lt;br /&gt;
 # str endswith&lt;br /&gt;
 df.loc[&lt;br /&gt;
     (df[&amp;quot;RespTime&amp;quot;] == 0.0)&lt;br /&gt;
     &amp;amp; (df[&amp;quot;Text&amp;quot;].str.endswith(&amp;quot;(read timeout=30)&amp;quot;)),&lt;br /&gt;
     &amp;quot;RespTime&amp;quot;,&lt;br /&gt;
 ] = 30.0&lt;br /&gt;
&lt;br /&gt;
==Read-out Data==&lt;br /&gt;
===Columns===&lt;br /&gt;
 df2 = df [ [ &amp;quot;col1&amp;quot;, &amp;quot;col2&amp;quot; ] ]&lt;br /&gt;
&lt;br /&gt;
===Rows===&lt;br /&gt;
 for index, row in df.iterrows():&lt;br /&gt;
&lt;br /&gt;
===extract cell based on Index===&lt;br /&gt;
 col1_first = df2[&amp;quot;col1&amp;quot;].iloc[0]&lt;br /&gt;
 col1_last = float(df[&amp;quot;col1&amp;quot;].iloc[-1]) &lt;br /&gt;
 de_sum = df[&amp;quot;col&amp;quot;].loc[&amp;quot;Summe&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
===Loop over DF===&lt;br /&gt;
 for row in df.itertuples():&lt;br /&gt;
     assert type(row.start_date) is dt.datetime&lt;br /&gt;
     assert type(row.utc_offset) is int&lt;br /&gt;
     start_date = row.start_date - dt.timedelta(seconds=row.utc_offset)&lt;br /&gt;
&lt;br /&gt;
==Aggregation==&lt;br /&gt;
===Sum Column or Row===&lt;br /&gt;
 # sum of 1 column&lt;br /&gt;
 sum_cases = df[&amp;quot;Cases&amp;quot;].sum()&lt;br /&gt;
 # sum per row (sums across the columns)&lt;br /&gt;
 df_sums = df.sum(axis=&amp;quot;columns&amp;quot;)&lt;br /&gt;
 # sum per column (sums down the rows / index)&lt;br /&gt;
 df_sums = df.sum(axis=&amp;quot;index&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
==Filtering==&lt;br /&gt;
===Basics===&lt;br /&gt;
 # new notation&lt;br /&gt;
 var = 40&lt;br /&gt;
 df = df.query(&amp;quot;power &amp;gt;= 80 &amp;amp; cadence &amp;gt;= @var&amp;quot;)&lt;br /&gt;
 &lt;br /&gt;
 # old notation&lt;br /&gt;
 df = df[df[&amp;quot;x_start_locality&amp;quot;].isna() &amp;amp; df[&amp;quot;start_latlng&amp;quot;].notna()]&lt;br /&gt;
&lt;br /&gt;
 # Remove some cat&lt;br /&gt;
 df = df[~df[&amp;quot;Kategorie&amp;quot;].isin(KAT_REMOVE)]&lt;br /&gt;
 &lt;br /&gt;
 # Remove missing date&lt;br /&gt;
 df = df[df[&amp;quot;Datum&amp;quot;].notna()]&lt;br /&gt;
 &lt;br /&gt;
 df = df[ df[&amp;quot;power&amp;quot;] &amp;gt;= 80 ]&lt;br /&gt;
 df = df[ (df[&amp;quot;power&amp;quot;] &amp;gt;= 80) &amp;amp; (df[&amp;quot;cadence&amp;quot;] &amp;gt;= 40) ]&lt;br /&gt;
 # ~ inverts&lt;br /&gt;
 # list of multiple values&lt;br /&gt;
 df = df[ ~df[&amp;quot;col1&amp;quot;].isin((&amp;quot;A&amp;quot;, &amp;quot;B&amp;quot;, &amp;quot;C&amp;quot;)) ]&lt;br /&gt;
&lt;br /&gt;
===filter on index===&lt;br /&gt;
 df = df[df.index &amp;gt;= start_yearweek]&lt;br /&gt;
 &lt;br /&gt;
 df = df.drop(&amp;quot;Summe&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===Filtering Dates=== &lt;br /&gt;
 # filter a date column via year&lt;br /&gt;
 df = df[df[&amp;quot;Date&amp;quot;].dt.year == 2020]&lt;br /&gt;
 # filter a date via str&lt;br /&gt;
 df = df[df[&amp;quot;Date&amp;quot;] == &amp;quot;2021-11-13&amp;quot;]&lt;br /&gt;
 # filter on date via dt&lt;br /&gt;
 df = df[&lt;br /&gt;
     df[&amp;quot;RECEIVE_DATE&amp;quot;].dt.date # convert pandas datetime64[ns] to date&lt;br /&gt;
     &amp;gt;= (dt.date.today() - dt.timedelta(days=30))&lt;br /&gt;
 ]&lt;br /&gt;
&lt;br /&gt;
===Drop Duplicate Rows===&lt;br /&gt;
 df = df.drop_duplicates()&lt;br /&gt;
&lt;br /&gt;
===Distinct values of column as list===&lt;br /&gt;
 gear_ids = df[&amp;quot;gear_id&amp;quot;].unique()&lt;br /&gt;
&lt;br /&gt;
==Date Handling==&lt;br /&gt;
===Dates===&lt;br /&gt;
====Create Date Range====&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2023-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;D&amp;quot;)})&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2023-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;W&amp;quot;)})&lt;br /&gt;
&lt;br /&gt;
====Convert Str Column to Date====&lt;br /&gt;
 # str to datetime&lt;br /&gt;
 df[&amp;quot;Date&amp;quot;] = pd.to_datetime(df[&amp;quot;Date&amp;quot;], format=&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 # datetime to date&lt;br /&gt;
 df[&amp;quot;Date&amp;quot;] = df[&amp;quot;Date&amp;quot;].dt.date&lt;br /&gt;
&lt;br /&gt;
====Extract: year, month, quarter, week, weekstart====&lt;br /&gt;
 df[&amp;quot;year&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.year&lt;br /&gt;
 df[&amp;quot;month&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.month&lt;br /&gt;
 df[&amp;quot;quarter&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.quarter&lt;br /&gt;
 df[&amp;quot;week&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.isocalendar().week&lt;br /&gt;
 df[&amp;quot;weekyear&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.isocalendar().year&lt;br /&gt;
 # fix week 53 (when not using weekyear)&lt;br /&gt;
 # df[&amp;quot;x_week&amp;quot;] = df[&amp;quot;x_week&amp;quot;].clip(upper=52)&lt;br /&gt;
 df.loc[(df[&amp;quot;week&amp;quot;] == 53) &amp;amp; (df[&amp;quot;month&amp;quot;] == 1), &amp;quot;x_week&amp;quot;] = 1&lt;br /&gt;
 df.loc[(df[&amp;quot;week&amp;quot;] == 53) &amp;amp; (df[&amp;quot;month&amp;quot;] == 12), &amp;quot;x_week&amp;quot;] = 52&lt;br /&gt;
 &lt;br /&gt;
 # week_start from date&lt;br /&gt;
 df[&amp;quot;week_start&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.to_period(&amp;quot;W&amp;quot;).apply(lambda r: r.start_time)&lt;br /&gt;
 # week_start from week number&lt;br /&gt;
 df[&amp;quot;week_start&amp;quot;] = pd.to_datetime(&lt;br /&gt;
     df.apply(lambda row: dt.date.fromisocalendar(row[&amp;quot;year&amp;quot;], row[&amp;quot;week&amp;quot;], 1), axis=1)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===DateTime===&lt;br /&gt;
====Convert String to Datetime and Timezone Handling====&lt;br /&gt;
 # string to datetime, converting to UTC for future compatibility&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = pd.to_datetime(df[&amp;quot;date&amp;quot;], format=&amp;quot;ISO8601&amp;quot;, utc=True)&lt;br /&gt;
 # convert from utc to local timezone&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.tz_convert(tz=&amp;quot;Europe/Berlin&amp;quot;)&lt;br /&gt;
 # drop timezone info, since Excel can not handle it&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.tz_localize(None)&lt;br /&gt;
&lt;br /&gt;
====DateTime: remove milliseconds====&lt;br /&gt;
 df[&amp;quot;DateTime&amp;quot;] = df[&amp;quot;DateTime&amp;quot;].dt.ceil(freq=&amp;quot;s&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
====DateTime: extract date and add offset, depending on time====&lt;br /&gt;
 # values before 15:00 shall be treated end of last day value&lt;br /&gt;
 df[&amp;quot;day&amp;quot;] = df[&amp;quot;dt&amp;quot;].dt.date&lt;br /&gt;
 df.loc[df[&amp;quot;dt&amp;quot;].dt.hour &amp;lt; 15, &amp;quot;day&amp;quot;] = df[&amp;quot;dt&amp;quot;].dt.date - pd.Timedelta(days=1)  # noqa: PLR2004&lt;br /&gt;
&lt;br /&gt;
===TimeStamps===&lt;br /&gt;
===Convert Timestamp to Datetime and use as Index===&lt;br /&gt;
 df[&amp;quot;datetime&amp;quot;] = pd.to_datetime(  # convert to datetime&lt;br /&gt;
     df[&amp;quot;start_ts&amp;quot;],&lt;br /&gt;
     unit=&amp;quot;s&amp;quot;,  # timestamp is in seconds&lt;br /&gt;
     utc=True,  # timestamp is in UTC&lt;br /&gt;
 ).dt.tz_convert(  # convert to local TZ&lt;br /&gt;
     &amp;quot;Europe/Berlin&amp;quot;&lt;br /&gt;
 )&lt;br /&gt;
 df = df.set_index(&amp;quot;datetime&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
====Convert Datetime to Timestamp====&lt;br /&gt;
 df[&amp;quot;timestamp&amp;quot;] = df[&amp;quot;datetime&amp;quot;].values.astype(np.int64) // 10**9&lt;br /&gt;
 # or&lt;br /&gt;
 df[&amp;quot;timestamp&amp;quot;] = df[&amp;quot;datetime&amp;quot;].astype(int) // 10**9&lt;br /&gt;
 # // is integer division&lt;br /&gt;
&lt;br /&gt;
====Timestamp: subtract first====&lt;br /&gt;
 # calc elapsed time&lt;br /&gt;
 df[&amp;quot;seconds&amp;quot;] = df[&amp;quot;timestamp&amp;quot;] - df[&amp;quot;timestamp&amp;quot;].iloc[0]&lt;br /&gt;
&lt;br /&gt;
==String Modifications==&lt;br /&gt;
===Replace===&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = df[&amp;quot;name&amp;quot;].str.strip() # trim whitespaces&lt;br /&gt;
 df[&amp;quot;time&amp;quot;] = df[&amp;quot;time&amp;quot;].str.replace(r&amp;quot;\..*$&amp;quot;, &amp;quot;Z&amp;quot;, regex=True)&lt;br /&gt;
&lt;br /&gt;
 # remove word &amp;quot;Probable&amp;quot;&lt;br /&gt;
 df[&amp;quot;col&amp;quot;] = df[&amp;quot;col&amp;quot;].replace(&lt;br /&gt;
     to_replace=r&amp;quot;^Probable &amp;quot;, value=&amp;quot;&amp;quot;, regex=True&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
 df[&amp;quot;Text&amp;quot;] = df[&amp;quot;Text&amp;quot;].str.replace(&lt;br /&gt;
     r&amp;quot;^Menke, *Torben *&amp;quot;,&lt;br /&gt;
     &amp;quot;Torben Menke&amp;quot;,&lt;br /&gt;
     regex=True,&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===split string===&lt;br /&gt;
 df [ [ &amp;quot;week&amp;quot;, &amp;quot;year&amp;quot; ] ] = df[&amp;quot;Kalenderwoche&amp;quot;].str.split(&amp;quot;/&amp;quot;, expand=True)&lt;br /&gt;
&lt;br /&gt;
===string &amp;lt;-&amp;gt; number===&lt;br /&gt;
string to float: 1.234,56 -&amp;gt; 1234.56&lt;br /&gt;
 df[&amp;quot;Euro&amp;quot;] = (&lt;br /&gt;
     df[&amp;quot;Euro&amp;quot;]&lt;br /&gt;
     .str.replace(&amp;quot;.&amp;quot;, &amp;quot;&amp;quot;)&lt;br /&gt;
     .str.replace(&amp;quot;,&amp;quot;, &amp;quot;.&amp;quot;)&lt;br /&gt;
     .astype(float)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
convert int to str adding leading zeros&lt;br /&gt;
 df[&amp;quot;Sterbewoche&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
&lt;br /&gt;
capitalization&lt;br /&gt;
 df[&amp;quot;Wer&amp;quot;] = df[&amp;quot;Wer&amp;quot;].apply(lambda x: string.capwords(x))&lt;br /&gt;
&lt;br /&gt;
trim spaces&lt;br /&gt;
 df[&amp;quot;Wer&amp;quot;] = df[&amp;quot;Wer&amp;quot;].str.strip()&lt;br /&gt;
&lt;br /&gt;
===count word per row and check if more than one===&lt;br /&gt;
str.count()&lt;br /&gt;
 df[&amp;quot;cnt_Buchungstext&amp;quot;] = df[&amp;quot;Text&amp;quot;].str.count(&amp;quot;word&amp;quot;)&lt;br /&gt;
 df_search = df[df[&amp;quot;cnt_Buchungstext&amp;quot;] != 1]&lt;br /&gt;
 if len(df_search) &amp;gt; 0:&lt;br /&gt;
     print(df_search)&lt;br /&gt;
 del df_search&lt;br /&gt;
 df = df.drop(columns=[&amp;quot;cnt_Buchungstext&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
==Sorting / order by==&lt;br /&gt;
 # sort by column&lt;br /&gt;
 df = df.sort_values(by=[&amp;#039;betten_belegt&amp;#039;], ascending=False)&lt;br /&gt;
 df = df.sort_values(by=[&amp;quot;count&amp;quot;, &amp;quot;query&amp;quot;], ascending=[False, True])&lt;br /&gt;
 df = df.sort_values(by=[&amp;quot;Buchungstag&amp;quot;, &amp;quot;Text&amp;quot;], ignore_index=True)&lt;br /&gt;
&lt;br /&gt;
===Top10===&lt;br /&gt;
 df_top_ten = (&lt;br /&gt;
     df.sort_values(by=&amp;quot;count&amp;quot;, ascending=False)&lt;br /&gt;
     .head(10)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
==Group By / Top10==&lt;br /&gt;
 df2 = (&lt;br /&gt;
     df.groupby(&amp;quot;sender&amp;quot;)&lt;br /&gt;
     .size()&lt;br /&gt;
     .to_frame(name=&amp;quot;count&amp;quot;)&lt;br /&gt;
     .sort_values(&amp;quot;count&amp;quot;, ascending=False)&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 df2 = (&lt;br /&gt;
     df[ [ &amp;quot;name&amp;quot;, &amp;quot;amount&amp;quot; ] ]&lt;br /&gt;
     .groupby(&amp;quot;name&amp;quot;)&lt;br /&gt;
     .agg(amountSum=(&amp;quot;amount&amp;quot;, &amp;quot;sum&amp;quot;), count=(&amp;quot;name&amp;quot;, &amp;quot;count&amp;quot;))&lt;br /&gt;
     .sort_values(by=[&amp;quot;amountSum&amp;quot;, &amp;quot;count&amp;quot;], ascending=False)&lt;br /&gt;
 )&lt;br /&gt;
 print(df2.head(10)) # top10&lt;br /&gt;
 &lt;br /&gt;
 # resort by count&lt;br /&gt;
 print(df2.sort_values(by=&amp;quot;count&amp;quot;, ascending=False).head(10))&lt;br /&gt;
&lt;br /&gt;
===Group by year, quarter, month, week using count and add missing===&lt;br /&gt;
Simple: add missing years for multi index&lt;br /&gt;
 df1 = pd.DataFrame(&lt;br /&gt;
     columns=[&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;, &amp;quot;count&amp;quot;],&lt;br /&gt;
     data=[&lt;br /&gt;
         (2000, &amp;quot;Run&amp;quot;, 1),&lt;br /&gt;
         (2003, &amp;quot;Ride&amp;quot;, 2),&lt;br /&gt;
         (2003, &amp;quot;Swim&amp;quot;, 1),&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
 df2 = pd.DataFrame(&lt;br /&gt;
     data={&lt;br /&gt;
         &amp;quot;year&amp;quot;: range(df1[&amp;quot;year&amp;quot;].min(), df1[&amp;quot;year&amp;quot;].max() + 1),&lt;br /&gt;
         &amp;quot;type&amp;quot;: &amp;quot;Run&amp;quot;,&lt;br /&gt;
         &amp;quot;count&amp;quot;: 0,&lt;br /&gt;
     }&lt;br /&gt;
 ).set_index([&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;])&lt;br /&gt;
 df1 = df1.set_index([&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;])&lt;br /&gt;
 df = df1.add(df2, fill_value=0).reset_index()&lt;br /&gt;
&lt;br /&gt;
Advanced&lt;br /&gt;
 # df has columns: date and derived year, quarter, month, week, see above&lt;br /&gt;
 # group by year, quarter, month, week, with count as aggregation and gap-filling&lt;br /&gt;
 year_min, year_max = df[&amp;quot;year&amp;quot;].min, df[&amp;quot;year&amp;quot;].max&lt;br /&gt;
 # year&lt;br /&gt;
 df_year = (&lt;br /&gt;
     df[ [ &amp;quot;year&amp;quot;, &amp;quot;date&amp;quot; ] ].groupby([&amp;quot;year&amp;quot;]).count().rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_year = df_year.reindex(range(year_min(), year_max() + 1), fill_value=None)&lt;br /&gt;
 df_year = df_year.reset_index().rename(columns={&amp;quot;year&amp;quot;: &amp;quot;date-grouping&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # quarter&lt;br /&gt;
 df_quarter = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 5)], names=[&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.reset_index()&lt;br /&gt;
 df_quarter[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_quarter[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-Q&amp;quot; + df_quarter[&amp;quot;quarter&amp;quot;].astype(str)&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 # month&lt;br /&gt;
 df_month = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 13)], names=[&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.reset_index()&lt;br /&gt;
 df_month[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_month[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-&amp;quot; + df_month[&amp;quot;month&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 # week&lt;br /&gt;
 df_week = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 53)], names=[&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.reset_index()&lt;br /&gt;
 df_week[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_week[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-&amp;quot; + df_week[&amp;quot;week&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
===Group by Type and Date===&lt;br /&gt;
 # group by hour&lt;br /&gt;
 # truncate min and sec data&lt;br /&gt;
 df[&amp;quot;hour&amp;quot;] = pd.to_datetime(df[&amp;quot;time&amp;quot;]).dt.floor(&amp;quot;H&amp;quot;)&lt;br /&gt;
 # sum per hour&lt;br /&gt;
 df2 = df[ [&amp;quot;hour&amp;quot;, &amp;quot;amount&amp;quot;] ].groupby(&amp;quot;hour&amp;quot;).agg(amountSum=(&amp;quot;amount&amp;quot;, &amp;quot;sum&amp;quot;))&lt;br /&gt;
 df2.plot()&lt;br /&gt;
 plt.show()&lt;br /&gt;
&lt;br /&gt;
 # Datetime Index Grouping&lt;br /&gt;
 df = df.groupby(pd.Grouper(freq=&amp;quot;5min&amp;quot;, offset=&amp;quot;00h00min&amp;quot;)).max()&lt;br /&gt;
 &lt;br /&gt;
 # Group by type and date of month start&lt;br /&gt;
 df_month = df.groupby([&amp;quot;type&amp;quot;, pd.Grouper(key=&amp;quot;date&amp;quot;, freq=&amp;quot;MS&amp;quot;)]).agg(&lt;br /&gt;
     {&amp;quot;id&amp;quot;: &amp;quot;count&amp;quot;, &amp;quot;minutes&amp;quot;: &amp;quot;sum&amp;quot;}&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # generate index of the desired month-freq:&lt;br /&gt;
 idx = pd.date_range(&lt;br /&gt;
     start=df[&amp;quot;date&amp;quot;].min().replace(day=1),&lt;br /&gt;
     end=df[&amp;quot;date&amp;quot;].max().replace(day=1),&lt;br /&gt;
     freq=&amp;quot;MS&amp;quot;,  # MS = Month Start&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # add missing months&lt;br /&gt;
 df_month = df_month.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [df_month.index.get_level_values(&amp;quot;type&amp;quot;), idx],&lt;br /&gt;
         names=[&amp;quot;type&amp;quot;, &amp;quot;date&amp;quot;],&lt;br /&gt;
     )&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # fill missing by 0 and convert count back to type int&lt;br /&gt;
 df_month = df_month.fillna(0).astype({&amp;quot;count&amp;quot;: int})&lt;br /&gt;
&lt;br /&gt;
====Group Datetime by Time of Day only====&lt;br /&gt;
 df[&amp;quot;Time&amp;quot;] = df[&amp;quot;Date&amp;quot;].dt.round(&amp;quot;5min&amp;quot;).dt.time&lt;br /&gt;
 &lt;br /&gt;
 df_grouped = df [ [&amp;quot;RespTime&amp;quot;, &amp;quot;Time&amp;quot; ] ].groupby(&amp;quot;Time&amp;quot;).mean()&lt;br /&gt;
&lt;br /&gt;
====Group and calculate min,avg,max====&lt;br /&gt;
 df_grouped = (&lt;br /&gt;
     df[ [&amp;quot;Time&amp;quot;, &amp;quot;RespTime&amp;quot;] ]&lt;br /&gt;
     .groupby(&amp;quot;Time&amp;quot;)&lt;br /&gt;
     .agg(&lt;br /&gt;
         max=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;max&amp;quot;),&lt;br /&gt;
         avg=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;mean&amp;quot;),&lt;br /&gt;
         min=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;min&amp;quot;),&lt;br /&gt;
     )&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
==Column Handling==&lt;br /&gt;
===Renaming===&lt;br /&gt;
 df = df.rename(&lt;br /&gt;
      columns = {&amp;quot;invasiv_beatmet&amp;quot;: &amp;quot;beatmet&amp;quot;,}, &lt;br /&gt;
      errors=&amp;quot;raise&amp;quot;,&lt;br /&gt;
      )&lt;br /&gt;
&lt;br /&gt;
rename column headers by extracting some int values from a string&lt;br /&gt;
 l2 = []&lt;br /&gt;
 for col in df.columns:&lt;br /&gt;
     year = int(col[0:4])&lt;br /&gt;
     week = int(col[5:7])&lt;br /&gt;
     l2.append(year * 100 + week)&lt;br /&gt;
 df.columns = l2&lt;br /&gt;
&lt;br /&gt;
===Dropping===&lt;br /&gt;
drop columns&lt;br /&gt;
 df = df.drop(columns=[&amp;quot;Sterbejahr&amp;quot;, &amp;quot;Sterbewoche&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
===Change Column Order===&lt;br /&gt;
 df = df.reindex(&lt;br /&gt;
     [&lt;br /&gt;
         &amp;quot;Wann&amp;quot;,&lt;br /&gt;
         &amp;quot;Art&amp;quot;,&lt;br /&gt;
         &amp;quot;Wer&amp;quot;,&lt;br /&gt;
     ],&lt;br /&gt;
     axis=&amp;quot;columns&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # column reorder&lt;br /&gt;
 cols_first = [&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;, &amp;quot;week&amp;quot;, &amp;quot;weekyear&amp;quot;]&lt;br /&gt;
 cols_last = [col for col in df.columns if col not in cols_first]&lt;br /&gt;
 cols_first.extend(cols_last)&lt;br /&gt;
 df = df[cols_first]&lt;br /&gt;
 del cols_first, cols_last&lt;br /&gt;
 &lt;br /&gt;
 # same but different&lt;br /&gt;
 first_columns = (&amp;quot;description&amp;quot;, &amp;quot;isin&amp;quot;, &amp;quot;side&amp;quot;, &amp;quot;quantity&amp;quot;, &amp;quot;Price&amp;quot;, &amp;quot;amount&amp;quot;)&lt;br /&gt;
 i = 0&lt;br /&gt;
 for col in first_columns:&lt;br /&gt;
     df.insert(i, col, df.pop(col))&lt;br /&gt;
     i += 1&lt;br /&gt;
 del i&lt;br /&gt;
&lt;br /&gt;
==Index Handling==&lt;br /&gt;
select column as index&lt;br /&gt;
 df = df.set_index(&amp;quot;Date&amp;quot;)&lt;br /&gt;
move date from index back to column&lt;br /&gt;
 df = df.reset_index()&lt;br /&gt;
&lt;br /&gt;
rename index&lt;br /&gt;
 df.index.name = &amp;quot;Date&amp;quot;&lt;br /&gt;
&lt;br /&gt;
reset index to start at 0&lt;br /&gt;
 df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)&lt;br /&gt;
&lt;br /&gt;
int index to str index&lt;br /&gt;
 df.index = df.index.map(str)&lt;br /&gt;
&lt;br /&gt;
===text indexes===&lt;br /&gt;
 df = df.set_index(&amp;quot;Altersgruppe&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===datetime indexes===&lt;br /&gt;
 assert type(df.index) is pd.DatetimeIndex&lt;br /&gt;
 df.index = df.index.tz_convert(&amp;quot;Europe/Berlin&amp;quot;)&lt;br /&gt;
 df.index = df.index.tz_convert(None) # to remove tz info&lt;br /&gt;
 &lt;br /&gt;
 # select &amp;quot;Date&amp;quot; column as index&lt;br /&gt;
 df = df.set_index([&amp;quot;Date&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 # convert index to datetime &lt;br /&gt;
 df.index = pd.to_datetime(df.index)&lt;br /&gt;
 &lt;br /&gt;
 # add missing dates&lt;br /&gt;
 df = df.reindex(&lt;br /&gt;
     pd.date_range(df.index.min(), df.index.max(), freq=&amp;quot;D&amp;quot;), fill_value=0&lt;br /&gt;
 )&lt;br /&gt;
 df.index.name = &amp;quot;date&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # remove timezone offset (as Excel cannot handle it)&lt;br /&gt;
 df.index = df.index.tz_localize(None)&lt;br /&gt;
  &lt;br /&gt;
 date_last = df.index[-1]&lt;br /&gt;
&lt;br /&gt;
 # reindex and fill missing with 0&lt;br /&gt;
 date_last = pd.to_datetime(df.index[-1]).date()&lt;br /&gt;
 idx = pd.date_range(&amp;#039;2020-01-01&amp;#039;, date_last)&lt;br /&gt;
 df = df.reindex(idx, fill_value=0)&lt;br /&gt;
 &lt;br /&gt;
 # add missing dates&lt;br /&gt;
 df = df.asfreq(&amp;#039;D&amp;#039;, fill_value=0)&lt;br /&gt;
 df = df.asfreq(freq=&amp;quot;5M&amp;quot;)&lt;br /&gt;
 df = df.sort_index() # needed?&lt;br /&gt;
 &lt;br /&gt;
 # drop values of column for last 3 weeks&lt;br /&gt;
 date_3w = dt.date.today() - dt.timedelta(weeks=3)&lt;br /&gt;
 df.loc[df.index.date &amp;gt;= date_3w, &amp;quot;Cases&amp;quot;] = None&lt;br /&gt;
 # or&lt;br /&gt;
 df.loc[df.index.date &amp;lt; pd.to_datetime(&amp;quot;2020-03-01&amp;quot;), &amp;quot;Value&amp;quot;] = None&lt;br /&gt;
&lt;br /&gt;
====filter on date index====&lt;br /&gt;
 # drop data prior to 2020&lt;br /&gt;
 df = df.loc[&amp;#039;2020-01-01&amp;#039;:]&lt;br /&gt;
 # alternative:&lt;br /&gt;
 df = df[df.index &amp;gt;= &amp;quot;2021-01-10&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
==Pivot, UnPivot/Melt==&lt;br /&gt;
===Pivot===&lt;br /&gt;
 # df has columns &amp;quot;key&amp;quot; (column that shall be the index) , &amp;quot;year&amp;quot; (column you want to convert to multiple columns), &amp;quot;value&amp;quot; (the numbers to aggregate)&lt;br /&gt;
 # convert to wide format with id as columns&lt;br /&gt;
 df2 = df.pivot_table(index=&amp;quot;key&amp;quot;, columns=&amp;quot;year&amp;quot;, values=&amp;quot;value&amp;quot;, aggfunc=&amp;quot;first&amp;quot;) # aggfunc=&amp;quot;sum&amp;quot;&lt;br /&gt;
&lt;br /&gt;
===melt/unpivot wide table to long table format===&lt;br /&gt;
 df2 = pd.melt(df, id_vars=&amp;quot;Date&amp;quot;, value_vars=df.columns[1:])  #&lt;br /&gt;
 df2 = df2.rename(columns={&amp;quot;variable&amp;quot;: &amp;quot;Machine&amp;quot;, &amp;quot;value&amp;quot;: &amp;quot;Status&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # not working, using melt instead&lt;br /&gt;
 # df3 = pd.wide_to_long(df, stubnames=&amp;quot;Date&amp;quot;, i=df.columns[1:], j=&amp;quot;Status&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
==Merge/Concat and Append DFs==&lt;br /&gt;
===Merge/Concat===&lt;br /&gt;
 df_sum = pd.DataFrame()&lt;br /&gt;
 df_sum = pd.concat(&lt;br /&gt;
     [df_sum, df],&lt;br /&gt;
     ignore_index=True,&lt;br /&gt;
 )&lt;br /&gt;
===Append===&lt;br /&gt;
 # merge columns from different dfs, when have matching index, e.g. user_id&lt;br /&gt;
 df = pd.concat([df1, df2], axis=1)&lt;br /&gt;
 &lt;br /&gt;
 # append df2 to end of df1&lt;br /&gt;
 df1 = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2020-01&amp;quot;, end=&amp;quot;2020-02&amp;quot;, freq=&amp;quot;W&amp;quot;)})&lt;br /&gt;
 df2 = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2024-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;D&amp;quot;)})&lt;br /&gt;
 df = pd.concat([df1, df2]).reset_index(drop=True)&lt;br /&gt;
&lt;br /&gt;
 # join series of 2 df&lt;br /&gt;
 df_covid_2020 = pd.DataFrame()&lt;br /&gt;
 df_covid_2020[&amp;#039;Deaths_Covid_2020&amp;#039;] = df1[&amp;#039;Deaths_Covid_2020&amp;#039;].append(&lt;br /&gt;
     df2[&amp;#039;Deaths_Covid_2020&amp;#039;], ignore_index=True)&lt;br /&gt;
 &lt;br /&gt;
 # note: this would require all index of df in df2 present:&lt;br /&gt;
 # df[machine] = df2[machine]&lt;br /&gt;
&lt;br /&gt;
==Rolling Average, mean of columns, min, max==&lt;br /&gt;
 df[&amp;#039;2016_roll&amp;#039;] = df[&amp;#039;2016&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2017_roll&amp;#039;] = df[&amp;#039;2017&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2018_roll&amp;#039;] = df[&amp;#039;2018&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2019_roll&amp;#039;] = df[&amp;#039;2019&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2020_roll&amp;#039;] = df[&amp;#039;2020&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 # mean value of 4 columns&lt;br /&gt;
 df[&amp;#039;2016_2019_mean&amp;#039;] = df.iloc[:, [1, 2, 3, 4]&lt;br /&gt;
                                ].mean(axis=1)  # not column 0 = day&lt;br /&gt;
 df[&amp;#039;2016_2019_mean_roll&amp;#039;] = df[&amp;#039;2016_2019_mean&amp;#039;].rolling(&lt;br /&gt;
     window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 &lt;br /&gt;
 df[&amp;#039;2016_2019_roll_max&amp;#039;] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)&lt;br /&gt;
 df[&amp;#039;2016_2019_roll_min&amp;#039;] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)&lt;br /&gt;
 &lt;br /&gt;
&lt;br /&gt;
==Helpers==&lt;br /&gt;
 def pandas_set_date_index(df, date_column: str):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot; use date as index &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     df[date_column] = pd.to_datetime(df[date_column], format=&amp;#039;%Y-%m-%d&amp;#039;)&lt;br /&gt;
     df = df.set_index([date_column])&lt;br /&gt;
     return df&lt;br /&gt;
 &lt;br /&gt;
 def pandas_calc_roll_av(df, column: str, days: int = 7):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot; calc rolling average over column &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     df[column + &amp;#039;_roll_av&amp;#039;] = df[column].rolling(&lt;br /&gt;
         window=days, min_periods=1).mean().round(1)&lt;br /&gt;
     return df&lt;br /&gt;
 &lt;br /&gt;
 # custom rounding&lt;br /&gt;
 def custom_round(x: float, base: int = 5) -&amp;gt; int:&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Custom rounding.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     # from https://stackoverflow.com/questions/40372030/pandas-round-to-the-nearest-n&lt;br /&gt;
     return int(base * round(float(x) / base))&lt;br /&gt;
 &lt;br /&gt;
 df[&amp;quot;power_rounded&amp;quot;] = df[&amp;quot;power&amp;quot;].apply( lambda x: custom_round(x, base=20) )&lt;br /&gt;
&lt;br /&gt;
==Transpose==&lt;br /&gt;
 df = df.transpose()&lt;br /&gt;
&lt;br /&gt;
==Leftovers==&lt;br /&gt;
 # when in a function one might get the SettingWithCopyWarning, fix via&lt;br /&gt;
 df = df.copy()&lt;br /&gt;
&lt;br /&gt;
 # copy&lt;br /&gt;
 df2[&amp;#039;Date&amp;#039;] = df0[&amp;#039;Date&amp;#039;]&lt;br /&gt;
 &lt;br /&gt;
 # drop 2 rows from the beginning&lt;br /&gt;
 df = df2.drop([0, 1])&lt;br /&gt;
&lt;br /&gt;
====append today using yesterday&amp;#039;s value====&lt;br /&gt;
 str_today = dt.datetime.today().strftime(&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 ts_today = pd.Timestamp(str_today)&lt;br /&gt;
 if df.index[-1] != ts_today:&lt;br /&gt;
     df.loc[ts_today] = df.loc[df.index[-1], &amp;quot;Count&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
==Plotting via Matplotlib==&lt;br /&gt;
see [[Matplotlib]]&lt;/div&gt;</summary>
		<author><name>Torben</name></author>
	</entry>
</feed>