<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://entorb.net//wiki/index.php?action=history&amp;feed=atom&amp;title=Pandas</id>
	<title>Pandas - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://entorb.net//wiki/index.php?action=history&amp;feed=atom&amp;title=Pandas"/>
	<link rel="alternate" type="text/html" href="https://entorb.net//wiki/index.php?title=Pandas&amp;action=history"/>
	<updated>2026-05-06T10:30:00Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.1</generator>
	<entry>
		<id>https://entorb.net//wiki/index.php?title=Pandas&amp;diff=5400&amp;oldid=prev</id>
		<title>Torben: /* Basics */</title>
		<link rel="alternate" type="text/html" href="https://entorb.net//wiki/index.php?title=Pandas&amp;diff=5400&amp;oldid=prev"/>
		<updated>2026-03-08T10:30:38Z</updated>

		<summary type="html">&lt;p&gt;&lt;span class=&quot;autocomment&quot;&gt;Basics&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;[[Category:Coding]][[Category:Python]]&lt;br /&gt;
==Getting started==&lt;br /&gt;
 import pandas as pd&lt;br /&gt;
&lt;br /&gt;
===Docu===&lt;br /&gt;
nice cheat sheets can be found here&lt;br /&gt;
* https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-for-data-science-in-python&lt;br /&gt;
* https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-data-wrangling-in-python&lt;br /&gt;
&lt;br /&gt;
==Create DF==&lt;br /&gt;
 import pandas as pd&lt;br /&gt;
 # empty&lt;br /&gt;
 df = pd.DataFrame()&lt;br /&gt;
===random data===&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;x&amp;quot;: range(100)})&lt;br /&gt;
 df[&amp;quot;y&amp;quot;] = np.random.rand(len(df)) * 10&lt;br /&gt;
 &lt;br /&gt;
 # y = sin(x) + noise&lt;br /&gt;
 noise = np.random.rand(len(df)) * 0.1&lt;br /&gt;
 noise = np.random.default_rng().random(10) # newer syntax&lt;br /&gt;
 df[&amp;quot;y&amp;quot;] = np.sin(df[&amp;quot;x&amp;quot;] / 25 * np.pi) + noise&lt;br /&gt;
  &lt;br /&gt;
 # 20 rows and 3 columns of random values&lt;br /&gt;
 df = pd.DataFrame(np.random.rand(20, 3), columns=[&amp;quot;a&amp;quot;, &amp;quot;b&amp;quot;, &amp;quot;c&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
===from Lists===&lt;br /&gt;
 &lt;br /&gt;
 # from 1 dim list&lt;br /&gt;
 df = pd.DataFrame(data={&amp;quot;Column Name&amp;quot;: lst})&lt;br /&gt;
 # per column&lt;br /&gt;
 df = pd.DataFrame(&lt;br /&gt;
     data={&amp;quot;col1&amp;quot;:lst1, &amp;quot;col2&amp;quot;:lst2, &amp;quot;col3&amp;quot;:lst3}&lt;br /&gt;
 )&lt;br /&gt;
 # from N dim list, and set name of columns&lt;br /&gt;
 df = pd.DataFrame(&lt;br /&gt;
     data=lst, columns=(&amp;quot;col1&amp;quot;, &amp;quot;col2&amp;quot;, &amp;quot;col3&amp;quot;)&lt;br /&gt;
 )&lt;br /&gt;
 # from multiple lists&lt;br /&gt;
 data = zip(&lt;br /&gt;
     l_days,&lt;br /&gt;
     l_2016,&lt;br /&gt;
     l_2017,&lt;br /&gt;
     l_2018,&lt;br /&gt;
     l_2019,&lt;br /&gt;
     l_2020,&lt;br /&gt;
     strict=True,&lt;br /&gt;
 )&lt;br /&gt;
 ...&lt;br /&gt;
&lt;br /&gt;
===from csv===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html]&lt;br /&gt;
 df = pd.read_csv(&amp;quot;data.tsv&amp;quot;, sep=&amp;quot;\t&amp;quot;)&lt;br /&gt;
 # only selected columns and parse date&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     decimal=&amp;quot;,&amp;quot;,&lt;br /&gt;
     usecols=[&lt;br /&gt;
         &amp;quot;Date&amp;quot;,&lt;br /&gt;
         &amp;quot;Deaths_New&amp;quot;,&lt;br /&gt;
     ],  # only load these columns&lt;br /&gt;
     parse_dates=[&lt;br /&gt;
         &amp;quot;Date&amp;quot;,&lt;br /&gt;
     ],  # convert to date object if format is yyyy-mm-dd or dd.mm.yyyy&lt;br /&gt;
     index_col=&amp;quot;Date&amp;quot;,  # choose this column as index&lt;br /&gt;
     na_values=[&amp;quot;&amp;lt;4&amp;quot;],  # values to treat as NA&lt;br /&gt;
 )&lt;br /&gt;
 df = df.rename(columns={&amp;quot;Deaths_New&amp;quot;: &amp;quot;Deaths_Covid&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # convert data upon loading&lt;br /&gt;
 # read only first 10 chars from 2021-04-29T12:15:00+02:00 -&amp;gt; 2021-04-29&lt;br /&gt;
 pd_date_converter = lambda x: (x[0:10])  # noqa: E731&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;,&amp;quot;,&lt;br /&gt;
     converters={&amp;quot;Datum&amp;quot;: pd_date_converter},&lt;br /&gt;
     parse_dates=[&lt;br /&gt;
         &amp;quot;Datum&amp;quot;,  # # convert to date object if format is yyyy-mm-dd&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
 # convert datetime to date after loading&lt;br /&gt;
 for c in (&amp;quot;Buchungstag&amp;quot;, &amp;quot;Valutadatum&amp;quot;):&lt;br /&gt;
     df2[c] = pd.to_datetime(df2[c], format=&amp;quot;%d.%m.%Y&amp;quot;).dt.date&lt;br /&gt;
&lt;br /&gt;
===from list[str] via read_csv===&lt;br /&gt;
 from io import StringIO&lt;br /&gt;
 &lt;br /&gt;
 csv_string_io = StringIO(&amp;quot;\n&amp;quot;.join(lines))&lt;br /&gt;
 df = pd.read_csv(&lt;br /&gt;
     csv_string_io,&lt;br /&gt;
     sep=&amp;quot;;&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 data_str = &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
 datetime_de	value&lt;br /&gt;
 30.09.2022 10:00	214516.10&lt;br /&gt;
 13.10.2022 00:20	214563.85&lt;br /&gt;
 &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
 df = pd.read_csv(StringIO(data_str), sep=&amp;quot;\t&amp;quot;, lineterminator=&amp;quot;\n&amp;quot;)&lt;br /&gt;
 df[&amp;quot;value&amp;quot;] = df[&amp;quot;value&amp;quot;].astype(float)&lt;br /&gt;
&lt;br /&gt;
===from Excel===&lt;br /&gt;
There are 2 libs: openpyxl (for read and write) and XlsxWriter (write)&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html]&lt;br /&gt;
 import openpyxl  # pip install openpyxl&lt;br /&gt;
 &lt;br /&gt;
 df = pd.read_excel(&amp;quot;file.xlsx&amp;quot;, sheet_name=&amp;quot;Sheet1&amp;quot;, engine=&amp;quot;openpyxl&amp;quot;)&lt;br /&gt;
 &lt;br /&gt;
 df = pd.read_excel(&lt;br /&gt;
     &amp;quot;file.xlsx&amp;quot;,&lt;br /&gt;
     skiprows=4,&lt;br /&gt;
     usecols=[&lt;br /&gt;
         &amp;quot;ArbPlatz&amp;quot;,&lt;br /&gt;
         &amp;quot;BezArt&amp;quot;,&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===from dict===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html]&lt;br /&gt;
 df = pd.DataFrame.from_dict(d, orient=&amp;quot;index&amp;quot;, columns=[&amp;quot;Col1&amp;quot;, &amp;quot;Col2&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
flatten dict in list of dict&lt;br /&gt;
 # l1 is list of dict, each dict has a sub-dict called metadata&lt;br /&gt;
 l2 = []&lt;br /&gt;
 for item in l1:&lt;br /&gt;
     flat = item.copy()&lt;br /&gt;
     meta = flat.pop(&amp;quot;metadata&amp;quot;)  # pop from the copy, so &amp;quot;metadata&amp;quot; is removed from flat&lt;br /&gt;
     flat.update(meta)&lt;br /&gt;
     l2.append(flat)&lt;br /&gt;
 del l1&lt;br /&gt;
 &lt;br /&gt;
 df = pd.DataFrame.from_dict(l2)&lt;br /&gt;
&lt;br /&gt;
===from array/record/list===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html]&lt;br /&gt;
 data = [ {&amp;quot;id&amp;quot;:1, &amp;quot;name&amp;quot;:&amp;quot;asdf&amp;quot;}, {&amp;quot;id&amp;quot;:2, &amp;quot;name&amp;quot;:&amp;quot;qwertz&amp;quot;}, ]&lt;br /&gt;
 df = pd.DataFrame.from_records(data)&lt;br /&gt;
&lt;br /&gt;
===from JSON===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.read_json.html]&lt;br /&gt;
 df = pd.read_json(&amp;quot;file.json&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===from SQLite===&lt;br /&gt;
 import sqlite3&lt;br /&gt;
 sql = &amp;quot;SELECT * FROM tab WHERE col2 = ? ORDER BY id&amp;quot;;&lt;br /&gt;
 df = pd.read_sql_query(sql, con, params=(MY_FILTER_VALUE,))&lt;br /&gt;
&lt;br /&gt;
==Export Data==&lt;br /&gt;
===to csv===&lt;br /&gt;
 df.to_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     lineterminator=&amp;quot;\n&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 # selected columns only&lt;br /&gt;
 df[ [&amp;quot;Col1&amp;quot;, &amp;quot;Col2&amp;quot;] ].to_csv(&lt;br /&gt;
     &amp;quot;data.tsv&amp;quot;,&lt;br /&gt;
     sep=&amp;quot;\t&amp;quot;,&lt;br /&gt;
     lineterminator=&amp;quot;\n&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
to string in CSV format&lt;br /&gt;
 l_csv = (&lt;br /&gt;
     df.drop(columns=[&amp;quot;hour&amp;quot;]).to_csv(&lt;br /&gt;
         index=False, sep=&amp;quot;\t&amp;quot;, lineterminator=&amp;quot;\n&amp;quot;, encoding=&amp;quot;utf-8&amp;quot;&lt;br /&gt;
     ),&lt;br /&gt;
 )&lt;br /&gt;
 s_csv = &amp;quot;&amp;quot;.join(l_csv)&lt;br /&gt;
&lt;br /&gt;
===to Excel===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html]&lt;br /&gt;
 df.to_excel(&amp;quot;data.xlsx&amp;quot;, index=False)&lt;br /&gt;
&lt;br /&gt;
Export 2 df as sheets in 1 file&lt;br /&gt;
 with pd.ExcelWriter(path=file_in.with_suffix(&amp;quot;.xlsx&amp;quot;)) as writer:  &lt;br /&gt;
     df1.to_excel(writer, sheet_name=&amp;#039;Sheet_name_1&amp;#039;)&lt;br /&gt;
     df2.to_excel(writer, sheet_name=&amp;#039;Sheet_name_2&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
===to HTML===&lt;br /&gt;
see [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html]&lt;br /&gt;
 df.to_html(&lt;br /&gt;
     &amp;quot;out.html&amp;quot;, index=False, render_links=False, escape=False, justify=&amp;quot;center&amp;quot;&lt;br /&gt;
     )&lt;br /&gt;
&lt;br /&gt;
 # html encoding of column name only&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = df[&amp;quot;name&amp;quot;].str.encode(&amp;quot;ascii&amp;quot;, &amp;quot;xmlcharrefreplace&amp;quot;).str.decode(&amp;quot;utf-8&amp;quot;)&lt;br /&gt;
 # add link to name&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = &amp;quot;&amp;lt;a href=&amp;#039;&amp;quot; + df[&amp;quot;url&amp;quot;] + &amp;quot;&amp;#039; target=&amp;#039;_blank&amp;#039; &amp;gt;&amp;quot; + df[&amp;quot;name&amp;quot;] + &amp;quot;&amp;lt;/a&amp;gt;&amp;quot;&lt;br /&gt;
 # export to html&lt;br /&gt;
 df[ [&amp;quot;name&amp;quot;, &amp;quot;due&amp;quot;, &amp;quot;overdue&amp;quot;, &amp;quot;priority&amp;quot;, &amp;quot;overdue priority&amp;quot;] ].to_html(&lt;br /&gt;
     &amp;quot;out.html&amp;quot;, index=False, render_links=False, escape=False, justify=&amp;quot;center&amp;quot;&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===to JSON===&lt;br /&gt;
 json_data = df.values.tolist()&lt;br /&gt;
 &lt;br /&gt;
 with Path(pathStatsExport / &amp;quot;ts_types_month.json&amp;quot;).open(&lt;br /&gt;
     &amp;quot;w&amp;quot;, encoding=&amp;quot;UTF-8&amp;quot;&lt;br /&gt;
 ) as fh:&lt;br /&gt;
     json.dump(&lt;br /&gt;
         json_data,&lt;br /&gt;
         fp=fh,&lt;br /&gt;
         ensure_ascii=False,&lt;br /&gt;
         sort_keys=False,&lt;br /&gt;
         indent=2,&lt;br /&gt;
     )&lt;br /&gt;
&lt;br /&gt;
==Modification==&lt;br /&gt;
===Add Row===&lt;br /&gt;
add row from list to end&lt;br /&gt;
 idx = df.index[-1] + 1&lt;br /&gt;
 list_of_values = (...)&lt;br /&gt;
 df.loc[idx] = list_of_values&lt;br /&gt;
&lt;br /&gt;
add dummy row for missing 1.1.2020 and reindex&lt;br /&gt;
 df.loc[-1] = &amp;quot;2020-01-01&amp;quot;, 0&lt;br /&gt;
 df.index = df.index + 1  # shifting index&lt;br /&gt;
 df = df.sort_index()  # sorting by index&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
===Add column based on other columns===&lt;br /&gt;
 df[&amp;quot;URL&amp;quot;] = df.apply(&lt;br /&gt;
     lambda row: f&amp;quot;https://www.openstreetmap.org/?mlat={row[&amp;#039;Lat&amp;#039;]}&amp;amp;mlon={row[&amp;#039;Lng&amp;#039;]}#map={zoom}/{row[&amp;#039;Lat&amp;#039;]}/{row[&amp;#039;Lng&amp;#039;]}&amp;quot;,&lt;br /&gt;
     axis=1,&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===Apply map/dict id-&amp;gt;name===&lt;br /&gt;
 df[&amp;quot;x_gear_name&amp;quot;] = df[&amp;quot;gear_id&amp;quot;].map(d_id_name)&lt;br /&gt;
&lt;br /&gt;
===Replace na Values===&lt;br /&gt;
 df = df.dropna() # removes missing values&lt;br /&gt;
 # only in certain columns&lt;br /&gt;
 df = df.dropna(subset=[&amp;quot;power&amp;quot;, &amp;quot;heart_rate&amp;quot;, &amp;quot;cadence&amp;quot;])&lt;br /&gt;
 df = df[df[&amp;quot;value&amp;quot;].notna()]&lt;br /&gt;
&lt;br /&gt;
===Interpolate missing values===&lt;br /&gt;
 # interpolate missing data, e.g. after df.groupby([pd.Grouper(key=&amp;quot;time&amp;quot;, freq=&amp;quot;1min&amp;quot;)])&lt;br /&gt;
 df[&amp;quot;kWh_total_in&amp;quot;] = df[&amp;quot;kWh_total_in&amp;quot;].interpolate(method=&amp;quot;linear&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===Calc Diff / Delta of rows===&lt;br /&gt;
 df[&amp;quot;delta_prev&amp;quot;] = df[&amp;quot;kWh&amp;quot;].diff()&lt;br /&gt;
 df[&amp;quot;delta_next&amp;quot;] = df[&amp;quot;kWh&amp;quot;].shift(-1) - df[&amp;quot;kWh&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
===Overwrite Data===&lt;br /&gt;
 # overwrite values &amp;gt; 123 by 123 etc.&lt;br /&gt;
 df[&amp;quot;Lat&amp;quot;] = df[&amp;quot;Lat&amp;quot;].clip(lower=-90, upper=90)&lt;br /&gt;
 df[&amp;quot;Lng&amp;quot;] = df[&amp;quot;Lng&amp;quot;].clip(lower=-180, upper=180)&lt;br /&gt;
 &lt;br /&gt;
 # set value of columns based on condition&lt;br /&gt;
 # set &amp;quot;type&amp;quot; = &amp;quot;Ride&amp;quot; where &amp;quot;type&amp;quot; == &amp;quot;VirtualRide&amp;quot;&lt;br /&gt;
 df.loc[df[&amp;quot;type&amp;quot;] == &amp;quot;VirtualRide&amp;quot;, &amp;quot;type&amp;quot;] = &amp;quot;Ride&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # negative -&amp;gt; 0&lt;br /&gt;
 df[df &amp;lt; 0] = 0&lt;br /&gt;
 &lt;br /&gt;
 # overwrite column data of last 3 weeks by None&lt;br /&gt;
 df[&amp;quot;DateAsDate&amp;quot;] = pd.to_datetime(df[&amp;quot;Date&amp;quot;], format=&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 date_3w = dt.date.today() - dt.timedelta(weeks=3)&lt;br /&gt;
 df.loc[df[&amp;quot;DateAsDate&amp;quot;].dt.date &amp;gt;= date_3w, &amp;quot;MyColumn&amp;quot;] = None&lt;br /&gt;
  &lt;br /&gt;
  # rolling takes NAN values into account, so I need to overwrite them as well&lt;br /&gt;
 df3[&amp;quot;Deaths_Covid_roll&amp;quot;] = np.where(&lt;br /&gt;
     df3[&amp;quot;Deaths_Covid&amp;quot;].isnull(), np.nan, df3[&amp;quot;Deaths_Covid_roll&amp;quot;]&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 df.loc[(df[&amp;quot;Kat2&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;IBAN&amp;quot;] == &amp;quot;DE02100100100152517108&amp;quot;), &amp;quot;Kat2&amp;quot;] = &amp;quot;Bahn&amp;quot;&lt;br /&gt;
 df.loc[(df[&amp;quot;Kat1&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;Kat2&amp;quot;] == &amp;quot;Bahn&amp;quot;), &amp;quot;Kat1&amp;quot;] = &amp;quot;Mobilität&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # assigning multiple values&lt;br /&gt;
 df.loc[&lt;br /&gt;
     (df[&amp;quot;Kat1&amp;quot;] == &amp;quot;&amp;quot;) &amp;amp; (df[&amp;quot;IBAN&amp;quot;] == &amp;quot;DE02100100100152517108&amp;quot;),&lt;br /&gt;
     [&amp;quot;Kat1&amp;quot;, &amp;quot;Kat2&amp;quot;, &amp;quot;Kat3&amp;quot;],&lt;br /&gt;
 ] = [&amp;quot;Mobilität&amp;quot;, &amp;quot;Reisen&amp;quot;, &amp;quot;Bahn&amp;quot;]&lt;br /&gt;
 &lt;br /&gt;
 # str endswith&lt;br /&gt;
 df.loc[&lt;br /&gt;
     (df[&amp;quot;RespTime&amp;quot;] == 0.0)&lt;br /&gt;
     &amp;amp; (df[&amp;quot;Text&amp;quot;].str.endswith(&amp;quot;(read timeout=30)&amp;quot;)),&lt;br /&gt;
     &amp;quot;RespTime&amp;quot;,&lt;br /&gt;
 ] = 30.0&lt;br /&gt;
&lt;br /&gt;
==Read-out Data==&lt;br /&gt;
===Columns===&lt;br /&gt;
 df2 = df [ [ &amp;quot;col1&amp;quot;, &amp;quot;col2&amp;quot; ] ]&lt;br /&gt;
&lt;br /&gt;
===Rows===&lt;br /&gt;
 for index, row in df.iterrows():&lt;br /&gt;
&lt;br /&gt;
===extract cell based on Index===&lt;br /&gt;
 col1_first = df2[&amp;quot;col1&amp;quot;].iloc[0]&lt;br /&gt;
 col1_last = float(df[&amp;quot;col1&amp;quot;].iloc[-1]) &lt;br /&gt;
 de_sum = df[&amp;quot;col&amp;quot;].loc[&amp;quot;Summe&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
===Loop over DF===&lt;br /&gt;
 for row in df.itertuples():&lt;br /&gt;
     assert type(row.start_date) is dt.datetime&lt;br /&gt;
     assert type(row.utc_offset) is int&lt;br /&gt;
     start_date = row.start_date - dt.timedelta(seconds=row.utc_offset)&lt;br /&gt;
&lt;br /&gt;
==Aggregation==&lt;br /&gt;
===Sum Column or Row===&lt;br /&gt;
 # sum of 1 column&lt;br /&gt;
 sum_cases = df[&amp;quot;Cases&amp;quot;].sum()&lt;br /&gt;
 # sum per row (sums across the columns)&lt;br /&gt;
 df_sums = df.sum(axis=&amp;quot;columns&amp;quot;)&lt;br /&gt;
 # sum per column (sums down the rows / index)&lt;br /&gt;
 df_sums = df.sum(axis=&amp;quot;index&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
==Filtering==&lt;br /&gt;
===Basics===&lt;br /&gt;
 # new notation&lt;br /&gt;
 var = 40&lt;br /&gt;
 df = df.query(&amp;quot;power &amp;gt;= 80 &amp;amp; cadence &amp;gt;= @var&amp;quot;)&lt;br /&gt;
 &lt;br /&gt;
 # old notation&lt;br /&gt;
 df = df[df[&amp;quot;x_start_locality&amp;quot;].isna() &amp;amp; df[&amp;quot;start_latlng&amp;quot;].notna()]&lt;br /&gt;
&lt;br /&gt;
 # Remove some cat&lt;br /&gt;
 df = df[~df[&amp;quot;Kategorie&amp;quot;].isin(KAT_REMOVE)]&lt;br /&gt;
 &lt;br /&gt;
 # Remove missing date&lt;br /&gt;
 df = df[df[&amp;quot;Datum&amp;quot;].notna()]&lt;br /&gt;
 &lt;br /&gt;
 df = df[ df[&amp;quot;power&amp;quot;] &amp;gt;= 80 ]&lt;br /&gt;
 df = df[ (df[&amp;quot;power&amp;quot;] &amp;gt;= 80) &amp;amp; (df[&amp;quot;cadence&amp;quot;] &amp;gt;= 40) ]&lt;br /&gt;
 # ~ inverts&lt;br /&gt;
 # list of multiple values&lt;br /&gt;
 df = df[ ~df[&amp;quot;col1&amp;quot;].isin((&amp;quot;A&amp;quot;, &amp;quot;B&amp;quot;, &amp;quot;C&amp;quot;)) ]&lt;br /&gt;
&lt;br /&gt;
===filter on index===&lt;br /&gt;
 df = df[df.index &amp;gt;= start_yearweek]&lt;br /&gt;
 &lt;br /&gt;
 df = df.drop(&amp;quot;Summe&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===Filtering Dates=== &lt;br /&gt;
 # filter a date column via year&lt;br /&gt;
 df = df[df[&amp;quot;Date&amp;quot;].dt.year == 2020]&lt;br /&gt;
 # filter a date via str&lt;br /&gt;
 df = df[df[&amp;quot;Date&amp;quot;] == &amp;quot;2021-11-13&amp;quot;]&lt;br /&gt;
 # filter on date via dt&lt;br /&gt;
 df = df[&lt;br /&gt;
     df[&amp;quot;RECEIVE_DATE&amp;quot;].dt.date # convert pandas datetime64[ns] to date&lt;br /&gt;
     &amp;gt;= (dt.date.today() - dt.timedelta(days=30))&lt;br /&gt;
 ]&lt;br /&gt;
&lt;br /&gt;
===Drop Duplicate Rows===&lt;br /&gt;
 df = df.drop_duplicates()&lt;br /&gt;
&lt;br /&gt;
===Distinct values of column as list===&lt;br /&gt;
 gear_ids = df[&amp;quot;gear_id&amp;quot;].unique()&lt;br /&gt;
&lt;br /&gt;
==Date Handling==&lt;br /&gt;
===Dates===&lt;br /&gt;
====Create Date Range====&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2023-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;D&amp;quot;)})&lt;br /&gt;
 df = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2023-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;W&amp;quot;)})&lt;br /&gt;
&lt;br /&gt;
====Convert Str Column to Date====&lt;br /&gt;
 # str to datetime&lt;br /&gt;
 df[&amp;quot;Date&amp;quot;] = pd.to_datetime(df[&amp;quot;Date&amp;quot;], format=&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 # datetime to date&lt;br /&gt;
 df[&amp;quot;Date&amp;quot;] = df[&amp;quot;Date&amp;quot;].dt.date&lt;br /&gt;
&lt;br /&gt;
====Extract: year, month, quarter, week, weekstart====&lt;br /&gt;
 df[&amp;quot;year&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.year&lt;br /&gt;
 df[&amp;quot;month&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.month&lt;br /&gt;
 df[&amp;quot;quarter&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.quarter&lt;br /&gt;
 df[&amp;quot;week&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.isocalendar().week&lt;br /&gt;
 df[&amp;quot;weekyear&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.isocalendar().year&lt;br /&gt;
 # fix week 53 (when not using weekyear)&lt;br /&gt;
 # df[&amp;quot;x_week&amp;quot;] = df[&amp;quot;x_week&amp;quot;].clip(upper=52)&lt;br /&gt;
 df.loc[(df[&amp;quot;week&amp;quot;] == 53) &amp;amp; (df[&amp;quot;month&amp;quot;] == 1), &amp;quot;x_week&amp;quot;] = 1&lt;br /&gt;
 df.loc[(df[&amp;quot;week&amp;quot;] == 53) &amp;amp; (df[&amp;quot;month&amp;quot;] == 12), &amp;quot;x_week&amp;quot;] = 52&lt;br /&gt;
 &lt;br /&gt;
 # week_start from date&lt;br /&gt;
 df[&amp;quot;week_start&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.to_period(&amp;quot;W&amp;quot;).apply(lambda r: r.start_time)&lt;br /&gt;
 # week_start from week number&lt;br /&gt;
 df[&amp;quot;week_start&amp;quot;] = pd.to_datetime(&lt;br /&gt;
     df.apply(lambda row: dt.date.fromisocalendar(row[&amp;quot;year&amp;quot;], row[&amp;quot;week&amp;quot;], 1), axis=1)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===DateTime===&lt;br /&gt;
====Convert String to Datetime and Timezone Handling====&lt;br /&gt;
 # string to datetime, converting to UTC for future compatibility&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = pd.to_datetime(df[&amp;quot;date&amp;quot;], format=&amp;quot;ISO8601&amp;quot;, utc=True)&lt;br /&gt;
 # convert from utc to local timezone&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.tz_convert(tz=&amp;quot;Europe/Berlin&amp;quot;)&lt;br /&gt;
 # drop timezone info, since Excel can not handle it&lt;br /&gt;
 df[&amp;quot;date&amp;quot;] = df[&amp;quot;date&amp;quot;].dt.tz_localize(None)&lt;br /&gt;
&lt;br /&gt;
====DateTime: remove milliseconds====&lt;br /&gt;
 df[&amp;quot;DateTime&amp;quot;] = df[&amp;quot;DateTime&amp;quot;].dt.ceil(freq=&amp;quot;s&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
====DateTime: extract date and add offset, depending on time====&lt;br /&gt;
 # values before 15:00 shall be treated end of last day value&lt;br /&gt;
 df[&amp;quot;day&amp;quot;] = df[&amp;quot;dt&amp;quot;].dt.date&lt;br /&gt;
 df.loc[df[&amp;quot;dt&amp;quot;].dt.hour &amp;lt; 15, &amp;quot;day&amp;quot;] = df[&amp;quot;dt&amp;quot;].dt.date - pd.Timedelta(days=1)  # noqa: PLR2004&lt;br /&gt;
&lt;br /&gt;
===TimeStamps===&lt;br /&gt;
===Convert Timestamp to Datetime and use as Index===&lt;br /&gt;
 df[&amp;quot;datetime&amp;quot;] = pd.to_datetime(  # convert to datetime&lt;br /&gt;
     df[&amp;quot;start_ts&amp;quot;],&lt;br /&gt;
     unit=&amp;quot;s&amp;quot;,  # timestamp is in seconds&lt;br /&gt;
     utc=True,  # timestamp is in UTC&lt;br /&gt;
 ).dt.tz_convert(  # convert to local TZ&lt;br /&gt;
     &amp;quot;Europe/Berlin&amp;quot;&lt;br /&gt;
 )&lt;br /&gt;
 df = df.set_index(&amp;quot;datetime&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
====Convert Datetime to Timestamp====&lt;br /&gt;
 df[&amp;quot;timestamp&amp;quot;] = df[&amp;quot;datetime&amp;quot;].values.astype(np.int64) // 10**9&lt;br /&gt;
 # or&lt;br /&gt;
 df[&amp;quot;timestamp&amp;quot;] = df[&amp;quot;datetime&amp;quot;].astype(int) // 10**9&lt;br /&gt;
 # // is integer division&lt;br /&gt;
&lt;br /&gt;
====Timestamp: subtract first====&lt;br /&gt;
 # calc elapsed time&lt;br /&gt;
 df[&amp;quot;seconds&amp;quot;] = df[&amp;quot;timestamp&amp;quot;] - df[&amp;quot;timestamp&amp;quot;].iloc[0]&lt;br /&gt;
&lt;br /&gt;
==String Modifications==&lt;br /&gt;
===Replace===&lt;br /&gt;
 df[&amp;quot;name&amp;quot;] = df[&amp;quot;name&amp;quot;].str.strip() # trim whitespaces&lt;br /&gt;
 df[&amp;quot;time&amp;quot;] = df[&amp;quot;time&amp;quot;].str.replace(r&amp;quot;\..*$&amp;quot;, &amp;quot;Z&amp;quot;, regex=True)&lt;br /&gt;
&lt;br /&gt;
 # remove word &amp;quot;Probable&amp;quot;&lt;br /&gt;
 df[&amp;quot;col&amp;quot;] = df[&amp;quot;col&amp;quot;].replace(&lt;br /&gt;
     to_replace=r&amp;quot;^Probable &amp;quot;, value=&amp;quot;&amp;quot;, regex=True&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
 df[&amp;quot;Text&amp;quot;] = df[&amp;quot;Text&amp;quot;].str.replace(&lt;br /&gt;
     r&amp;quot;^Menke, *Torben *&amp;quot;,&lt;br /&gt;
     &amp;quot;Torben Menke&amp;quot;,&lt;br /&gt;
     regex=True,&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
===split string===&lt;br /&gt;
 df [ [ &amp;quot;week&amp;quot;, &amp;quot;year&amp;quot; ] ] = df[&amp;quot;Kalenderwoche&amp;quot;].str.split(&amp;quot;/&amp;quot;, expand=True)&lt;br /&gt;
&lt;br /&gt;
===string &amp;lt;-&amp;gt; number===&lt;br /&gt;
string to float: 1.234,56 -&amp;gt; 1234.56&lt;br /&gt;
 df[&amp;quot;Euro&amp;quot;] = (&lt;br /&gt;
     df[&amp;quot;Euro&amp;quot;]&lt;br /&gt;
     .str.replace(&amp;quot;.&amp;quot;, &amp;quot;&amp;quot;)&lt;br /&gt;
     .str.replace(&amp;quot;,&amp;quot;, &amp;quot;.&amp;quot;)&lt;br /&gt;
     .astype(float)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
convert int to str adding leading zeros&lt;br /&gt;
 df[&amp;quot;Sterbewoche&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
&lt;br /&gt;
capitalization&lt;br /&gt;
 df[&amp;quot;Wer&amp;quot;] = df[&amp;quot;Wer&amp;quot;].apply(lambda x: string.capwords(x))&lt;br /&gt;
&lt;br /&gt;
trim spaces&lt;br /&gt;
 df[&amp;quot;Wer&amp;quot;] = df[&amp;quot;Wer&amp;quot;].str.strip()&lt;br /&gt;
&lt;br /&gt;
===count word per row and check if more than one===&lt;br /&gt;
str.count()&lt;br /&gt;
 df[&amp;quot;cnt_Buchungstext&amp;quot;] = df[&amp;quot;Text&amp;quot;].str.count(&amp;quot;word&amp;quot;)&lt;br /&gt;
 df_search = df[df[&amp;quot;cnt_Buchungstext&amp;quot;] != 1]&lt;br /&gt;
 if len(df_search) &amp;gt; 0:&lt;br /&gt;
     print(df_search)&lt;br /&gt;
 del df_search&lt;br /&gt;
 df = df.drop(columns=[&amp;quot;cnt_Buchungstext&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
==Sorting / order by==&lt;br /&gt;
 # sort by column&lt;br /&gt;
 df = df.sort_values(by=[&amp;#039;betten_belegt&amp;#039;], ascending=False)&lt;br /&gt;
 df = df.sort_values(by=[&amp;quot;count&amp;quot;, &amp;quot;query&amp;quot;], ascending=[False, True])&lt;br /&gt;
 df = df.sort_values(by=[&amp;quot;Buchungstag&amp;quot;, &amp;quot;Text&amp;quot;], ignore_index=True)&lt;br /&gt;
&lt;br /&gt;
===Top10===&lt;br /&gt;
 df_top_ten = (&lt;br /&gt;
     df.sort_values(by=&amp;quot;count&amp;quot;, ascending=False)&lt;br /&gt;
     .head(10)&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
==Group By / Top10==&lt;br /&gt;
 df2 = (&lt;br /&gt;
     df.groupby(&amp;quot;sender&amp;quot;)&lt;br /&gt;
     .size()&lt;br /&gt;
     .to_frame(name=&amp;quot;count&amp;quot;)&lt;br /&gt;
     .sort_values(&amp;quot;count&amp;quot;, ascending=False)&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 df2 = (&lt;br /&gt;
     df[ [ &amp;quot;name&amp;quot;, &amp;quot;amount&amp;quot; ] ]&lt;br /&gt;
     .groupby(&amp;quot;name&amp;quot;)&lt;br /&gt;
     .agg(amountSum=(&amp;quot;amount&amp;quot;, &amp;quot;sum&amp;quot;), count=(&amp;quot;name&amp;quot;, &amp;quot;count&amp;quot;))&lt;br /&gt;
     .sort_values(by=[&amp;quot;amountSum&amp;quot;, &amp;quot;count&amp;quot;], ascending=False)&lt;br /&gt;
 )&lt;br /&gt;
 print(df2.head(10)) # top10&lt;br /&gt;
 &lt;br /&gt;
 # resort by count&lt;br /&gt;
 print(df2.sort_values(by=&amp;quot;count&amp;quot;, ascending=False).head(10))&lt;br /&gt;
&lt;br /&gt;
===Group by year, quarter, month, week using count and add missing===&lt;br /&gt;
Simple: add missing years for multi index&lt;br /&gt;
 df1 = pd.DataFrame(&lt;br /&gt;
     columns=[&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;, &amp;quot;count&amp;quot;],&lt;br /&gt;
     data=[&lt;br /&gt;
         (2000, &amp;quot;Run&amp;quot;, 1),&lt;br /&gt;
         (2003, &amp;quot;Ride&amp;quot;, 2),&lt;br /&gt;
         (2003, &amp;quot;Swim&amp;quot;, 1),&lt;br /&gt;
     ],&lt;br /&gt;
 )&lt;br /&gt;
 df2 = pd.DataFrame(&lt;br /&gt;
     data={&lt;br /&gt;
         &amp;quot;year&amp;quot;: range(df1[&amp;quot;year&amp;quot;].min(), df1[&amp;quot;year&amp;quot;].max() + 1),&lt;br /&gt;
         &amp;quot;type&amp;quot;: &amp;quot;Run&amp;quot;,&lt;br /&gt;
         &amp;quot;count&amp;quot;: 0,&lt;br /&gt;
     }&lt;br /&gt;
 ).set_index([&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;])&lt;br /&gt;
 df1 = df1.set_index([&amp;quot;year&amp;quot;, &amp;quot;type&amp;quot;])&lt;br /&gt;
 df = df1.add(df2, fill_value=0).reset_index()&lt;br /&gt;
&lt;br /&gt;
Advanced&lt;br /&gt;
 # df has columns: date and derived year, quarter, month, week, see above&lt;br /&gt;
 # group by year, quarter, month, week, with count as aggregation and gap-filling&lt;br /&gt;
 year_min, year_max = df[&amp;quot;year&amp;quot;].min, df[&amp;quot;year&amp;quot;].max&lt;br /&gt;
 # year&lt;br /&gt;
 df_year = (&lt;br /&gt;
     df[ [ &amp;quot;year&amp;quot;, &amp;quot;date&amp;quot; ] ].groupby([&amp;quot;year&amp;quot;]).count().rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_year = df_year.reindex(range(year_min(), year_max() + 1), fill_value=None)&lt;br /&gt;
 df_year = df_year.reset_index().rename(columns={&amp;quot;year&amp;quot;: &amp;quot;date-grouping&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # quarter&lt;br /&gt;
 df_quarter = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 5)], names=[&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.reset_index()&lt;br /&gt;
 df_quarter[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_quarter[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-Q&amp;quot; + df_quarter[&amp;quot;quarter&amp;quot;].astype(str)&lt;br /&gt;
 )&lt;br /&gt;
 df_quarter = df_quarter.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;quarter&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 # month&lt;br /&gt;
 df_month = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 13)], names=[&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.reset_index()&lt;br /&gt;
 df_month[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_month[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-&amp;quot; + df_month[&amp;quot;month&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
 )&lt;br /&gt;
 df_month = df_month.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 # week&lt;br /&gt;
 df_week = (&lt;br /&gt;
     df[ [&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;, &amp;quot;date&amp;quot;] ]&lt;br /&gt;
     .groupby([&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;])&lt;br /&gt;
     .count()&lt;br /&gt;
     .rename(columns={&amp;quot;date&amp;quot;: &amp;quot;count&amp;quot;})&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [range(year_min(), year_max() + 1), range(1, 53)], names=[&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;]&lt;br /&gt;
     ),&lt;br /&gt;
     fill_value=None,&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.reset_index()&lt;br /&gt;
 df_week[&amp;quot;date-grouping&amp;quot;] = (&lt;br /&gt;
     df_week[&amp;quot;year&amp;quot;].astype(str) + &amp;quot;-&amp;quot; + df_week[&amp;quot;week&amp;quot;].astype(str).str.zfill(2)&lt;br /&gt;
 )&lt;br /&gt;
 df_week = df_week.drop(columns=[&amp;quot;year&amp;quot;, &amp;quot;week&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
===Group by Type and Date===&lt;br /&gt;
 # group by hour&lt;br /&gt;
 # truncate min and sec data&lt;br /&gt;
 df[&amp;quot;hour&amp;quot;] = pd.to_datetime(df[&amp;quot;time&amp;quot;]).dt.floor(&amp;quot;H&amp;quot;)&lt;br /&gt;
 # sum per hour&lt;br /&gt;
 df2 = df[ [&amp;quot;hour&amp;quot;, &amp;quot;amount&amp;quot;] ].groupby(&amp;quot;hour&amp;quot;).agg(amountSum=(&amp;quot;amount&amp;quot;, &amp;quot;sum&amp;quot;))&lt;br /&gt;
 df2.plot()&lt;br /&gt;
 plt.show()&lt;br /&gt;
&lt;br /&gt;
 # Datetime Index Grouping&lt;br /&gt;
 df = df.groupby(pd.Grouper(freq=&amp;quot;5min&amp;quot;, offset=&amp;quot;00h00min&amp;quot;)).max()&lt;br /&gt;
 &lt;br /&gt;
 # Group by type and date of month start&lt;br /&gt;
 df_month = df.groupby([&amp;quot;type&amp;quot;, pd.Grouper(key=&amp;quot;date&amp;quot;, freq=&amp;quot;MS&amp;quot;)]).agg(&lt;br /&gt;
     {&amp;quot;id&amp;quot;: &amp;quot;count&amp;quot;, &amp;quot;minutes&amp;quot;: &amp;quot;sum&amp;quot;}&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # generate index of the desired month-freq:&lt;br /&gt;
 idx = pd.date_range(&lt;br /&gt;
     start=df[&amp;quot;date&amp;quot;].min().replace(day=1),&lt;br /&gt;
     end=df[&amp;quot;date&amp;quot;].max().replace(day=1),&lt;br /&gt;
     freq=&amp;quot;MS&amp;quot;,  # MS = Month Start&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # add missing months&lt;br /&gt;
 df_month = df_month.reindex(&lt;br /&gt;
     pd.MultiIndex.from_product(&lt;br /&gt;
         [df_month.index.get_level_values(&amp;quot;type&amp;quot;), idx],&lt;br /&gt;
         names=[&amp;quot;type&amp;quot;, &amp;quot;date&amp;quot;],&lt;br /&gt;
     )&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # fill missing by 0 and convert count back to type int&lt;br /&gt;
 df_month = df_month.fillna(0).astype({&amp;quot;count&amp;quot;: int})&lt;br /&gt;
&lt;br /&gt;
====Group Datetime by Time of Day only====&lt;br /&gt;
 df[&amp;quot;Time&amp;quot;] = df[&amp;quot;Date&amp;quot;].dt.round(&amp;quot;5min&amp;quot;).dt.time&lt;br /&gt;
 &lt;br /&gt;
 df_grouped = df [ [&amp;quot;RespTime&amp;quot;, &amp;quot;Time&amp;quot; ] ].groupby(&amp;quot;Time&amp;quot;).mean()&lt;br /&gt;
&lt;br /&gt;
====Group and calculate min,avg,max====&lt;br /&gt;
 df_grouped = (&lt;br /&gt;
     df[ [&amp;quot;Time&amp;quot;, &amp;quot;RespTime&amp;quot;] ]&lt;br /&gt;
     .groupby(&amp;quot;Time&amp;quot;)&lt;br /&gt;
     .agg(&lt;br /&gt;
         max=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;max&amp;quot;),&lt;br /&gt;
         avg=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;mean&amp;quot;),&lt;br /&gt;
         min=pd.NamedAgg(column=&amp;quot;RespTime&amp;quot;, aggfunc=&amp;quot;min&amp;quot;),&lt;br /&gt;
     )&lt;br /&gt;
 )&lt;br /&gt;
&lt;br /&gt;
==Column Handling==&lt;br /&gt;
===Renaming===&lt;br /&gt;
 df = df.rename(&lt;br /&gt;
      columns = {&amp;quot;invasiv_beatmet&amp;quot;: &amp;quot;beatmet&amp;quot;,}, &lt;br /&gt;
      errors=&amp;quot;raise&amp;quot;,&lt;br /&gt;
      )&lt;br /&gt;
&lt;br /&gt;
rename column headers by extracting some int values from a string&lt;br /&gt;
 l2 = []&lt;br /&gt;
 for col in df.columns:&lt;br /&gt;
     year = int(col[0:4])&lt;br /&gt;
     week = int(col[5:7])&lt;br /&gt;
     l2.append(year * 100 + week)&lt;br /&gt;
 df.columns = l2&lt;br /&gt;
&lt;br /&gt;
===Dropping===&lt;br /&gt;
drop columns&lt;br /&gt;
 df = df.drop(columns=[&amp;quot;Sterbejahr&amp;quot;, &amp;quot;Sterbewoche&amp;quot;])&lt;br /&gt;
&lt;br /&gt;
===Change Column Order===&lt;br /&gt;
 df = df.reindex(&lt;br /&gt;
     [&lt;br /&gt;
         &amp;quot;Wann&amp;quot;,&lt;br /&gt;
         &amp;quot;Art&amp;quot;,&lt;br /&gt;
         &amp;quot;Wer&amp;quot;,&lt;br /&gt;
     ],&lt;br /&gt;
     axis=&amp;quot;columns&amp;quot;,&lt;br /&gt;
 )&lt;br /&gt;
 &lt;br /&gt;
 # column reorder&lt;br /&gt;
 cols_first = [&amp;quot;year&amp;quot;, &amp;quot;month&amp;quot;, &amp;quot;week&amp;quot;, &amp;quot;weekyear&amp;quot;]&lt;br /&gt;
 cols_last = [col for col in df.columns if col not in cols_first]&lt;br /&gt;
 cols_first.extend(cols_last)&lt;br /&gt;
 df = df[cols_first]&lt;br /&gt;
 del cols_first, cols_last&lt;br /&gt;
 &lt;br /&gt;
 # same but different&lt;br /&gt;
 first_columns = (&amp;quot;description&amp;quot;, &amp;quot;isin&amp;quot;, &amp;quot;side&amp;quot;, &amp;quot;quantity&amp;quot;, &amp;quot;Price&amp;quot;, &amp;quot;amount&amp;quot;)&lt;br /&gt;
 i = 0&lt;br /&gt;
 for col in first_columns:&lt;br /&gt;
     df.insert(i, col, df.pop(col))&lt;br /&gt;
     i += 1&lt;br /&gt;
 del i&lt;br /&gt;
&lt;br /&gt;
==Index Handling==&lt;br /&gt;
select column as index&lt;br /&gt;
 df = df.set_index(&amp;quot;Date&amp;quot;)&lt;br /&gt;
move date from index back to column&lt;br /&gt;
 df = df.reset_index()&lt;br /&gt;
&lt;br /&gt;
rename index&lt;br /&gt;
 df.index.name = &amp;quot;Date&amp;quot;&lt;br /&gt;
&lt;br /&gt;
reset index to start at 0&lt;br /&gt;
 df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)&lt;br /&gt;
&lt;br /&gt;
int index to str index&lt;br /&gt;
 df.index = df.index.map(str)&lt;br /&gt;
&lt;br /&gt;
===text indexes===&lt;br /&gt;
 df = df.set_index(&amp;quot;Altersgruppe&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
===datetime indexes===&lt;br /&gt;
 assert type(df.index) is pd.DatetimeIndex&lt;br /&gt;
 df.index = df.index.tz_convert(&amp;quot;Europe/Berlin&amp;quot;)&lt;br /&gt;
 df.index = df.index.tz_convert(None) # to remove tz info&lt;br /&gt;
 &lt;br /&gt;
 # select &amp;quot;Date&amp;quot; column as index&lt;br /&gt;
 df = df.set_index([&amp;quot;Date&amp;quot;])&lt;br /&gt;
 &lt;br /&gt;
 # convert index to datetime &lt;br /&gt;
 df.index = pd.to_datetime(df.index)&lt;br /&gt;
 &lt;br /&gt;
 # add missing dates&lt;br /&gt;
 df = df.reindex(&lt;br /&gt;
     pd.date_range(df.index.min(), df.index.max(), freq=&amp;quot;D&amp;quot;), fill_value=0&lt;br /&gt;
 )&lt;br /&gt;
 df.index.name = &amp;quot;date&amp;quot;&lt;br /&gt;
 &lt;br /&gt;
 # remove timezone offset (as Excel cannot handle it)&lt;br /&gt;
 df.index = df.index.tz_localize(None)&lt;br /&gt;
  &lt;br /&gt;
 date_last = df.index[-1]&lt;br /&gt;
&lt;br /&gt;
 # reindex and fill missing with 0&lt;br /&gt;
 date_last = pd.to_datetime(df.index[-1]).date()&lt;br /&gt;
 idx = pd.date_range(&amp;#039;2020-01-01&amp;#039;, date_last)&lt;br /&gt;
 df = df.reindex(idx, fill_value=0)&lt;br /&gt;
 &lt;br /&gt;
 # add missing dates&lt;br /&gt;
 df = df.asfreq(&amp;#039;D&amp;#039;, fill_value=0)&lt;br /&gt;
 df = df.asfreq(freq=&amp;quot;5M&amp;quot;)&lt;br /&gt;
 df = df.sort_index() # needed?&lt;br /&gt;
 &lt;br /&gt;
 # drop values of column for last 3 weeks&lt;br /&gt;
 date_3w = dt.date.today() - dt.timedelta(weeks=3)&lt;br /&gt;
 df.loc[df.index.date &amp;gt;= date_3w, &amp;quot;Cases&amp;quot;] = None&lt;br /&gt;
 # or&lt;br /&gt;
 df.loc[df.index.date &amp;lt; pd.to_datetime(&amp;quot;2020-03-01&amp;quot;), &amp;quot;Value&amp;quot;] = None&lt;br /&gt;
&lt;br /&gt;
====filter on date index====&lt;br /&gt;
 # drop data prior to 2020&lt;br /&gt;
 df = df.loc[&amp;#039;2020-01-01&amp;#039;:]&lt;br /&gt;
 # alternative:&lt;br /&gt;
 df = df[df.index &amp;gt;= &amp;quot;2021-01-10&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
==Pivot, UnPivot/Melt==&lt;br /&gt;
===Pivot===&lt;br /&gt;
 # df has columns &amp;quot;key&amp;quot; (column that shall be the index) , &amp;quot;year&amp;quot; (column you want to convert to multiple columns), &amp;quot;value&amp;quot; (the numbers to aggregate)&lt;br /&gt;
 # convert to wide format with id as columns&lt;br /&gt;
 df2 = df.pivot_table(index=&amp;quot;key&amp;quot;, columns=&amp;quot;year&amp;quot;, values=&amp;quot;value&amp;quot;, aggfunc=&amp;quot;first&amp;quot;) # aggfunc=&amp;quot;sum&amp;quot;&lt;br /&gt;
&lt;br /&gt;
===melt/unpivot wide table to long table format===&lt;br /&gt;
 df2 = pd.melt(df, id_vars=&amp;quot;Date&amp;quot;, value_vars=df.columns[1:])  #&lt;br /&gt;
 df2 = df2.rename(columns={&amp;quot;variable&amp;quot;: &amp;quot;Machine&amp;quot;, &amp;quot;value&amp;quot;: &amp;quot;Status&amp;quot;})&lt;br /&gt;
 &lt;br /&gt;
 # not working, using melt instead&lt;br /&gt;
 # df3 = pd.wide_to_long(df, stubnames=&amp;quot;Date&amp;quot;, i=df.columns[1:], j=&amp;quot;Status&amp;quot;)&lt;br /&gt;
&lt;br /&gt;
==Merge/Concat and Append DFs==&lt;br /&gt;
===Merge/Concat===&lt;br /&gt;
 df_sum = pd.DataFrame()&lt;br /&gt;
 df_sum = pd.concat(&lt;br /&gt;
     [df_sum, df],&lt;br /&gt;
     ignore_index=True,&lt;br /&gt;
 )&lt;br /&gt;
===Append===&lt;br /&gt;
 # merge columns from different dfs, when have matching index, e.g. user_id&lt;br /&gt;
 df = pd.concat([df1, df2], axis=1)&lt;br /&gt;
 &lt;br /&gt;
 # append df2 to end of df1&lt;br /&gt;
 df1 = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2020-01&amp;quot;, end=&amp;quot;2020-02&amp;quot;, freq=&amp;quot;W&amp;quot;)})&lt;br /&gt;
 df2 = pd.DataFrame({&amp;quot;date&amp;quot;: pd.date_range(start=&amp;quot;2024-01&amp;quot;, end=&amp;quot;2024-12&amp;quot;, freq=&amp;quot;D&amp;quot;)})&lt;br /&gt;
 df = pd.concat([df1, df2]).reset_index(drop=True)&lt;br /&gt;
&lt;br /&gt;
 # join series of 2 df&lt;br /&gt;
 df_covid_2020 = pd.DataFrame()&lt;br /&gt;
 df_covid_2020[&amp;#039;Deaths_Covid_2020&amp;#039;] = df1[&amp;#039;Deaths_Covid_2020&amp;#039;].append(&lt;br /&gt;
     df2[&amp;#039;Deaths_Covid_2020&amp;#039;], ignore_index=True)&lt;br /&gt;
 &lt;br /&gt;
 # note: this would require all index of df in df2 present:&lt;br /&gt;
 # df[machine] = df2[machine]&lt;br /&gt;
&lt;br /&gt;
==Rolling Average, mean of columns, min, max==&lt;br /&gt;
 df[&amp;#039;2016_roll&amp;#039;] = df[&amp;#039;2016&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2017_roll&amp;#039;] = df[&amp;#039;2017&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2018_roll&amp;#039;] = df[&amp;#039;2018&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2019_roll&amp;#039;] = df[&amp;#039;2019&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 df[&amp;#039;2020_roll&amp;#039;] = df[&amp;#039;2020&amp;#039;].rolling(window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 # mean value of 4 columns&lt;br /&gt;
 df[&amp;#039;2016_2019_mean&amp;#039;] = df.iloc[:, [1, 2, 3, 4]&lt;br /&gt;
                                ].mean(axis=1)  # not column 0 = day&lt;br /&gt;
 df[&amp;#039;2016_2019_mean_roll&amp;#039;] = df[&amp;#039;2016_2019_mean&amp;#039;].rolling(&lt;br /&gt;
     window=7, min_periods=1).mean().round(1)&lt;br /&gt;
 &lt;br /&gt;
 df[&amp;#039;2016_2019_roll_max&amp;#039;] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)&lt;br /&gt;
 df[&amp;#039;2016_2019_roll_min&amp;#039;] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)&lt;br /&gt;
 &lt;br /&gt;
&lt;br /&gt;
==Helpers==&lt;br /&gt;
 def pandas_set_date_index(df, date_column: str):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot; use date as index &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     df[date_column] = pd.to_datetime(df[date_column], format=&amp;#039;%Y-%m-%d&amp;#039;)&lt;br /&gt;
     df = df.set_index([date_column])&lt;br /&gt;
     return df&lt;br /&gt;
 &lt;br /&gt;
 def pandas_calc_roll_av(df, column: str, days: int = 7):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot; calc rolling average over column &amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     df[column + &amp;#039;_roll_av&amp;#039;] = df[column].rolling(&lt;br /&gt;
         window=days, min_periods=1).mean().round(1)&lt;br /&gt;
     return df&lt;br /&gt;
 &lt;br /&gt;
 # custom rounding&lt;br /&gt;
 def custom_round(x: float, base: int = 5) -&amp;gt; int:&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Custom rounding.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     # from https://stackoverflow.com/questions/40372030/pandas-round-to-the-nearest-n&lt;br /&gt;
     return int(base * round(float(x) / base))&lt;br /&gt;
 &lt;br /&gt;
 df[&amp;quot;power_rounded&amp;quot;] = df[&amp;quot;power&amp;quot;].apply( lambda x: custom_round(x, base=20) )&lt;br /&gt;
&lt;br /&gt;
==Transpose==&lt;br /&gt;
 df = df.transpose()&lt;br /&gt;
&lt;br /&gt;
==Leftovers==&lt;br /&gt;
 # when in a function one might get the SettingWithCopyWarning, fix via&lt;br /&gt;
 df = df.copy()&lt;br /&gt;
&lt;br /&gt;
 # copy&lt;br /&gt;
 df2[&amp;#039;Date&amp;#039;] = df0[&amp;#039;Date&amp;#039;]&lt;br /&gt;
 &lt;br /&gt;
 # drop 2 rows from the beginning&lt;br /&gt;
 df = df2.drop([0, 1])&lt;br /&gt;
&lt;br /&gt;
====append today using yesterday&amp;#039;s value====&lt;br /&gt;
 str_today = dt.datetime.today().strftime(&amp;quot;%Y-%m-%d&amp;quot;)&lt;br /&gt;
 ts_today = pd.Timestamp(str_today)&lt;br /&gt;
 if df.index[-1] != ts_today:&lt;br /&gt;
     df.loc[ts_today] = df.loc[df.index[-1], &amp;quot;Count&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
==Plotting via Matplotlib==&lt;br /&gt;
see [[Matplotlib]]&lt;/div&gt;</summary>
		<author><name>Torben</name></author>
	</entry>
</feed>