Pandas
Getting started
import pandas as pd
Docu
Nice cheat sheets can be found here:
- https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-for-data-science-in-python
- https://www.datacamp.com/cheat-sheet/pandas-cheat-sheet-data-wrangling-in-python
Create DF
import pandas as pd

# empty
df = pd.DataFrame()
random data
df = pd.DataFrame({"x": range(100)}) df["y"] = np.random.rand(len(df)) * 10 # y = sin(x) + noise noise = np.random.rand(len(df)) * 0.1 noise = np.random.default_rng().random(10) # newer syntax df["y"] = np.sin(df["x"] / 25 * np.pi) + noise # 20 rows and 3 columns of random values df = pd.DataFrame(np.random.rand(20, 3), columns=["a", "b", "c"])
from Lists
# from 1-dim list
df = pd.DataFrame(data={"Column Name": lst})

# per column
df = pd.DataFrame(data={"col1": lst1, "col2": lst2, "col3": lst3})

# from N-dim list, and set name of columns
df = pd.DataFrame(data=lst, columns=("col1", "col2", "col3"))

# from multiple lists
data = zip(
    l_days,
    l_2016,
    l_2017,
    l_2018,
    l_2019,
    l_2020,
    strict=True,
)
...
from csv
see [1]
df = pd.read_csv("data.tsv", sep="\t") # only selected columns and parse date df = pd.read_csv( "data.tsv", sep="\t", decimal=",", usecols=[ "Date", "Deaths_New", ], # only load these columns parse_dates=[ "Date", ], # convert to date object if format is yyyy-mm-dd pr dd.mm.yyyy index_col="Date", # choose this column as index na_values=["<4"], # values to treat as NA ) df = df.rename(columns={"Deaths_New": "Deaths_Covid"}) # convert data upon loading # read only first 10 chars from 2021-04-29T12:15:00+02:00 -> 2021-04-29 pd_date_converter = lambda x: (x[0:10]) # noqa: E731 df = pd.read_csv( "data.tsv", sep=",", converters={"Datum": pd_date_converter}, parse_dates=[ "Datum", # # convert to date object if format is yyyy-mm-dd ], ) # convert datetime to date after loading for c in ("Buchungstag", "Valutadatum"): df2[c] = pd.to_datetime(df2[c], format="%d.%m.%Y").dt.date
from list[str] via read_csv
from io import StringIO

csv_string_io = StringIO("\n".join(lines))
df = pd.read_csv(
    csv_string_io,
    sep=";",
)
from Excel
see [2]
import openpyxl  # pip install openpyxl

df = pd.read_excel("file.xlsx", sheet_name="Sheet1", engine="openpyxl")
df = pd.read_excel(
    "file.xlsx",
    skiprows=4,
    usecols=[
        "ArbPlatz",
        "BezArt",
    ],
)
from dict
see [3]
df = pd.DataFrame.from_dict(d, orient="index", columns=["Col1", "Col2"])
flatten dict in list of dict
# l1 is a list of dicts, each dict has a sub-dict called "metadata"
l2 = []
for item in l1:
    flat = item.copy()
    meta = flat.pop("metadata")
    flat.update(meta)
    l2.append(flat)
del l1
df = pd.DataFrame.from_dict(l2)
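Alternatively, pandas can flatten nested dicts directly; a minimal sketch, assuming the same l1 structure (sub-dict keys become prefixed column names):

df = pd.json_normalize(l1, sep="_")  # "metadata" sub-keys become e.g. "metadata_foo"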
from array/record/list
see [4]
data = [ {"id":1, "name":"asdf"}, {"id":2, "name":"qwertz"}, ] df = pd.DataFrame.from_records(data)
from JSON
see [5]
df = pd.read_json("file.json")
from SQLite
import sqlite3

con = sqlite3.connect("my.db")  # path to the SQLite file
sql = "SELECT * FROM tab WHERE col2 = ? ORDER BY id"
df = pd.read_sql_query(sql, con, params=(MY_FILTER_VALUE,))
Export Data
to csv
df.to_csv(
    "data.tsv",
    sep="\t",
    lineterminator="\n",
)

# selected columns only
df[["Col1", "Col2"]].to_csv(
    "data.tsv",
    sep="\t",
    lineterminator="\n",
)
to Excel
see [6]
df.to_excel("data.xlsx", index=False)
Export 2 df as sheets in 1 file
with pd.ExcelWriter(path=file_in.with_suffix(".xlsx")) as writer:
    df1.to_excel(writer, sheet_name="Sheet_name_1")
    df2.to_excel(writer, sheet_name="Sheet_name_2")
# no explicit writer.close() needed, the with-block handles that
to HTML
see [7]
df.to_html(
    "out.html",
    index=False,
    render_links=False,
    escape=False,
    justify="center",
)
# html-encode the column "name" only
df["name"] = df["name"].str.encode("ascii", "xmlcharrefreplace").str.decode("utf-8")

# add link to name
df["name"] = "<a href='" + df["url"] + "' target='_blank' >" + df["name"] + "</a>"

# export to html
df[["name", "due", "overdue", "priority", "overdue priority"]].to_html(
    "out.html",
    index=False,
    render_links=False,
    escape=False,
    justify="center",
)
to JSON
import json
from pathlib import Path

json_data = df.values.tolist()
with Path(pathStatsExport / "ts_types_month.json").open(
    "w", encoding="UTF-8"
) as fh:
    json.dump(
        json_data,
        fp=fh,
        ensure_ascii=False,
        sort_keys=False,
        indent=2,
    )
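pandas also has a built-in JSON exporter; a minimal sketch (the file name is just an example):

df.to_json("ts_types_month.json", orient="records", indent=2, force_ascii=False)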
Modification
Add Row
add row from list to end
idx = df.index[-1] + 1
list_of_values = (...)
df.loc[idx] = list_of_values
add dummy row for missing 2020-01-01 and reindex
df.loc[-1] = "2020-01-01", 0
df.index = df.index + 1  # shifting index
df = df.sort_index()  # sorting by index
Add column based on other columns
df["URL"] = df.apply( lambda row: f"https://www.openstreetmap.org/?mlat={row['Lat']}&mlon={row['Lng']}#map={zoom}/{row['Lat']}/{row['Lng']}", axis=1,
Apply map/dict id->name
df["x_gear_name"] = df["gear_id"].map(d_id_name)
Drop / Replace NA Values
df = df.dropna()  # removes rows with missing values

# only in certain columns
df = df.dropna(subset=["power", "heart_rate", "cadence"])
df = df[df["value"].notna()]
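To replace missing values instead of dropping the rows, a small sketch using fillna (column names are just examples):

df["power"] = df["power"].fillna(0)  # replace NA by 0
df = df.fillna({"power": 0, "cadence": 0})  # per-column defaults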
Interpolate missing values
# interpolate missing data, e.g. after df.groupby([pd.Grouper(key="time", freq="1min")])
df["kWh_total_in"] = df["kWh_total_in"].interpolate(method="linear")
Calc Diff / Delta of rows
df["delta_prev"] = df["kWh"].diff() df["delta_next"] = df["kWh"].shift(-1) - df["kWh"]
Overwrite Data
# clip: limit values to a valid range (e.g. latitude to [-90, 90])
df["Lat"] = df["Lat"].clip(lower=-90, upper=90)
df["Lng"] = df["Lng"].clip(lower=-180, upper=180)

# set value of columns based on condition
# set "type" = "Ride" where "type" == "VirtualRide"
df.loc[df["type"] == "VirtualRide", "type"] = "Ride"

# negative -> 0
df[df < 0] = 0

# overwrite column data of last 3 weeks by None
df["DateAsDate"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df["DateAsDate"].dt.date >= date_3w, "MyColumn"] = None

# rolling takes NaN values into account, so overwrite them as well
df3["Deaths_Covid_roll"] = np.where(
    df3["Deaths_Covid"].isnull(), np.nan, df3["Deaths_Covid_roll"]
)

df.loc[(df["Kat2"] == "") & (df["IBAN"] == "DE02100100100152517108"), "Kat2"] = "Bahn"
df.loc[(df["Kat1"] == "") & (df["Kat2"] == "Bahn"), "Kat1"] = "Mobilität"

# assigning multiple values
df.loc[
    (df["Kat1"] == "") & (df["IBAN"] == "DE02100100100152517108"),
    ["Kat1", "Kat2", "Kat3"],
] = ["Mobilität", "Reisen", "Bahn"]

# str endswith
df.loc[
    (df["RespTime"] == 0.0) & (df["Text"].str.endswith("(read timeout=30)")),
    "RespTime",
] = 30.0
Read-out Data
Columns
df2 = df[["col1", "col2"]]
Rows
for index, row in df.iterrows():
extract cell based on Index
col1_first = df2["col1"].iloc[0]
de_sum = df["col"].loc["Summe"]
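For single cells, .at (by label) and .iat (by position) are the fast scalar accessors; a small sketch (column name is an example):

value = df.at[df.index[0], "col1"]  # by label
value = df.iat[0, 0]  # by position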
Loop over DF
for row in df.itertuples():
    assert type(row.start_date) is dt.datetime
    assert type(row.utc_offset) is int
    start_date = row.start_date - dt.timedelta(seconds=row.utc_offset)
Aggregation
Sum Column or Row
# sum of 1 column
sum_cases = df["Cases"].sum()

# sum per row (summing across columns)
df_sums = df.sum(axis="columns")

# sum per column (summing across the index)
df_sums = df.sum(axis="index")
Filtering
Basics
# new notation
var = 40
df = df.query("power >= 80 & cadence >= @var")

# old notation
df = df[df["x_start_locality"].isna() & df["start_latlng"].notna()]
df = df[df["power"] >= 80]
df = df[(df["power"] >= 80) & (df["cadence"] >= 40)]

# ~ inverts
# list of multiple values
df = df[~df["col1"].isin(("A", "B", "C"))]
filter on index
df = df[df.index >= start_yearweek]
df = df.drop("Summe")
Filtering Dates
# filter a date column via year
df = df[df["Date"].dt.year == 2020]

# filter a date via str
df = df[df["Date"] == "2021-11-13"]

# filter on date via dt
df = df[
    df["RECEIVE_DATE"].dt.date  # convert pandas datetime64[ns] to date
    >= (dt.date.today() - dt.timedelta(days=30))
]
Drop Duplicate Rows
df = df.drop_duplicates()
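drop_duplicates can also be limited to certain columns; a sketch (column names are examples):

df = df.drop_duplicates(subset=["col1", "col2"], keep="last")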
Distinct values of column as list
gear_ids = df["gear_id"].unique()
Date Handling
Dates
Create Date Range
df = pd.DataFrame({"date": pd.date_range(start="2023-01", end="2024-12", freq="D")}) df = pd.DataFrame({"date": pd.date_range(start="2023-01", end="2024-12", freq="W")})
Convert Str Column to Date
# str to datetime
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

# datetime to date
df["Date"] = df["Date"].dt.date
Extract: year, month, quarter, week, weekstart
df["year"] = df["date"].dt.year df["month"] = df["date"].dt.month df["quarter"] = df["date"].dt.quarter df["week"] = df["date"].dt.isocalendar().week # fix week 53 # df["x_week"] = df["x_week"].clip(upper=52) df.loc[(df["week"] == 53) & (df["month"] == 1), "x_week"] = 1 df.loc[(df["week"] == 53) & (df["month"] == 12), "x_week"] = 52 # week_start from date df["week_start"] = df["date"].dt.to_period("W").apply(lambda r: r.start_time) # week_start from week number df["week_start"] = pd.to_datetime( df.apply(lambda row: dt.date.fromisocalendar(row["year"], row["week"], 1), axis=1) )
DateTime
Convert String to Datetime and Timezone Handling
# string to datetime, converting to UTC for future compatibility
df["date"] = pd.to_datetime(df["date"], format="ISO8601", utc=True)

# convert from UTC to local timezone
df["date"] = df["date"].dt.tz_convert(tz="Europe/Berlin")

# drop timezone info, since Excel cannot handle it
df["date"] = df["date"].dt.tz_localize(None)
DateTime: remove milliseconds
df["DateTime"] = df["DateTime"].dt.ceil(freq="s")
TimeStamps
Convert Timestamp to Datetime and use as Index
df["datetime"] = pd.to_datetime( # convert to datetime df["start_ts"], unit="s", # timestamp is in seconds utc=True, # timestamp is in UTC ).dt.tz_convert( # convert to local TZ "Europe/Berlin" ) df = df.set_index("datetime")
Convert Datetime to Timestamp
df["timestamp"] = df["datetime"].values.astype(np.int64) // 10**9 # or df["timestamp"] = df["datetime"].astype(int) // 10**9 # // is integer division
Timestamp: subtract first value
# calc elapsed time
df["seconds"] = df["timestamp"] - df["timestamp"].iloc[0]
String Modifications
Replace
df["name"] = df["name"].str.strip() # trim whitespaces df["time"] = df["time"].str.replace(r"\..*$", "Z", regex=True)
# remove word "Probable"
df["col"] = df["col"].replace(to_replace=r"^Probable ", value="", regex=True)
df["Text"] = df["Text"].str.replace( r"^Menke, *Torben *", "Torben Menke", regex=True, )
split string
df [ [ "week", "year" ] ] = df["Kalenderwoche"].str.split("/", expand=True)
string <-> number
string to float: 1.234,56 -> 1234.56
df["Euro"] = ( df["Euro"] .str.replace(".", "") .str.replace(",", ".") .astype(float) )
convert int to str adding leading zeros
df["Sterbewoche"].astype(str).str.zfill(2)
capitalization
df["Wer"] = df["Wer"].apply(lambda x: string.capwords(x))
trim spaces
df["Wer"] = df["Wer"].str.strip()
count word per row and check if more than one
str.count()
df["cnt_Buchungstext"] = df["Text"].str.count("word") df_search = df[df["cnt_Buchungstext"] != 1] if len(df_search) > 0: print(df_search) del df_search df = df.drop(columns=["cnt_Buchungstext"])
Sorting / order by
# sort by column
df = df.sort_values(by=["betten_belegt"], ascending=False)
df = df.sort_values(by=["count", "query"], ascending=[False, True])
df = df.sort_values(by=["Buchungstag", "Text"], ignore_index=True)
Top10
df_top_ten = df.sort_values(by="count", ascending=False).head(10)
Group By / Top10
df2 = (
    df[["name", "amount"]]
    .groupby("name")
    .agg(amountSum=("amount", "sum"), count=("name", "count"))
    .sort_values(by=["amountSum", "count"], ascending=False)
)
print(df2.head(10))  # top10

# resort by count
print(df2.sort_values(by="count", ascending=False).head(10))
Group by year, quarter, month, week using count and add missing
Simple: add missing years for multi index
df1 = pd.DataFrame(
    columns=["year", "type", "count"],
    data=[
        (2000, "Run", 1),
        (2003, "Ride", 2),
        (2003, "Swim", 1),
    ],
)
df2 = pd.DataFrame(
    data={
        "year": range(df1["year"].min(), df1["year"].max() + 1),
        "type": "Run",
        "count": 0,
    }
).set_index(["year", "type"])
df1 = df1.set_index(["year", "type"])
df = df1.add(df2, fill_value=0).reset_index()
Advanced
# df has columns: date and derived year, quarter, month, week, see above
# group by year, quarter, month, week, with count as aggregation and gap-filling
year_min, year_max = df["year"].min(), df["year"].max()

# year
df_year = (
    df[["year", "date"]].groupby(["year"]).count().rename(columns={"date": "count"})
)
df_year = df_year.reindex(range(year_min, year_max + 1), fill_value=None)
df_year = df_year.reset_index().rename(columns={"year": "date-grouping"})

# quarter
df_quarter = (
    df[["year", "quarter", "date"]]
    .groupby(["year", "quarter"])
    .count()
    .rename(columns={"date": "count"})
)
df_quarter = df_quarter.reindex(
    pd.MultiIndex.from_product(
        [range(year_min, year_max + 1), range(1, 5)], names=["year", "quarter"]
    ),
    fill_value=None,
)
df_quarter = df_quarter.reset_index()
df_quarter["date-grouping"] = (
    df_quarter["year"].astype(str) + "-Q" + df_quarter["quarter"].astype(str)
)
df_quarter = df_quarter.drop(columns=["year", "quarter"])

# month
df_month = (
    df[["year", "month", "date"]]
    .groupby(["year", "month"])
    .count()
    .rename(columns={"date": "count"})
)
df_month = df_month.reindex(
    pd.MultiIndex.from_product(
        [range(year_min, year_max + 1), range(1, 13)], names=["year", "month"]
    ),
    fill_value=None,
)
df_month = df_month.reset_index()
df_month["date-grouping"] = (
    df_month["year"].astype(str) + "-" + df_month["month"].astype(str).str.zfill(2)
)
df_month = df_month.drop(columns=["year", "month"])

# week
df_week = (
    df[["year", "week", "date"]]
    .groupby(["year", "week"])
    .count()
    .rename(columns={"date": "count"})
)
df_week = df_week.reindex(
    pd.MultiIndex.from_product(
        [range(year_min, year_max + 1), range(1, 53)], names=["year", "week"]
    ),
    fill_value=None,
)
df_week = df_week.reset_index()
df_week["date-grouping"] = (
    df_week["year"].astype(str) + "-" + df_week["week"].astype(str).str.zfill(2)
)
df_week = df_week.drop(columns=["year", "week"])
Group by Type and Date
import matplotlib.pyplot as plt

# group by hour
# truncate min and sec data
df["hour"] = pd.to_datetime(df["time"]).dt.floor("H")

# sum per hour
df2 = df[["hour", "amount"]].groupby("hour").agg(amountSum=("amount", "sum"))
df2.plot()
plt.show()
# Datetime Index Grouping
df = df.groupby(pd.Grouper(freq="5min", offset="00h00min")).max()

# Group by type and date of month start
df_month = df.groupby(["type", pd.Grouper(key="date", freq="MS")]).agg(
    {"id": "count", "minutes": "sum"}
)
df_month = df_month.rename(columns={"id": "count"})

# generate index of the desired month-freq:
idx = pd.date_range(
    start=df["date"].min().replace(day=1),
    end=df["date"].max().replace(day=1),
    freq="MS",  # MS = Month Start
)

# add missing months
df_month = df_month.reindex(
    pd.MultiIndex.from_product(
        [df_month.index.get_level_values("type").unique(), idx],
        names=["type", "date"],
    )
)

# fill missing by 0 and convert count back to type int
df_month = df_month.fillna(0).astype({"count": int})
Group Datetime by Time of Day only
df["Time"] = df["Date"].dt.round("5min").dt.time df_grouped = df [ ["RespTime", "Time" ] ].groupby("Time").mean()
Group and calculate min,avg,max
df_grouped = (
    df[["Time", "RespTime"]]
    .groupby("Time")
    .agg(
        max=pd.NamedAgg(column="RespTime", aggfunc="max"),
        avg=pd.NamedAgg(column="RespTime", aggfunc="mean"),
        min=pd.NamedAgg(column="RespTime", aggfunc="min"),
    )
)
Column Handling
Renaming
df = df.rename(
    columns={"invasiv_beatmet": "beatmet"},
    errors="raise",
)
rename column headers by extracting some int values from a string
l2 = []
for col in df.columns:
    year = int(col[0:4])
    week = int(col[5:7])
    l2.append(year * 100 + week)
df.columns = l2
Dropping
drop columns
df = df.drop(columns=["Sterbejahr", "Sterbewoche"])
Change Column Order
df = df.reindex(
    [
        "Wann",
        "Art",
        "Wer",
    ],
    axis="columns",
)
# move certain columns to the front, leaving the remaining unchanged
first_columns = ("description", "isin", "side", "quantity", "Price", "amount")
for i, col in enumerate(first_columns):
    df.insert(i, col, df.pop(col))
Index Handling
select column as index
df = df.set_index("Date")
move date from index back to column
df = df.reset_index()
rename index
df.index.name = "Date"
reset index to start at 0
df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)
text indexes
df = df.set_index("Altersgruppe")
datetime indexes
# select "Date" column as index df = df.set_index(["Date"]) # convert index to datetime df.index = pd.to_datetime(df.index) # add missing dates df = df.reindex( pd.date_range(df.index.min(), df.index.max(), freq="D"), fill_value=0 ) df.index.name = "date" # remove timezone offset (for Excel can not handle this) df.index = df.index.tz_localize(None) date_last = df.index[-1])
# reindex and fill missing with 0
date_last = pd.to_datetime(df.index[-1]).date()
idx = pd.date_range("2020-01-01", date_last)
df = df.reindex(idx, fill_value=0)

# add missing dates
df = df.asfreq("D", fill_value=0)
df = df.asfreq(freq="5M")
df = df.sort_index()  # needed?

# drop values of column for last 3 weeks
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df.index.date >= date_3w, "Cases"] = None
# or
df.loc[df.index.date < pd.to_datetime("2020-03-01").date(), "Value"] = None
filter on date index
# drop data prior to 2020
df = df.loc["2020-01-01":]
# alternative:
df = df[df.index >= "2021-01-10"]
Pivot, UnPivot/Melt
Pivot
# df has columns "key", "id", "value"
# convert to wide format with id as columns
df2 = df.pivot_table(index="key", columns="id", values="value", aggfunc="first")
melt/unpivot wide table to long table format
df2 = pd.melt(df, id_vars="Date", value_vars=df.columns[1:])
# df2 = df2.rename(columns={"variable": "Machine", "value": "Status"})

# not working, using melt instead
# df3 = pd.wide_to_long(df, stubnames="Date", i=df.columns[1:], j="Status")
Merge/Concat and Append DFs
Merge/Concat
df_sum = pd.DataFrame()
df_sum = pd.concat(
    [df_sum, df],
    ignore_index=True,
)
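For SQL-style joins (as opposed to stacking), pd.merge can be used; a minimal sketch (frame and column names are examples):

# left join df_names onto df via the shared "id" column
df = pd.merge(df, df_names, on="id", how="left")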
Append
# append df2 to end of df1
df1 = pd.DataFrame({"date": pd.date_range(start="2020-01", end="2020-02", freq="W")})
df2 = pd.DataFrame({"date": pd.date_range(start="2024-01", end="2024-12", freq="D")})
df = pd.concat([df1, df2]).reset_index(drop=True)
# join series of 2 df
# (Series.append was removed in pandas 2.0, use pd.concat instead)
df_covid_2020 = pd.DataFrame()
df_covid_2020["Deaths_Covid_2020"] = pd.concat(
    [df1["Deaths_Covid_2020"], df2["Deaths_Covid_2020"]], ignore_index=True
)
df = pd.concat([df, df2], axis=1)
# note: this would require all index values of df to be present in df2:
# df[machine] = df2[machine]
Rolling Average, mean of columns, min, max
df["2016_roll"] = df["2016"].rolling(window=7, min_periods=1).mean().round(1)
df["2017_roll"] = df["2017"].rolling(window=7, min_periods=1).mean().round(1)
df["2018_roll"] = df["2018"].rolling(window=7, min_periods=1).mean().round(1)
df["2019_roll"] = df["2019"].rolling(window=7, min_periods=1).mean().round(1)
df["2020_roll"] = df["2020"].rolling(window=7, min_periods=1).mean().round(1)

# mean value of 4 columns (not column 0 = day)
df["2016_2019_mean"] = df.iloc[:, [1, 2, 3, 4]].mean(axis=1)
df["2016_2019_mean_roll"] = (
    df["2016_2019_mean"].rolling(window=7, min_periods=1).mean().round(1)
)
df["2016_2019_roll_max"] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)
df["2016_2019_roll_min"] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)
Helpers
def pandas_set_date_index(df, date_column: str):
    """Use date as index."""
    df[date_column] = pd.to_datetime(df[date_column], format="%Y-%m-%d")
    df = df.set_index([date_column])
    return df


def pandas_calc_roll_av(df, column: str, days: int = 7):
    """Calc rolling average over column."""
    df[column + "_roll_av"] = (
        df[column].rolling(window=days, min_periods=1).mean().round(1)
    )
    return df


# custom rounding
def custom_round(x: float, base: int = 5) -> int:
    """Custom rounding."""
    # from https://stackoverflow.com/questions/40372030/pandas-round-to-the-nearest-n
    return int(base * round(float(x) / base))


df["power_rounded"] = df["power"].apply(lambda x: custom_round(x, base=20))
Transpose
df = df.transpose()
Leftovers
# when in a function one might get the SettingWithCopyWarning, fix via
df = df.copy()
# copy
df2["Date"] = df0["Date"]

# drop 2 rows from the beginning
df = df2.drop([0, 1])
append today using yesterday's value
str_today = dt.datetime.today().strftime("%Y-%m-%d")
ts_today = pd.Timestamp(str_today)
if df.index[-1] != ts_today:
    df.loc[ts_today] = df.loc[df.index[-1], "Count"]
Plotting via Matplotlib
see Matplotlib
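A minimal sketch of plotting directly from a DataFrame (column names are examples, requires matplotlib):

import matplotlib.pyplot as plt

df.plot(x="date", y="count")
plt.show()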