Pandas
Pandas
import pandas as pd
create
# empty DataFrame
df = pd.DataFrame()

# from a 1-dimensional list
df = pd.DataFrame(data={"Deaths_Covid_2020": l})

# from an N-dimensional list, setting the column names
df = pd.DataFrame(
    data=l_results,
    columns=("num_songs_played", "pct_all_played", "pct_80pct_played"),
)

# from multiple lists, zipped row-wise
data = zip(l_dates, l_deaths2016, l_deaths2017, l_deaths2018, l_deaths2019, l_deaths2020)
df = pd.DataFrame(data, columns=["Day", "2016", "2017", "2018", "2019", "2020"])
from csv
see [1]
df = pd.read_csv("data/de-states/de-state-DE-total.tsv", sep="\t")

# only selected columns, and parse the date
df = pd.read_csv(
    "data/de-states/de-state-DE-total.tsv",
    sep="\t",
    decimal=",",
    usecols=["Date", "Deaths_New"],  # only load these columns
    parse_dates=["Date"],  # convert to date object if format is yyyy-mm-dd or dd.mm.yyyy
    index_col="Date",  # choose this column as index
    # na_values=["<4"],  # values to treat as NA
)
df.rename(columns={"Deaths_New": "Deaths_Covid"}, inplace=True)

# convert data upon loading:
# read only the first 10 chars from 2021-04-29T12:15:00+02:00 -> 2021-04-29
pd_date_converter = lambda x: x[0:10]
df = pd.read_csv(
    filename,
    sep=",",
    converters={"Datum": pd_date_converter},
    parse_dates=["Datum"],  # convert to date object if format is yyyy-mm-dd
)

# convert datetime to date after loading
for c in ("Buchungstag", "Valutadatum"):
    df2[c] = pd.to_datetime(df2[c], format="%d.%m.%Y").dt.date
from list[str] via read_csv
from io import StringIO

# feed a list of CSV lines to read_csv via an in-memory text buffer
csv_buffer = StringIO("\n".join(lines))
df = pd.read_csv(csv_buffer, sep=";")
from Excel
see [2]
import openpyxl  # pip install openpyxl

# read one sheet, using the openpyxl engine
df = pd.read_excel(excelFile, sheet_name="COVID_Todesfälle_KW_AG10", engine="openpyxl")

# skip header rows and load only selected columns
df = pd.read_excel(
    "in/PP_APLATZ.xlsx",
    skiprows=4,
    usecols=["ArbPlatz", "BezArt"],
)
from dict
see [3]
# column -> column-sum mapping, turned into a one-column DataFrame
col_sums = {col: df_rki[col].sum() for col in df_rki.columns}
df = pd.DataFrame.from_dict(col_sums, orient="index", columns=["Covid_Tote"])

# deep copy
df2 = df.copy()
from array/record/list
see [4]
read JSON
see [5]
# load a whole JSON file into a DataFrame
df = pd.read_json("file.json")
export
to csv
# write as tab-separated file with Unix line endings
df.to_csv(
    "data/de-mortality-timeseries.tsv",
    sep="\t",
    lineterminator="\n",
)

# selected columns only
df[["Deaths", "Deaths_roll_av"]].to_csv(
    "data/de-mortality-timeseries.tsv",
    sep="\t",
    lineterminator="\n",
)
to Excel
see [6]
# write to an Excel file, omitting the index column
df.to_excel("data.xlsx", index=False)
Export 2 DataFrames as separate sheets in 1 file
# export two DataFrames as separate sheets of one file;
# the context manager saves and closes the file on exit,
# so the explicit writer.close() of the original is redundant
# (and double-closes the handle)
with pd.ExcelWriter("output.xlsx") as writer:
    df1.to_excel(writer, sheet_name="Sheet_name_1")
    df2.to_excel(writer, sheet_name="Sheet_name_2")
to HTML
see [7]
to JSON
# dump the raw cell values (list of rows) as a JSON file
json_data = df.values.tolist()
with Path(pathStatsExport / "ts_types_month.json").open("w", encoding="UTF-8") as fh:
    json.dump(json_data, fp=fh, ensure_ascii=False, sort_keys=False, indent=2)
Add row
# add a dummy row for the missing 1.1.2020
df.loc[-1] = "2020-01-01", 0
df.index = df.index + 1  # shift the index up by one
df = df.sort_index()  # sort by index -> the new row moves to the front
Loop over rows
# iterate row-wise; index is the row label, row a Series (slow on large frames)
for index, row in df.iterrows():
selecting
# select a subset of columns by name
df = df[list_of_column_names]
df = df[["date", "gemeindeschluessel", "betten_frei", "betten_belegt"]]
substring
# substring: keep only the first 4 characters of each value
df["Was"] = df["Was"].str.slice(0, 4)
sum
# sum of one column
sum_cases = df["Cases"].sum()
# sum over the index (axis=0) -> one sum per column
# (the original comments had the two axes described the wrong way around)
df_sums = df.sum(axis="index")
# sum over the columns (axis=1) -> one sum per row
df_sums = df.sum(axis="columns")
filter
# filter on a column value via str
df = df[df["Date"] == "2021-11-13"]
df = df[~(df["Date"] == "2021-11-13")]  # ~ inverts the condition

# filter on a date column via its year
df = df[df["Date"].dt.year == 2020]

# multiple values
df = df[~df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]

# filter on date
df = df[
    df["RECEIVE_DATE"].dt.date  # convert pandas datetime64[ns] to date
    >= (dt.date.today() - dt.timedelta(days=30))
]

# filter on 2 columns
df = df[(df["type"] == "SleepAnalysis") & (df["sourceName"] == "T-iP8")]
df = df[(df["status"] != "CANCELLED") & (df["status"] != "EXPIRED")]

# filter column values against a list
df = df[~((df["Sg"] == 19) & (df["Usg"].isin([0, 1, 2, 3])))]
df = df[df["gemeindeschluessel"].isin(l_lkids)]
df = df[df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]
# is not in
df = df[~df["Date"].isin(("2020-02-29", "2024-02-29", "2028-02-29"))]

# remove data from the latest week, as it might not be complete
df = df[df["YearWeek"] < df["YearWeek"].max()]

# drop rows where value is na/NaN
df = df[df["value"].notna()]
Duplicates
# drop exact duplicate rows (keeps the first occurrence)
df = df.drop_duplicates()
convert to date
# parse date columns
date_cols = ["start_date", "start_date_local", "x_date"]
for col in date_cols:
    df_all[col] = pd.to_datetime(df_all[col])  # type: ignore

# when in a function one might get the SettingWithCopyWarning, fix via
df = df.copy()

# ensure to have datetime64[ns] type
df["Date"] = pd.to_datetime(df["Date"])
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

# remove milliseconds (rounds up to the next full second)
df["Date"] = df["Date"].dt.ceil(freq="s")


# split "week/year" to date: "51/2021" -> date of that week's Sunday
def convert2date(s: str) -> dt.date:
    # FIX: the string is "<week>/<year>", so unpack the week first;
    # the original bound them the other way around and silently compensated
    # in the fromisocalendar(year, week, day) call
    week, year = s.split("/")
    return dt.date.fromisocalendar(int(year), int(week), 7)


df["dt"] = df["Kalenderwoche"].apply(convert2date)

df["Year"] = df["Date"].dt.year
Remove timezone offset info
# V1: proper approach using tz_convert(None)
df[col] = pd.to_datetime(df[col], format="%Y-%m-%dT%H:%M:%S%z")
df[col] = df[col].dt.tz_convert(None)
# if it throws:
# AttributeError: Can only use .dt accessor with datetimelike values. Did you mean: 'at'?
# see V2

# V2: simply strip the timezone offset as text
# remove the timezone information by replacing "+01:00", "+02:00", "-02:00",
# etc. with an empty string
df[col] = df[col].str.replace(r"[+\-]\d{2}:\d{2}.*$", "", regex=True)
# parse the datetime column without timezone information
# note: now without the timezone %z info (was format="%Y-%m-%dT%H:%M:%S%z")
df[col] = pd.to_datetime(df[col], format="%Y-%m-%dT%H:%M:%S")
convert Pandas Timestamp to Python datetime
def convert_pd_timestamp_to_dt(
    pd_timestamp: pd.Timestamp,
) -> dt.datetime:
    """Convert a pandas Timestamp into a plain Python datetime."""
    return pd_timestamp.to_pydatetime()


df["Date"] = df["Timestamp"].apply(convert_pd_timestamp_to_dt)
convert date to timestamp
# datetime64[ns] -> Unix epoch seconds (view as int64 nanoseconds, integer-divide)
df["timestamp"] = df["date"].values.astype(np.int64) // 10**9
TimeZone conversion
# convert UTC to local time
s = df["Date"].dt.tz_localize("utc")  # mark the naive timestamps as UTC
s = s.dt.tz_convert(tz="Europe/Berlin")  # shift into local time
df["Date"] = s.dt.tz_localize(None)  # drop tz info, since Excel cannot handle it
or
# convert UTC to local time using zoneinfo
from zoneinfo import ZoneInfo

TZ_ES = ZoneInfo("Europe/Madrid")
df["Date"] = df["Date"].tz_convert(tz=TZ_ES)
overwrite data
# cap large values at 100
# FIX: the original used .where(col > 100, 100, inplace=True), which KEEPS
# values > 100 and overwrites the small ones - the opposite of the intent
df["time to fall asleep"] = df["time to fall asleep"].clip(upper=100)

# overwrite column data of the last 3 weeks by None
df["DateAsDate"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df["DateAsDate"].dt.date >= date_3w, "MyColumn"] = None

# overwrite column value based on a date index
df.loc[df.index < pd.Timestamp("2020-03-01"), "InzidenzChange"] = None

# rolling takes NaN values into account, so overwrite them as well
# FIX: the closing parenthesis of np.where() was missing
df3["Deaths_Covid_roll"] = np.where(
    df3["Deaths_Covid"].isnull(), np.nan, df3["Deaths_Covid_roll"]
)

# replace na by 0
df = df.fillna(0)

# remove word "Probable" from scorpio_call for better clustering
df["scorpio_call"] = df["scorpio_call"].replace(
    to_replace=r"^Probable ", value="", regex=True
)

# Add/Overwrite a cell, indexed by datetime:
# append today using yesterday's value
str_today = dt.datetime.today().strftime("%Y-%m-%d")
ts_today = pd.Timestamp(str_today)
if df.index[-1] != ts_today:
    df.loc[ts_today] = df.loc[df.index[-1], "Count"]
change value based on condition
# set a category based on conditions on other columns
df.loc[(df["Kat2"] == "") & (df["IBAN"] == "DE02100100100152517108"), "Kat2"] = "Bahn"
df.loc[(df["Kat1"] == "") & (df["Kat2"] == "Bahn"), "Kat1"] = "Mobilität"

# assigning multiple values at once
# FIX: the original list literal had a misplaced quote: ["Mobilität], "Reisen", ...
df.loc[
    (df["Kat1"] == "") & (df["IBAN"] == "DE02100100100152517108"),
    ["Kat1", "Kat2", "Kat3"],
] = ["Mobilität", "Reisen", "Bahn"]

# condition via str endswith
df.loc[
    (df["RespTime"] == 0.0) & (df["Text"].str.endswith("(read timeout=30)")),
    "RespTime",
] = 30.0
split str date
# "51/2021" -> week="51", year="2021"
df[["week", "year"]] = df["Kalenderwoche"].str.split("/", expand=True)
Sorting / order by
# sort by one column, descending
df = df.sort_values(by=["betten_belegt"], ascending=False)
# sort by several columns and renumber the index afterwards
df = df.sort_values(by=["Buchungstag", "Text"], ignore_index=True)
Group By
# group and count
df1 = df.groupby(["type"])["type"].count()
df1 = df.groupby(["type"])["type"].count().sort_values(ascending=False)
df_lineages = (
    df_all_data.groupby(["lineage", "RECEIVE_DATE"]).size().reset_index(name="count")
)

# group and sum all other columns
df = df.groupby(["YearWeek"]).sum()
df = df.groupby(["Impfdatum"])["Anzahl"].sum().reset_index()
df = df.groupby(["date", "bundesland"]).agg(
    {"faelle_covid_aktuell_invasiv_beatmet": "sum", "betten_ges": "sum"}
)

# top 10 groups by summed count
df_top_ten = (
    df.groupby("myColumn")
    .sum()  # sums over the remaining column "count"
    .sort_values(by="count", ascending=False)
    .head(10)
)
Datetime Index Grouping
# bucket a DatetimeIndex into 5-minute intervals and take the max per bucket
df = df.groupby(pd.Grouper(freq="5min", offset="00h00min")).max()
Group by Type and Date and define different aggregations per column and add missing values
# group by type and month; count ids, sum minutes
df_month = df.groupby(["type", pd.Grouper(key="date", freq="MS")]).agg(
    {"id": "count", "minutes": "sum"}
)

# generate an index of the desired month-freq
idx = pd.date_range(
    start=df["date"].min().replace(day=1),
    end=df["date"].max().replace(day=1),
    freq="MS",  # MS = Month Start
)

# add missing months
# FIX: get_level_values() repeats each type once per month; from_product
# needs the distinct types, hence .unique()
df_month = df_month.reindex(
    pd.MultiIndex.from_product(
        [df_month.index.get_level_values("type").unique(), idx],
        names=["type", "date"],
    )
)

# fill missing by 0 and convert the count column back to int
# NOTE(review): agg() keeps the source column name, so the count column is
# named "id" here - the original astype({"count": int}) would raise a KeyError;
# confirm against the surrounding pipeline whether a rename happens in between
df_month = df_month.fillna(0).astype({"id": int})
Group Datetime by Time of Day only
# round timestamps to 5-minute buckets and keep only the time of day,
# then average RespTime per time-of-day bucket
df["Time"] = df["Date"].dt.round("5min").dt.time
df_grouped = df[["RespTime", "Time"]].groupby("Time").mean()
Group and calculate min,avg,max
# per group: max, mean and min of RespTime as named result columns
df_grouped = (
    df[["Time", "RespTime"]]
    .groupby("Time")
    .agg(
        max=pd.NamedAgg(column="RespTime", aggfunc="max"),
        avg=pd.NamedAgg(column="RespTime", aggfunc="mean"),
        min=pd.NamedAgg(column="RespTime", aggfunc="min"),
    )
)
Column Handling
# rename a column; errors="raise" fails loudly if the old name is missing
df = df.rename(columns={"invasiv_beatmet": "beatmet"}, errors="raise")
drop columns
# drop columns that are no longer needed
df = df.drop(columns=["Sterbejahr", "Sterbewoche"])
string to float: 1.234,56 -> 1234.56
# German number format: drop the thousands dots, turn the decimal comma into a dot
# FIX: pass regex=False - under the legacy default (regex=True) the pattern "."
# matches ANY character and wipes the whole string
df["Euro"] = (
    df["Euro"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
    .astype(float)
)
convert int to str adding leading zeros
# int -> str with leading zeros: 2 -> "02"
df["Sterbewoche"].astype(str).str.zfill(2)
sum up over all columns
# total over every column of the DataFrame
sum_cases = sum(df[col].sum() for col in df.columns)
Column Header Handling
# rename column headers by extracting year and week ints from "YYYY-WW..." strings,
# encoded as year*100 + week
df.columns = [int(col[0:4]) * 100 + int(col[5:7]) for col in df.columns]
Change Column Order
# change the column order
df = df.reindex(["Wann", "Art", "Wer"], axis="columns")
# move certain columns to the front, leaving the remaining ones unchanged
first_columns = ("description", "isin", "side", "quantity", "Price", "amount")
for i, col in enumerate(first_columns):
    df.insert(i, col, df.pop(col))
Row Handling
add row from list and index
# append a row at the next integer index after the current last one
idx = df.index[-1] + 1
list_of_values = (...)
df.loc[idx] = list_of_values
Index Handling
# set the index name
df.index.name = "YearWeek"
int indexes
# select a column as index (the label was missing its comment marker)
df.set_index("Altersgruppe", inplace=True)
# reset the index to start at 0
df2 = df1[1 * 365 : 2 * 365].reset_index(drop=True)
# filter on the index
df = df[df.index >= start_yearweek]
datetime indexes
# select the "Date" column as index
df.set_index(["Date"], inplace=True)
# convert the index to datetime
df.index = pd.to_datetime(df.index)

# convert utc to local time
df = df.tz_convert("Europe/Berlin")
# remove the timezone offset (Excel cannot handle it)
df.index = df.index.tz_localize(None)

# move the date from the index back to a column
df.reset_index(inplace=True)
assert df.columns[0] == "Date"

# filter on date: drop data prior to 2020
df = df.loc["2020-01-01":]
# alternative:
df = df[df.index >= "2021-01-10"]

date_last = pd.to_datetime(df.index[-1]).date()

# reindex and fill missing with 0
# FIX: removed a stray extra closing parenthesis after pd.date_range(...)
date_last = pd.to_datetime(df.index[-1]).date()
idx = pd.date_range("2020-01-01", date_last)
df = df.reindex(idx, fill_value=0)

# add missing dates
df = df.asfreq("D", fill_value=0)
df = df.asfreq(freq="5M")

df.sort_index(inplace=True)  # needed?

# drop the values of a column for the last 3 weeks
date_3w = dt.date.today() - dt.timedelta(weeks=3)
df.loc[df.index.date >= date_3w, "Cases"] = None
# or
# FIX: .date() added - comparing date objects to a Timestamp raises TypeError
df.loc[df.index.date < pd.to_datetime("2020-03-01").date(), "Value"] = None
text indexes
# use the text column "Altersgruppe" as index
df.set_index("Altersgruppe", inplace=True)
extract cell
# first value of a column (positional)
betten_ges = df2["betten_ges"].iloc[0]
# extract a cell by index label
de_sum = df["Personen"].loc["Summe"]
# drop a row by its index label
df.drop("Summe", inplace=True)
melt/unpivot wide table to long table format
# melt: wide table -> long table
df2 = pd.melt(df, id_vars="Date", value_vars=df.columns[1:])
# df2.rename(columns={"variable": "Machine", "value": "Status"}, inplace=True)

# not working, using melt instead:
# df3 = pd.wide_to_long(df, stubnames="Date", i=df.columns[1:], j="Status")
merge/concat multiple dataframes
# accumulate DataFrames via concat
df_sum = pd.DataFrame()
df_sum = pd.concat(
    [df_sum, df],
    ignore_index=True,
)

# join series of 2 df
# FIX: Series.append() was removed in pandas 2.0 -> use pd.concat instead
df_covid_2020 = pd.DataFrame()
df_covid_2020["Deaths_Covid_2020"] = pd.concat(
    [df1["Deaths_Covid_2020"], df2["Deaths_Covid_2020"]], ignore_index=True
)
add columns (concat along axis=1)
# concat along axis=1 appends the columns of df2 next to df
df = pd.concat([df, df2], axis=1)
# note: this would require all index values of df to be present in df2:
# df[machine] = df2[machine]
add single row of datetimeindex
# df.columns are "Date", "Count"
ts_today = pd.Timestamp(dt.date.today())

# when the index is numeric
if df["Date"].iloc[-1] != ts_today:
    df.loc[len(df.index)] = [ts_today, 0]

# when using a datetime index
if df.index[-1] != ts_today:
    df.loc[ts_today] = 0
more
# 7-day rolling average per year column
for year in ("2016", "2017", "2018", "2019", "2020"):
    df[f"{year}_roll"] = df[year].rolling(window=7, min_periods=1).mean().round(1)

# mean value of 4 columns (not column 0 = day)
df["2016_2019_mean"] = df.iloc[:, [1, 2, 3, 4]].mean(axis=1)
df["2016_2019_mean_roll"] = (
    df["2016_2019_mean"].rolling(window=7, min_periods=1).mean().round(1)
)
df["2016_2019_roll_max"] = df.iloc[:, [6, 7, 8, 9]].max(axis=1)
df["2016_2019_roll_min"] = df.iloc[:, [6, 7, 8, 9]].min(axis=1)

# prepend empty values to df:
# Jan and Feb values are missing for the Covid deaths series,
# so a couple of empty rows are needed
empty_rows = [None] * 59
df1 = pd.DataFrame(data={"Deaths_Covid_2020": empty_rows})

# ensure the first row is from 28.2.
assert df2.iloc[0]["Date"] == "2020-02-28", "Error of start date, expecting 2020-02-28"

# copy
df2["Date"] = df0["Date"]

# drop 2 rows from the beginning
df2.drop([0, 1], inplace=True)

# negative -> 0
df[df < 0] = 0
Helpers
def pandas_set_date_index(df, date_column: str):
    """Parse *date_column* as %Y-%m-%d dates and make it the DataFrame index."""
    df[date_column] = pd.to_datetime(df[date_column], format="%Y-%m-%d")
    df.set_index([date_column], inplace=True)
    return df


def pandas_calc_roll_av(df, column: str, days: int = 7):
    """Add <column>_roll_av: rolling mean over *days*, rounded to 1 decimal."""
    rolling_mean = df[column].rolling(window=days, min_periods=1).mean()
    df[column + "_roll_av"] = rolling_mean.round(1)
    return df
Transpose
# transpose to have yearweek as index
df = df.transpose()
String Modifications
str.replace()
# normalize the author name, tolerating variable spacing
df["Text"] = df["Text"].str.replace(
    r"^Menke, *Torben *",
    "Torben Menke",
    regex=True,
)
capitalization
# capitalize each word ("john doe" -> "John Doe")
df["Wer"] = df["Wer"].apply(lambda x: string.capwords(x))
trim spaces
# strip leading/trailing whitespace
df["Wer"] = df["Wer"].str.strip()
count occurrences of a word per row via str.count() and check it appears exactly once
# count occurrences of "word" per row, print rows where it is not exactly once
df["cnt_Buchungstext"] = df["Text"].str.count("word")
df_search = df[df["cnt_Buchungstext"] != 1]
if len(df_search) > 0:
    print(df_search)
del df_search
df = df.drop(columns=["cnt_Buchungstext"])
Plotting via Matplotlib
see Matplotlib