import pandas as pd from datetime import datetime, timedelta from scripts.utils import DATA_DIR # Basic Week over Week Retention def calculate_wow_retention_by_type( df: pd.DataFrame, market_creator: str ) -> pd.DataFrame: filtered_df = df.loc[df["market_creator"] == market_creator] # Get unique traders per week and type weekly_traders = ( filtered_df.groupby(["month_year_week", "trader_type"])["trader_address"] .nunique() .reset_index() ) weekly_traders = weekly_traders.sort_values(["trader_type", "month_year_week"]) # Calculate retention retention = [] # Iterate through each trader type for trader_type in weekly_traders["trader_type"].unique(): type_data = weekly_traders[weekly_traders["trader_type"] == trader_type] # Calculate retention for each week within this trader type for i in range(1, len(type_data)): current_week = type_data.iloc[i]["month_year_week"] previous_week = type_data.iloc[i - 1]["month_year_week"] # Get traders in both weeks for this type current_traders = set( filtered_df[ (filtered_df["month_year_week"] == current_week) & (filtered_df["trader_type"] == trader_type) ]["trader_address"] ) previous_traders = set( filtered_df[ (filtered_df["month_year_week"] == previous_week) & (filtered_df["trader_type"] == trader_type) ]["trader_address"] ) retained = len(current_traders.intersection(previous_traders)) retention_rate = ( (retained / len(previous_traders)) * 100 if len(previous_traders) > 0 else 0 ) retention.append( { "trader_type": trader_type, "week": current_week, "retained_traders": retained, "previous_traders": len(previous_traders), "retention_rate": round(retention_rate, 2), } ) return pd.DataFrame(retention) # Cohort Retention def calculate_cohort_retention( df: pd.DataFrame, market_creator: str, trader_type: str, max_weeks=12 ) -> pd.DataFrame: df_filtered = df.loc[ (df["market_creator"] == market_creator) & (df["trader_type"] == trader_type) ] # Get first week for each trader first_trades = ( df_filtered.groupby("trader_address") .agg({"creation_timestamp": "min", "month_year_week": "first"}) .reset_index() ) first_trades.columns = ["trader_address", "first_trade", "cohort_week"] # Get ordered list of unique weeks - converting to datetime for proper sorting all_weeks = df_filtered["month_year_week"].unique() weeks_datetime = pd.to_datetime(all_weeks) sorted_weeks_idx = weeks_datetime.argsort() all_weeks = all_weeks[sorted_weeks_idx] # Create mapping from week string to numeric index week_to_number = {week: idx for idx, week in enumerate(all_weeks)} # Merge back to get all activities cohort_data = pd.merge( df_filtered, first_trades[["trader_address", "cohort_week"]], on="trader_address", ) # Calculate week number since first activity cohort_data["cohort_number"] = cohort_data["cohort_week"].map(week_to_number) cohort_data["activity_number"] = cohort_data["month_year_week"].map(week_to_number) cohort_data["week_number"] = ( cohort_data["activity_number"] - cohort_data["cohort_number"] ) # Calculate retention by cohort cohort_sizes = cohort_data.groupby("cohort_week")["trader_address"].nunique() retention_matrix = cohort_data.groupby(["cohort_week", "week_number"])[ "trader_address" ].nunique() retention_matrix = retention_matrix.unstack(fill_value=0) # Convert to percentages retention_matrix = retention_matrix.div(cohort_sizes, axis=0) * 100 # Sort index (cohort_week) chronologically retention_matrix.index = pd.to_datetime(retention_matrix.index) retention_matrix = retention_matrix.sort_index() # Limit to max_weeks if specified if max_weeks is not None and max_weeks < retention_matrix.shape[1]: retention_matrix = retention_matrix.iloc[:, :max_weeks] return retention_matrix.round(2) def merge_retention_dataset( traders_df: pd.DataFrame, unknown_df: pd.DataFrame ) -> pd.DataFrame: traders_df["trader_type"] = traders_df["staking"].apply( lambda x: "non_Olas" if x == "non_Olas" else "Olas" ) unknown_df["trader_type"] = "unclassified" all_traders = pd.concat([traders_df, unknown_df], ignore_index=True) all_traders["creation_timestamp"] = pd.to_datetime( all_traders["creation_timestamp"] ) all_traders = all_traders.sort_values(by="creation_timestamp", ascending=True) all_traders["month_year_week"] = ( all_traders["creation_timestamp"].dt.to_period("W").dt.strftime("%b-%d-%Y") ) return all_traders def prepare_retention_dataset( retention_df: pd.DataFrame, unknown_df: pd.DataFrame ) -> pd.DataFrame: retention_df["trader_type"] = retention_df["staking"].apply( lambda x: "non_Olas" if x == "non_Olas" else "Olas" ) retention_df.rename(columns={"request_time": "creation_timestamp"}, inplace=True) retention_df = retention_df[ ["trader_type", "market_creator", "trader_address", "creation_timestamp"] ] unknown_df["trader_type"] = "unclassified" unknown_df = unknown_df[ ["trader_type", "market_creator", "trader_address", "creation_timestamp"] ] all_traders = pd.concat([retention_df, unknown_df], ignore_index=True) all_traders["creation_timestamp"] = pd.to_datetime( all_traders["creation_timestamp"] ) all_traders = all_traders.sort_values(by="creation_timestamp", ascending=True) all_traders["month_year_week"] = ( all_traders["creation_timestamp"].dt.to_period("W").dt.strftime("%b-%d-%Y") ) return all_traders if __name__ == "__main__": # read all datasets traders_df = pd.read_parquet(DATA_DIR / "all_trades_profitability.parquet") unknown_df = pd.read_parquet(DATA_DIR / "unknown_traders.parquet") all_traders = prepare_retention_dataset(traders_df, unknown_df) # Usage example: wow_retention = calculate_wow_retention_by_type(all_traders) cohort_retention = calculate_cohort_retention(all_traders)