Spaces:
Running
on
Zero
Running
on
Zero
import datasets | |
import polars as pl | |
from loguru import logger | |
from polars import datatypes as pdt | |
# Hugging Face dataset repo ids passed to `datasets.load_dataset` in this script.
BASE_REPO_ID = "ai-conferences/ICLR2025"  # base paper table
PATCH_REPO_ID = "ai-conferences/ICLR2025-patches"  # timestamped per-paper patches layered over the base table
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"  # paper-page data left-joined onto the base table by `arxiv_id`
def get_patch_latest_values(
    df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
) -> pl.DataFrame:
    """Collapse a patch log into one row per id holding the newest non-null value of each column.

    Every column of ``df`` other than ``id_col`` and ``timestamp_col`` is treated
    as a patchable field. A null cell means "this patch does not touch the
    field" and is ignored; among the remaining cells the one with the greatest
    timestamp wins (ties are resolved in favour of the later row in ``df``).

    List-typed columns are temporarily serialized to ``delimiter``-joined
    strings and split back afterwards.
    NOTE(review): this round-trip is lossy for elements that themselves contain
    ``delimiter``, and an empty list presumably does not come back as an empty
    list -- confirm whether patches can carry such values.

    Args:
        df: Patch rows. Must contain ``id_col`` and ``timestamp_col``.
        all_columns: Columns the result must expose. Columns for which the
            patches supply no value are added as all-null columns.
        id_col: Name of the column identifying the patched record.
        timestamp_col: Name of the column used to order patches.
        delimiter: Separator used while list columns are serialized.

    Returns:
        A frame with ``id_col`` first, followed by the remaining entries of
        ``all_columns`` in their given order. Only ids with at least one
        non-null patched value appear.
    """
    list_cols = [
        col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
    ]
    # Serialize list columns to strings before reshaping to long form
    # (presumably so they can share the single `value` column with the other fields).
    df = df.with_columns(
        [
            pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
            for c in list_cols
        ]
    )
    update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
    # Long form: one row per (timestamp, id, field); null cells carry no update.
    melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
    latest_rows = (
        # A single stable sort is enough: within each (id, field) group the rows
        # end up in timestamp order, so `.last()` picks the newest value.
        melted.sort(timestamp_col, maintain_order=True)
        .group_by([id_col, "variable"])
        .agg(pl.col("value").last())
        .pivot("variable", index=id_col, values="value")
    )
    # Only restore list columns that survived the pivot. A list column without
    # any non-null patch value is removed by `drop_nulls` above and never
    # becomes a pivot column; referencing it here would fail on a missing column.
    restorable_list_cols = [c for c in list_cols if c in latest_rows.columns]
    if restorable_list_cols:
        latest_rows = latest_rows.with_columns(
            [
                pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
                for c in restorable_list_cols
            ]
        )
    # Pad with all-null columns so the result always exposes `all_columns`.
    missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
    if missing_cols:
        latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
    return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
def format_author_claim_ratio(row: dict) -> str:
    """Render ``"<linked>/<total>"`` for a row, with a check mark when any author is linked.

    Args:
        row: Mapping with the keys ``"n_linked_authors"`` and ``"n_authors"``.

    Returns:
        An empty string when either count is ``None``; otherwise the ratio,
        followed by ``" ✅"`` if at least one author is linked.
    """
    linked, total = row["n_linked_authors"], row["n_authors"]
    if any(count is None for count in (linked, total)):
        return ""
    pieces = [f"{linked}/{total}"]
    if linked > 0:
        pieces.append("✅")
    return " ".join(pieces).strip()
# Load the base paper table and normalize its column names.
df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
df_orig = df_orig.rename({"paper_url": "openreview", "submission_number": "paper_id"})
# Start every paper with empty lists of linked Space / model / dataset ids.
df_orig = df_orig.with_columns(
    [pl.lit([], dtype=pl.List(pl.Utf8)).alias(name) for name in ("space_ids", "model_ids", "dataset_ids")]
)

# Load the paper-page table, dropping columns that are not used here.
df_paper_page = datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train").to_polars()
df_paper_page = df_paper_page.drop(["summary", "author_names", "ai_keywords"])

# Attach paper-page columns to each paper; papers without a match keep nulls.
df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")
# Best-effort: overlay the latest patched values on the base table. Any failure
# is logged and `df_orig` is left as it was before this block.
try:
    df_patches = datasets.load_dataset(PATCH_REPO_ID, split="train").to_polars()
    # Drop the `diff` column and parse the timestamp strings into datetimes.
    df_patches = df_patches.drop("diff").with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
    df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
    # After the join, patched values live in "<col>_right"; prefer them over the
    # base value whenever they are non-null, then restore the original column set.
    df_orig = (
        df_orig.join(df_patches, on="paper_id", how="left")
        .with_columns(
            [
                pl.coalesce(pl.col(f"{name}_right"), pl.col(name)).alias(name)
                for name in df_orig.columns
                if name != "paper_id"
            ]
        )
        .select(df_orig.columns)
    )
except Exception as e:  # noqa: BLE001
    logger.warning(e)
# Display columns: a comma-separated author string, plus a markdown "[link](url)"
# cell for each URL column (empty string where the URL is null).
df_orig = df_orig.with_columns(
    pl.col("authors").list.join(", ").alias("authors_str"),
    *[
        pl.format("[link]({})", pl.col(url_col)).fill_null("").alias(f"{url_col}_md")
        for url_col in ("openreview", "project_page", "github")
    ],
)
# format paper page link
# `paper_page` is the Hugging Face paper page URL built from the arXiv id (null
# when the id is null); `paper_page_md` is the markdown cell showing the id as
# the link text, or an empty string when there is no id.
# The host is the canonical "huggingface.co" (no trailing dot after the TLD).
df_orig = df_orig.with_columns(
    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
# Author statistics: total number of authors, and how many entries of
# `author_usernames` are non-null (null when the list itself is null).
df_orig = df_orig.with_columns(
    pl.col("authors").list.len().alias("n_authors"),
    pl.col("author_usernames")
    .map_elements(
        lambda usernames: None if usernames is None else sum(1 for name in usernames if name is not None),
        return_dtype=pl.Int64,
    )
    .alias("n_linked_authors"),
)
# "<linked>/<total>" summary cell, derived row by row from the two counts above.
claim_counts = pl.struct(["n_linked_authors", "n_authors"])
df_orig = df_orig.with_columns(
    claim_counts.map_elements(format_author_claim_ratio, return_dtype=pl.Utf8).alias("claimed")
)
# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
# Render the numeric columns as strings, with nulls shown as empty strings.
# The multi-column selection keeps each column's own name.
df_orig = df_orig.with_columns(pl.col("upvotes", "num_comments").cast(pl.Utf8).fill_null(""))
# format spaces, models, datasets
# For each list-of-repo-ids column, build a markdown cell with one
# "[link](<base_url><repo_id>)" per id, separated by newlines (empty string when
# the list is null). Base URLs use the canonical "huggingface.co" host.
for repo_id_col, markdown_col, base_url in [
    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
    ("model_ids", "Models", "https://huggingface.co/"),
    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
]:
    df_orig = df_orig.with_columns(
        pl.col(repo_id_col)
        .map_elements(
            # Bind `base_url` as a default so the lambda does not depend on the
            # loop variable's value at call time.
            lambda lst, base_url=base_url: (
                "\n".join(f"[link]({base_url}{x})" for x in lst) if lst is not None else None
            ),
            return_dtype=pl.Utf8,
        )
        .fill_null("")
        .alias(markdown_col)
    )