import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm

CACHE_TIME = 60 * 60 * 12
MAX_RESULTS = 30_000
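
# Query arXiv for recent papers matching the dataset-focused search; the result
# list is cached with a 12-hour TTL so the slow download is not repeated on
# every request.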
@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
    search = arxiv.Search(
        query="ti:dataset AND abs:machine learning",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "category": result.primary_category,
            "updated": result.updated,
        }
        for result in tqdm(search.results(), total=MAX_RESULTS)
    ]
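
# Load the SetFit classifier that predicts whether a paper introduces a new dataset.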
def load_model():
    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
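
# Combine a paper's title and abstract into the text format the classifier expects.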
def format_row_for_model(row):
    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
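
# Map the model's class indices to human-readable labels.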
int2label = {0: "new_dataset", 1: "not_new_dataset"}
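
# Run the classifier over all papers in batches, attaching the predicted label
# and its probability to each row.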
def get_predictions(data: list[dict], model=None, batch_size=32):
    if model is None:
        model = load_model()
    predictions = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i : i + batch_size]
        text_inputs = [format_row_for_model(row) for row in batch]
        batch_predictions = model.predict_proba(text_inputs)
        for j, row in enumerate(batch):
            prediction = batch_predictions[j]
            row["prediction"] = int2label[int(prediction.argmax())]
            row["probability"] = float(prediction.max())
            predictions.append(row)
    return predictions
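
# Render a single paper as a Markdown snippet linking to its Hugging Face Papers page.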
def create_markdown(row):
    title = row["title"]
    abstract = row["abstract"]
    arxiv_id = row["arxiv_id"]
    hub_paper_url = f"https://huggingface.co./papers/{arxiv_id}"
    updated = row["updated"]
    updated = updated.strftime("%Y-%m-%d")
    broad_category = row["broad_category"]
    category = row["category"]
    return f""" <h1> {title} </h1> updated: {updated}
 | category: {broad_category} | subcategory: {category} |
\n\n{abstract}
\n\n [Hugging Face Papers page]({hub_paper_url})
    """
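
# Download, classify, and post-process the papers into a DataFrame; cached so
# repeated calls within the TTL window reuse the same result.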
@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def prepare_data():
    print("Downloading arXiv results...")
    arxiv_results = get_arxiv_result()
    print("Loading model...")
    model = load_model()
    print("Making predictions...")
    predictions = get_predictions(arxiv_results, model=model)
    df = pd.DataFrame(predictions)
    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
    return df
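
# Pre-compute the category choices used to populate the dropdown; prepare_data()
# is cached, so these two calls reuse the same DataFrame.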
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
broad_categories = prepare_data().broad_category.unique().tolist()
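
# Build the Markdown feed, optionally filtered to the selected broad categories.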
def create_markdown_summary(categories=broad_categories, all_categories=None):
    df = prepare_data()
    if categories is not None:
        df = df[df["broad_category"].isin(categories)]
    return "\n\n".join(df["markdown"].tolist())
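
# Schedule a daily prepare_data call at 03:30; combined with the 12-hour cache
# TTL, this keeps the feed reasonably fresh without recomputing on every visit.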
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()
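
# Gradio UI: a category filter plus the rendered list of likely new-dataset papers.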
with gr.Blocks() as demo:
    gr.Markdown("## New Datasets in Machine Learning")
    gr.Markdown(
        "This Space attempts to show new papers on arXiv that are *likely* to"
        " introduce new datasets. \n\n"
    )
    broad_categories_dropdown = gr.Dropdown(
        choices=broad_categories,
        label="Categories",
        multiselect=True,
        value=broad_categories,
    )
    results = gr.Markdown(create_markdown_summary())
    broad_categories_dropdown.change(create_markdown_summary, broad_categories_dropdown, results)

demo.launch()