Spaces:
Runtime error
Runtime error
File size: 2,463 Bytes
b6a1553 c942b0f 3e7c541 c942b0f 3e7c541 cb329a5 c942b0f 3e7c541 b6a1553 8a0298f c942b0f 3e7c541 c942b0f 3e7c541 2a2af5a c942b0f b6a1553 8a0298f b6a1553 c942b0f b6a1553 c942b0f b6a1553 c942b0f b6a1553 c942b0f 65533e2 b6a1553 c942b0f b6a1553 c942b0f 8a0298f c942b0f cb329a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import gradio as gr
import torch
# Step 1: load the dataset and print its structure.
# If "KeyError: 'text'" shows up, check the column names in the output below.
# To use PubMed instead, replace "arxiv" with "pubmed".
dataset = load_dataset(
    path="armanc/scientific_papers",
    name="arxiv",
    trust_remote_code=True,
)
print(dataset)

# Step 2: prepare the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
def tokenize_function(examples, max_length=151):
    """Tokenize the ``"abstract"`` entry of *examples*.

    Args:
        examples: Mapping with an ``"abstract"`` entry holding the text(s)
            to encode (called with batches via ``dataset.map(...,
            batched=True)``).
        max_length: Length every sequence is padded and truncated to.
            Defaults to 151, the same length ``predict`` uses at
            inference time.

    Returns:
        The output of the module-level ``tokenizer`` for the abstracts.
    """
    return tokenizer(
        examples["abstract"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
# Run the tokenizer over the dataset in batches.
dataset = dataset.map(tokenize_function, batched=True)

# Step 3: load the model with a three-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=3,
)
# Training-data adjustment: add a label column.
def add_labels(example):
    """Ensure *example* has a ``"labels"`` entry and return it.

    A dummy label of 1 is inserted only when no label is present, so real
    labels in the data are never overwritten
    (1=positive, 0=negative, 2=neutral or similar).

    Args:
        example: Mutable mapping for one dataset row; modified in place.

    Returns:
        The same *example* object, now containing ``"labels"``.
    """
    if "labels" not in example:
        example["labels"] = 1  # dummy label, placeholder for real annotations
    return example
# Attach the label column to every example.
dataset = dataset.map(add_labels)

# Step 4: configure the training run.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=500,
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Step 5: build the trainer and start training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)
trainer.train()

# Step 6: save the model together with its tokenizer.
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")
# Step 7: expose the model through Gradio.
def predict(text):
    """Classify *text* and return one probability per label.

    Args:
        text: The text to score (the Gradio textbox content).

    Returns:
        A dict mapping ``"Label <i>"`` to the softmax probability of class
        ``i`` as a Python float.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=151,
    )
    # The tokenizer returns CPU tensors, but training may have left the model
    # on an accelerator; move the inputs to wherever the weights live.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Make sure dropout is disabled for inference.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Only one text is passed in, so take the first (and only) row.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    return {f"Label {i}": float(p) for i, p in enumerate(probabilities)}
# Gradio UI: a multi-line text box as input, label probabilities as output.
iface = gr.Interface(
    fn=predict,
    title="Scientific Paper Evaluator",
    description="This AI model scores scientific papers based on relevance, uniqueness, and redundancy.",
    inputs=gr.Textbox(lines=5, placeholder="Paste an abstract here..."),
    outputs=gr.Label(),
)
iface.launch()
|