Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ import io
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
 # Load fine-tuned model and tokenizer
-model_name = "TAgroup5/
+model_name = "TAgroup5/news-classification-model"
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -36,10 +36,12 @@ if uploaded_file is not None:
 
 # Preprocessing function
 def preprocess_text(text):
-    text = text.lower() #
-    text = re.sub(r'\s
-
-
+    text = text.lower()  # Convert to lowercase
+    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters & numbers
+    tokens = word_tokenize(text)  # Tokenization
+    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
+    return " ".join(tokens)
 
 # Apply preprocessing and classification
 df['processed_content'] = df['content'].apply(preprocess_text)
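
For reference, the rewritten preprocess_text depends on names defined elsewhere in app.py that don't appear in this diff (re, word_tokenize, stop_words, lemmatizer). A minimal sketch of that supporting setup, assuming the standard NLTK tokenizer, stopword list, and WordNet lemmatizer; the rest of the file isn't shown, so these exact definitions are an assumption:

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time downloads of the tokenizer model, stopword list, and WordNet data
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))  # membership set for stopword removal
lemmatizer = WordNetLemmatizer()              # maps tokens to their dictionary form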
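
The first hunk also imports pipeline alongside the model and tokenizer, which suggests classification runs through a Transformers text-classification pipeline. A hedged sketch of that wiring; the actual call site is outside this diff, and the "category" column name is hypothetical:

# Hypothetical wiring: the file imports `pipeline`, so classification is
# presumably run like this (the exact call isn't shown in the diff).
classifier = pipeline(
    "text-classification",
    model=model,          # fine-tuned model loaded above
    tokenizer=tokenizer,  # matching tokenizer loaded above
)

# The pipeline returns a list of {"label": ..., "score": ...} dicts per input,
# so keep the top predicted label for each row.
df["category"] = df["processed_content"].apply(
    lambda text: classifier(text)[0]["label"]
)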