quickaid / ai.py
noobmaster1246's picture
Update ai.py
49a6a82 verified
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from symspellpy import SymSpell, Verbosity
import gradio as gr
# Ensure Hugging Face cache directory is writable
# NOTE(review): this assignment runs after `transformers` and
# `sentence_transformers` have already been imported above - confirm the
# libraries still honour TRANSFORMERS_CACHE when it is set this late.
os.environ["TRANSFORMERS_CACHE"] = "/home/user/.cache/huggingface"
# Set device
# Everything runs on the CPU; `device` is the target used when the classifier
# weights are loaded and when input tensors are built.
device = torch.device("cpu")
# Define DiseaseClassifier Model
class DiseaseClassifier(nn.Module):
    """Fully connected classifier: three hidden layers followed by a logit layer.

    Each hidden layer is a linear projection followed by LeakyReLU and dropout.
    The final linear layer returns raw, un-normalised class scores.
    """

    def __init__(self, input_size, num_classes, dropout_rate=0.35665610394511454):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            num_classes: Number of output classes (width of the logit layer).
            dropout_rate: Dropout probability applied after every hidden layer.
        """
        super().__init__()
        first_width, second_width, third_width = 382, 389, 433
        # The attribute names fc1..fc4 define the state_dict keys, and the
        # creation order defines how a seeded RNG initialises the weights.
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, third_width)
        self.fc4 = nn.Linear(third_width, num_classes)
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.dropout(self.activation(layer(hidden)))
        return self.fc4(hidden)  # Logits
# Define DiseasePredictionModel
class DiseasePredictionModel:
    """Pipeline from a free-text symptom description to a disease label.

    The text is spell-corrected with SymSpell, symptom mentions are pulled out
    with a token-classification pipeline, every mention is mapped to the
    closest known symptom by sentence-embedding cosine similarity, and the
    resulting indicator vector is classified by a ``DiseaseClassifier``.
    """

    # Entity group that `extract_symptoms` keeps from the NER output.
    # NOTE(review): confirm that the "dmis-lab/biobert-v1.1" checkpoint loaded
    # in __init__ provides a token-classification head that emits this label;
    # if it does not, no symptom is ever extracted.
    SYMPTOM_ENTITY_GROUP = "DISEASE"

    def __init__(
        self,
        ai_model_name="model.pth",
        data_file="data.csv",
        symptom_json="symptoms.json",
        dictionary_file="frequency_dictionary_en_82_765.txt",
    ):
        """Load the dataset, the classifier weights and the NLP components.

        Args:
            ai_model_name: Path to the saved ``DiseaseClassifier`` state dict.
            data_file: CSV whose first column holds the disease labels and
                whose remaining columns are used to fit the feature scaler.
            symptom_json: JSON file containing the known symptom names.
            dictionary_file: SymSpell frequency dictionary (term in column 0,
                count in column 1).
        """
        import warnings

        # Load dataset
        self.df = pd.read_csv(data_file)
        # The symptom file is read once; both attributes refer to the same list.
        self.symptom_columns = self.load_symptoms(symptom_json)
        self.SYMPTOM_LIST = self.symptom_columns
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.df.iloc[:, 0])
        self.scaler = StandardScaler()
        self.scaler.fit(self.df.iloc[:, 1:].values)
        self.input_size = len(self.symptom_columns)
        self.num_classes = len(self.label_encoder.classes_)
        self.model = self._load_model(ai_model_name)
        # Load SymSpell dictionary
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_loaded = self.sym_spell.load_dictionary(
            dictionary_file, term_index=0, count_index=1
        )
        if not dictionary_loaded:
            # A falsy result means the dictionary was not loaded; without it
            # spell correction would silently become a no-op, so make the
            # problem visible while still letting the app start.
            warnings.warn(
                f"SymSpell dictionary {dictionary_file!r} could not be loaded; "
                "spell correction is disabled."
            )
        # Load BioBERT tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
        self.nlp_model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-v1.1")
        self.ner_pipeline = pipeline(
            "ner",
            model=self.nlp_model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )
        # Load Sentence Transformer
        self.semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def _load_model(self, ai_model_name):
        """Build the classifier, load its weights and return it in eval mode."""
        model = DiseaseClassifier(self.input_size, self.num_classes).to(device)
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # that come from a trusted source.
        model.load_state_dict(torch.load(ai_model_name, map_location=device))
        model.eval()
        return model

    def predict_disease(self, symptoms):
        """Return the disease label predicted for an iterable of symptom names.

        Names that are not present in ``self.symptom_columns`` are ignored.

        NOTE(review): the scaler is fitted on every non-label CSV column; this
        assumes those columns match ``symptom_columns`` in number and order -
        confirm against the training code.
        """
        # Build the name -> position lookup once instead of scanning the list
        # for every symptom. setdefault keeps the first position of a
        # duplicated name, matching list.index().
        column_index = {}
        for position, name in enumerate(self.symptom_columns):
            column_index.setdefault(name, position)

        features = np.zeros(len(self.symptom_columns))
        for symptom in symptoms:
            position = column_index.get(symptom)
            if position is not None:
                features[position] = 1

        scaled = self.scaler.transform([features])
        batch = torch.tensor(scaled, dtype=torch.float32).to(device)
        with torch.no_grad():
            logits = self.model(batch)
        class_id = int(logits.argmax(dim=1)[0])
        return self.label_encoder.inverse_transform([class_id])[0]

    def load_symptoms(self, json_file):
        """Return the parsed content of the UTF-8 JSON file *json_file*."""
        with open(json_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def _symptom_vocabulary(self):
        """Return the lowercase symptom names plus each of their words."""
        vocabulary = set()
        for symptom in self.SYMPTOM_LIST:
            lowered = symptom.lower()
            vocabulary.add(lowered)
            # Input text is compared word by word, so the words of a
            # multi-word symptom have to be protected individually.
            vocabulary.update(lowered.split())
        return vocabulary

    def correct_text(self, text):
        """Spell-correct *text* word by word and return it joined by spaces.

        Words that belong to the symptom vocabulary (case-insensitive) are
        kept verbatim. Every other word is replaced by the closest SymSpell
        suggestion when one exists, and kept unchanged otherwise.
        """
        known_terms = self._symptom_vocabulary()
        corrected_words = []
        for word in text.split():
            if word.lower() in known_terms:
                corrected_words.append(word)
                continue
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            corrected_words.append(suggestions[0].term if suggestions else word)
        return " ".join(corrected_words)

    def extract_symptoms(self, text):
        """Return the distinct lowercase symptom mentions found in *text*.

        Only entities whose ``entity_group`` equals ``SYMPTOM_ENTITY_GROUP``
        are kept. Mentions are returned in order of first appearance; blank
        text yields an empty list without running the NER pipeline.
        """
        if not text or not text.strip():
            return []
        mentions = {}  # dict used as an insertion-ordered set
        for entity in self.ner_pipeline(text):
            if entity["entity_group"] == self.SYMPTOM_ENTITY_GROUP:
                mentions.setdefault(entity["word"].lower(), None)
        return list(mentions)

    def _symptom_embeddings(self):
        """Return the embeddings of ``SYMPTOM_LIST``, computing them on first use.

        The result is cached on the instance; the cache is not refreshed if
        ``SYMPTOM_LIST`` is modified afterwards.
        """
        cached = getattr(self, "_symptom_embedding_cache", None)
        if cached is None:
            cached = self.semantic_model.encode(self.SYMPTOM_LIST, convert_to_tensor=True)
            self._symptom_embedding_cache = cached
        return cached

    def match_symptoms(self, extracted_symptoms):
        """Map each extracted mention to its most similar known symptom.

        Args:
            extracted_symptoms: Iterable of symptom strings.

        Returns:
            A list with the best-matching entry of ``SYMPTOM_LIST`` for every
            distinct mention, in order of first appearance. The list is empty
            when there is nothing to match.
        """
        queries = list(extracted_symptoms)
        if not queries or not self.SYMPTOM_LIST:
            return []
        query_embeddings = self.semantic_model.encode(queries, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(query_embeddings, self._symptom_embeddings())
        best_positions = similarity.argmax(dim=1).tolist()
        matched = {
            query: self.SYMPTOM_LIST[position]
            for query, position in zip(queries, best_positions)
        }
        return list(matched.values())
# Initialize Model
# Built once when the module is executed, using the constructor's default
# file names; this single instance is shared by every call to `predict`.
model = DiseasePredictionModel()
# Define Prediction Function
def predict(symptoms):
    """Predict a disease from a free-text symptom description.

    The text is spell-corrected, symptom mentions are extracted and mapped to
    known symptoms, and the matches are classified by the module-level
    ``model``.

    Args:
        symptoms: Free-text description entered by the user.

    Returns:
        The predicted disease label, or an explanatory message when the input
        is blank or contains no recognizable symptom. In those cases the
        classifier would otherwise be run on an all-zero feature vector.
    """
    if not symptoms or not symptoms.strip():
        return "Please describe your symptoms."
    corrected = model.correct_text(symptoms)
    extracted = model.extract_symptoms(corrected)
    matched = list(model.match_symptoms(extracted))
    if not matched:
        return "No known symptoms were recognized in the description."
    return model.predict_disease(matched)
# Define Gradio Interface
# A single text input is passed to `predict` and its return value is shown in
# a text output. `launch()` is called unconditionally when this module runs.
iface = gr.Interface(fn=predict, inputs="text", outputs="text", title="Disease Prediction AI")
iface.launch()