File size: 4,676 Bytes
4eae158
 
 
9ec41ea
4eae158
2209635
 
1525307
b8985e3
4eae158
 
b8985e3
074df17
 
9ec41ea
b8985e3
9ec41ea
3ad2e71
2574ff7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f90a0b
4eae158
2574ff7
 
 
 
 
 
2209635
82271e0
 
 
 
2209635
82271e0
2209635
82271e0
 
 
2574ff7
1463d1d
9f25ef2
1463d1d
2574ff7
 
 
 
 
2209635
 
 
074df17
 
2209635
 
 
074df17
 
2209635
 
82271e0
2211ff7
5a1664a
cb0fc84
9ec41ea
82271e0
4eae158
074df17
9ec41ea
074df17
 
2574ff7
4eae158
074df17
4eae158
8581164
074df17
2574ff7
a787fb8
3ad2e71
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import torch
from torch.nn.functional import softmax
import shap
import requests
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from transformers import RobertaTokenizer,RobertaForSequenceClassification, pipeline,RobertaModel
from IPython.core.display import HTML
# Directory holding the fine-tuned detection checkpoint (tokenizer + weights).
model_dir = 'temp'
# Fine-tuned RoBERTa classifier used to predict Human vs. Chat-GPT authorship.
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
# Stock roberta-base encoder used only to produce sentence embeddings for
# the plagiarism/cosine-similarity check (kept separate from the classifier).
tokenizer1 = RobertaTokenizer.from_pretrained('roberta-base')
model1 = RobertaModel.from_pretrained('roberta-base')
# Minimum cosine similarity for a fetched article to count as "similar".
threshold=0.5
#pipe = pipeline("text-classification", model="thugCodeNinja/robertatemp")
# pipe = pipeline("text-classification",model=model,tokenizer=tokenizer)
def process_text(input_text):
    """Classify text as human- or ChatGPT-written and find similar web articles.

    Parameters
    ----------
    input_text : str
        The text to classify.

    Returns
    -------
    tuple
        (echoed text, probability string as a percentage, label string
        'Human'/'Chat-GPT', list of up to 5 [link, similarity] pairs sorted
        by similarity descending).
    """
    # Guard against empty input: the original only assigned ``text`` inside
    # ``if input_text:`` and crashed with a NameError on falsy input.
    if not input_text:
        return "", "0.0", "Invalid", []
    text = input_text

    # Classify with the fine-tuned RoBERTa sequence classifier.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=1)
    max_prob, predicted_class_id = torch.max(probs, dim=1)
    prob = str(round(max_prob.item() * 100, 2))
    # LABEL_0 maps to a human author; any other label is treated as Chat-GPT.
    final_label = 'Human' if model.config.id2label[predicted_class_id.item()] == 'LABEL_0' else 'Chat-GPT'
    processed_result = text

    def search(query):
        """Query the Google Custom Search JSON API; return parsed JSON or an error dict."""
        # SECURITY: credentials are hard-coded and were committed to source.
        # Rotate this key and load it from an environment variable / secret
        # store before deploying.
        api_key = 'AIzaSyClvkiiJTZrCJ8BLqUY9I38WYmbve8g-c8'
        search_engine_id = '53d064810efa44ce7'
        url = 'https://www.googleapis.com/customsearch/v1'
        # ``params=`` URL-encodes the query; the original interpolated it raw
        # into the URL, which broke on spaces and special characters.
        params = {'key': api_key, 'cx': search_engine_id, 'q': query, 'num': 5}
        try:
            response = requests.get(url, params=params, timeout=10)
            return response.json()
        except Exception as e:
            # Best-effort: search failures are reported, not raised.
            return {'error': str(e)}

    def get_article_text(url):
        """Fetch *url* and return the concatenated <p> text, or '' on any failure."""
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extraction may need adjusting per-site; <p> tags cover most articles.
                return ' '.join(p.get_text() for p in soup.find_all('p'))
        except Exception as e:
            print(f"An error occurred: {e}")
        # Non-200 responses fall through to here. (The original left
        # ``article_text`` unassigned on non-200 and relied on the broad
        # ``except`` to swallow the resulting NameError.)
        return ''

    def find_plagiarism(candidate):
        """Return up to 5 [link, similarity] pairs scoring above ``threshold``."""
        # Short snippets yield noisy search results; skip them (original behavior).
        if len(candidate) <= 300:
            return []
        search_results = search(candidate)
        if 'items' not in search_results:
            return []
        # The input embedding is loop-invariant — compute it once, not per result.
        encoding1 = tokenizer1(candidate, max_length=512, truncation=True, padding=True, return_tensors="pt")
        with torch.no_grad():
            embedding1 = model1(**encoding1).last_hidden_state.mean(dim=1)
        similar_articles = []
        for item in search_results['items']:
            link = item.get('link', '')
            article_text = get_article_text(link)
            if not article_text:
                continue
            encoding2 = tokenizer1(article_text, max_length=512, truncation=True, padding=True, return_tensors="pt")
            with torch.no_grad():
                embedding2 = model1(**encoding2).last_hidden_state.mean(dim=1)
            # Cosine similarity between mean-pooled last-hidden-state embeddings.
            similarity = cosine_similarity(embedding1, embedding2)[0][0]
            if similarity > threshold:
                similar_articles.append([link, float(similarity)])
        similar_articles.sort(key=lambda x: x[1], reverse=True)
        return similar_articles[:5]

    similar_articles = find_plagiarism(text)
    return processed_result, prob, final_label, similar_articles

# Assemble the Gradio UI: one text box in, four result widgets out, then serve.
text_input = gr.Textbox(label="Enter text")
outputs = [
    gr.Textbox(label="Processed text"),
    gr.Textbox(label="Probability"),
    gr.Textbox(label="Label"),
    gr.Dataframe(label="Similar Articles", headers=["Link", "Similarity"], row_count=5),
]
title = "Group 2- ChatGPT text detection module"
description = '''Please upload text files and text input responsibly and await the explainable results. The approach in place includes finetuning a Roberta model for text classification.Once the classifications are done the most similar articles are presented along with the alleged similarity'''
app = gr.Interface(
    fn=process_text,
    title=title,
    description=description,
    inputs=[text_input],
    outputs=outputs,
)
app.launch()