Spaces:

peccavi
/

ai-text-watermarking-model

Sleeping

File size: 4,192 Bytes

060ac52

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from transformers import pipeline
from typing import List
from utils.config import load_config


class EntailmentAnalyzer:
    """Score paraphrases against an original sentence and filter them by entailment.

    The scoring model is a ``transformers`` pipeline built from the task and
    model named in the configuration passed to the constructor.
    """

    def __init__(self, config):
        """
        Initialize the EntailmentAnalyzer from an already-loaded configuration.

        Args:
        config: Mapping providing the keys 'task' and 'model'; both values are
            forwarded unchanged to ``transformers.pipeline``.
        """
        self.config = config
        self.entailment_pipeline = pipeline(task=self.config['task'], model=self.config['model'])

    def check_entailment(self, premise: str, hypothesis: str) -> float:
        """
        Check entailment between the premise and hypothesis.

        Args:
        premise: The premise sentence.
        hypothesis: The hypothesis sentence.

        Returns:
        float: The score the pipeline assigns to the 'entailment' label.

        Raises:
        ValueError: If the pipeline output contains no 'entailment' label.
        """
        # NOTE(review): the pair is joined with a literal "[SEP]" token; whether
        # the configured tokenizer treats that as a separator depends on the
        # model - confirm against the model named in the config.
        results = self.entailment_pipeline(f"{premise} [SEP] {hypothesis}", top_k=None)

        # Match case-insensitively: some models report the label as 'ENTAILMENT'.
        for item in results:
            if str(item['label']).lower() == 'entailment':
                return item['score']

        # An explicit error is easier to diagnose than a bare StopIteration.
        labels = [item['label'] for item in results]
        raise ValueError(f"No 'entailment' label in pipeline output; got labels: {labels}")

    def analyze_entailment(self, original_sentence: str, paraphrased_sentences: List[str], threshold: float) -> tuple:
        """
        Analyze entailment scores for paraphrased sentences. If no sentence reaches
        the threshold, lower it in steps of 0.1 and re-partition until at least one
        sentence is selected or the threshold reaches 0.

        Args:
        original_sentence: The original sentence (used as the premise).
        paraphrased_sentences: List of paraphrased sentences (each used as a hypothesis).
        threshold: Minimum score to select a sentence.

        Returns:
        tuple: Three dictionaries mapping sentence -> score: all sentences, the
        selected sentences (score >= final threshold) and the discarded sentences
        (everything else). Selected and discarded never overlap.
        """
        # Scores do not depend on the threshold, so run the model only once per
        # sentence instead of once per threshold-lowering pass.
        all_sentences = {
            sentence: self.check_entailment(original_sentence, sentence)
            for sentence in paraphrased_sentences
        }

        while True:
            # Rebuild both partitions on every pass so that a sentence rejected at
            # a higher threshold does not linger in the discarded set once selected.
            selected_sentences = {}
            discarded_sentences = {}
            for sentence, score in all_sentences.items():
                if score >= threshold:
                    selected_sentences[sentence] = score
                else:
                    discarded_sentences[sentence] = score

            if selected_sentences:
                break

            # Round each step: repeated "-= 0.1" drifts (e.g. 0.7 ends near 2.8e-17
            # instead of 0), which would skip the stop condition below and trigger
            # an extra pass with an effectively zero threshold.
            lowered_threshold = round(threshold - 0.1, 10)
            print(f"No selected sentences found. Lowering the threshold by 0.1 (from {threshold} to {lowered_threshold}).")
            threshold = lowered_threshold
            if threshold <= 0:
                print("Threshold has reached 0. No sentences meet the criteria.")
                break

        return all_sentences, selected_sentences, discarded_sentences


if __name__ == "__main__":
    # Demo: score a handful of paraphrases of one sentence and print the split.
    #
    # Config resolution order:
    #   1. a path given as the first command-line argument,
    #   2. the original developer's absolute path, if it exists on this machine,
    #   3. the repository-relative default next to this package.
    # Previously the absolute path always won, so the demo ran on one machine only.
    legacy_config_path = '/home/ashhar21137/text_wm/scratch/utils/config/config.yaml'
    default_config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')

    if len(sys.argv) > 1:
        config_path = sys.argv[1]
    elif os.path.exists(legacy_config_path):
        config_path = legacy_config_path
    else:
        config_path = default_config_path

    config = load_config(config_path)

    # The analyzer takes only its own section of the loaded configuration.
    entailment_analyzer = EntailmentAnalyzer(config['PECCAVI_TEXT']['Entailment'])

    all_sentences, selected_sentences, discarded_sentences = entailment_analyzer.analyze_entailment(
        "The weather is nice today",
        [
            "The climate is pleasant today",
            "It's a good day weather-wise",
            "Today, the weather is terrible",
            "What a beautiful day it is",
            "The sky is clear and the weather is perfect",
            "It's pouring rain outside today",
            "The weather isn't bad today",
            "A lovely day for outdoor activities"
        ],
        0.7
    )

    print("----------------------- All Sentences -----------------------")
    print(all_sentences)
    print("----------------------- Discarded Sentences -----------------------")
    print(discarded_sentences)
    print("----------------------- Selected Sentences -----------------------")
    print(selected_sentences)