File size: 5,811 Bytes
060ac52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import nltk
import logging
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm

# Set logging to WARNING for minimal console output.
# NOTE: the logger.info(...) calls throughout this module are suppressed at
# this level; lower to logging.INFO to see the per-ngram trace.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
    """Find n-grams shared by every sentence of a corpus and map them back to
    word-index spans in the original (un-cleaned) sentences.
    """

    def __init__(self):
        # Fetch the NLTK stopword corpus only if it is not already installed.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        """Lowercase *text*, keep only word-character runs, and drop English
        stopwords. Returns the surviving tokens joined by single spaces.
        """
        words = re.findall(r'\w+', text.lower())
        return ' '.join(word for word in words if word not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """Return True iff *ngram* occurs as whole words in every sentence.

        Fix: the previous plain-substring test (``ngram in sentence``) let
        'dog' match inside 'dogma'; a word-boundary regex prevents that.
        """
        # Lazy %-args: the message is only formatted if INFO is enabled.
        logger.info("Checking exact match for ngram: %s", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = all(pattern.search(sentence) for sentence in sentences)
        logger.info("Exact match result for '%s': %s", ngram, result)
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        """Return True iff *ngram* appears as whole words inside a different,
        already-accepted common n-gram (drops redundant sub-n-grams).

        Same word-boundary fix as is_exact_match: 'ox' no longer matches
        inside 'fox'.
        """
        logger.info("Checking if ngram: %s is substring of any common ngram.", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = any(pattern.search(other_ngram)
                     for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info("Substring check result for '%s': %s", ngram, result)
        return result

    def find_filtered_ngrams(self, sentences):
        """Return ``{original_sentence: {ngram: [(start, end), ...]}}`` where
        each ngram (built from stopword-cleaned text, n = 4 down to 1) occurs
        in every sentence and (start, end) are word indices into the ORIGINAL
        sentence.
        """
        if not sentences:
            # Previously sentences_cleaned[0] raised IndexError on an empty corpus.
            return {}
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]
        # Longest n first so shorter n-grams nested inside an accepted longer
        # one are rejected by is_substring_of_any.
        common_ngrams = []
        for n in (4, 3, 2, 1):
            # Candidates come from the first sentence only (the old code built
            # n-gram lists for every sentence and then discarded all but [0]);
            # is_exact_match verifies each candidate against the whole corpus.
            # The old per-token stopword check was dead code: cleaned
            # sentences cannot contain stopwords.
            for ngram in Counter(ngrams(sentences_cleaned[0].split(), n)):
                ngram_str = ' '.join(ngram)
                if (self.is_exact_match(ngram_str, sentences_cleaned)
                        and not self.is_substring_of_any(ngram_str, common_ngrams)):
                    common_ngrams.append(ngram_str)

        result = {}
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            result[sentence] = self._map_ngrams_to_original(
                sentence, cleaned_sentence, common_ngrams)
        return result

    def _map_ngrams_to_original(self, sentence, cleaned_sentence, common_ngrams):
        """Locate each common n-gram in *cleaned_sentence* and translate the
        match to (start, end) word indices of the original *sentence*.
        """
        original_words = sentence.split()
        cleaned_words = cleaned_sentence.split()
        # Map cleaned-token index -> original-token index. Each original token
        # is expanded exactly the way remove_stopwords tokenizes it, so
        # punctuation-only tokens (which yield NO cleaned token) and tokens
        # that split into several cleaned tokens stay aligned. The old
        # one-slot-per-original-word mapping drifted after such tokens
        # (e.g. a lone "." consumed a cleaned index it never produced).
        index_map = {}
        cleaned_idx = 0
        for orig_idx, word in enumerate(original_words):
            for piece in re.findall(r'\w+', word.lower()):
                if piece not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1
        sentence_result = {}
        for ngram in common_ngrams:
            ngram_words = ngram.split()
            span = len(ngram_words)
            indices = []
            for i in range(len(cleaned_words) - span + 1):
                if cleaned_words[i:i + span] == ngram_words and i in index_map:
                    start_idx = index_map[i]
                    end_idx = index_map.get(i + span - 1, start_idx)
                    # Accept only spans contiguous in the original sentence
                    # (no stopwords were removed from inside the span).
                    if end_idx - start_idx == span - 1:
                        indices.append((start_idx, end_idx))
            if indices:
                sentence_result[ngram] = indices
        return sentence_result

    def find_relative_order(self, sentence, common_ngrams):
        """Return ``[(rank, ngram), ...]`` with rank starting at 1, ordered by
        each n-gram's first case-insensitive occurrence in *sentence*;
        n-grams not found in the sentence are omitted.
        """
        sentence = sentence.lower()
        relative_order = []
        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = sentence.find(ngram.lower())
            if index != -1:
                relative_order.append((index, ngram))
        sorted_pairs = sorted(relative_order)
        return [(i + 1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]

# Example usage: find the n-grams common to three paraphrases and print the
# per-sentence index spans.
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    # Lazy %-formatting: the f-strings used before were built eagerly even
    # though INFO is suppressed by the WARNING log level.
    logger.info("Common n-grams and their indices per sentence: %s", common_ngrams)
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info("Sentence: %s -> Order: %s", sentence, order)


"""

{
'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, 
'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, 
'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""