# Spaces: peccavi / ai-text-watermarking-model
# Status: Sleeping
# File size: 8,616 Bytes
# 060ac52

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re

class NgramProcessor:
    """Find n-grams of 1-4 content words that are shared by a set of sentences.

    Sentences are normalised by lower-casing, keeping only runs of word
    characters and dropping NLTK's English stopwords. N-grams are built from
    the words that remain, while the position of every kept word in the
    original sentence is tracked so results can be mapped back to it.
    """

    def __init__(self):
        # Download the stopword corpus only when it is not already installed.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's stopword list

        The text is lower-cased and punctuation is discarded as a side effect
        of keeping only runs of word characters.

        Args:
            text (str): Input text

        Returns:
            str: Cleaned text with stopwords removed, words joined by single spaces
        """
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    @staticmethod
    def _contains_phrase(phrase, text):
        """Return True if `phrase` occurs in `text` on whole-word boundaries."""
        # The lookarounds stop "fox" from matching inside "foxes" while still
        # allowing adjacent punctuation ("fox.").
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        return re.search(pattern, text) is not None

    def _content_tokens(self, sentence):
        """Return `(word, word_index, char_offset)` for every non-stopword.

        `word` is lower-cased, `word_index` is the position of the
        whitespace-separated token that contains it (the same numbering as
        `sentence.split()`), and `char_offset` is where the word starts in
        `sentence`.
        """
        tokens = []
        for word_index, chunk in enumerate(re.finditer(r'\S+', sentence)):
            # A single whitespace token may hold several words ("well-known")
            # or carry punctuation ("dog."); every word keeps the token's index.
            for match in re.finditer(r'\w+', chunk.group()):
                word = match.group().lower()
                if word not in self.stop_words:
                    tokens.append((word, word_index, chunk.start() + match.start()))
        return tokens

    @staticmethod
    def _ngram_spans(tokens, n):
        """Map each n-gram (tuple of words) in `tokens` to its `(start, end)` word-index spans."""
        spans = {}
        for start in range(len(tokens) - n + 1):
            window = tokens[start:start + n]
            key = tuple(token[0] for token in window)
            spans.setdefault(key, []).append((window[0][1], window[-1][1]))
        return spans

    def is_exact_match(self, ngram, sentences):
        """
        Check if the given n-gram has an exact match in all sentences

        Matching is case-sensitive and respects word boundaries, so an n-gram
        never matches in the middle of a longer word.

        Args:
            ngram (str): The n-gram to search for
            sentences (list): List of sentences to search in

        Returns:
            bool: True if n-gram has exact match in all sentences, False otherwise
        """
        return all(self._contains_phrase(ngram, sentence) for sentence in sentences)

    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is contained in any previously found common n-grams

        Containment is checked on word boundaries; an n-gram identical to an
        entry of `common_ngrams` is not counted as being contained in it.

        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams

        Returns:
            bool: True if ngram is part of any other entry of common_ngrams, False otherwise
        """
        return any(
            ngram != other_ngram and self._contains_phrase(ngram, other_ngram)
            for other_ngram in common_ngrams
        )

    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams that have exact matches across all sentences,
        excluding those that are part of larger common n-grams

        N-grams are built from the stopword-free words of each sentence, from
        quadgrams down to unigrams, and reported in order of first appearance
        in the first sentence. Indices refer to positions in the original
        sentence's whitespace-split words; when stopwords sit between the
        words of an n-gram the span simply covers them.

        Args:
            sentences (list): List of sentences to analyze

        Returns:
            list: List of tuples where each tuple contains the n-gram and, for
                each sentence, the list of (start, end) word indices of its
                occurrences. Empty when `sentences` is empty.
        """
        if not sentences:
            return []

        tokenized = [self._content_tokens(sentence) for sentence in sentences]
        ngram_lengths = [4, 3, 2, 1]  # Quadgram, trigram, bigram, unigram
        common_ngrams = []

        for n in ngram_lengths:
            spans_per_sentence = [self._ngram_spans(tokens, n) for tokens in tokenized]

            # Dict order gives the n-grams in order of first appearance in sentence 0.
            for ngram in spans_per_sentence[0]:
                if not all(ngram in spans for spans in spans_per_sentence[1:]):
                    continue

                ngram_str = ' '.join(ngram)
                # Longer n-grams were collected first, so anything contained in
                # one of them is redundant and skipped.
                if self.is_substring_of_any(ngram_str, [found[0] for found in common_ngrams]):
                    continue

                indices = [spans[ngram] for spans in spans_per_sentence]
                common_ngrams.append((ngram_str, indices))

        return common_ngrams

    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams in the sentence

        Each n-gram is located on the sentence's stopword-free, lower-cased
        words, so capitalisation, punctuation and intervening stopwords do not
        hide it. If that fails, a plain substring search on the raw sentence is
        used so n-grams that themselves contain stopwords are still found.

        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of (n-gram, indices) tuples as returned
                by find_filtered_ngrams; only the n-gram string is used

        Returns:
            list: Sorted list of tuples with the character offset of the
                n-gram's first occurrence in the sentence and the n-gram;
                n-grams that do not occur are omitted
        """
        tokens = self._content_tokens(sentence)
        words = [token[0] for token in tokens]

        relative_order = []
        for ngram, _ in common_ngrams:
            target = ngram.lower().split()
            size = len(target)

            index = -1
            if size:
                for start in range(len(words) - size + 1):
                    if words[start:start + size] == target:
                        index = tokens[start][2]
                        break

            if index == -1:
                index = sentence.find(ngram)

            if index != -1:
                relative_order.append((index, ngram))

        return sorted(relative_order)

# Demo: run this module directly to print the n-grams shared by three sample
# sentences, followed by the order in which they occur in each sentence.
if __name__ == "__main__":
    processor = NgramProcessor()

    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]
    common_ngrams = processor.find_filtered_ngrams(sentences)

    print("Common n-grams and their indices:")
    for entry in common_ngrams:
        ngram, indices = entry
        print(f"{ngram}: {indices}")

    for sentence in sentences:
        print(
            f"Relative order in sentence '{sentence}':",
            processor.find_relative_order(sentence, common_ngrams),
        )



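# NOTE: Everything below is an earlier version of this module, kept commented
# out for reference only. Its find_filtered_ngrams collected plain n-gram
# strings without per-sentence indices.
# import nltk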
# from nltk.corpus import stopwords
# from nltk.util import ngrams
# from collections import Counter
# import re

# class NgramProcessor:
#     def __init__(self):
#         try:
#             nltk.data.find('corpora/stopwords')
#         except LookupError:
#             nltk.download('stopwords')

#         self.stop_words = set(stopwords.words('english'))

#     def remove_stopwords(self, text):
#         """
#         Remove stopwords using NLTK's stopword list

#         Args:
#             text (str): Input text

#         Returns:
#             str: Cleaned text with stopwords removed
#         """
#         words = re.findall(r'\w+', text.lower())
#         filtered_words = [word for word in words if word not in self.stop_words]
#         return ' '.join(filtered_words)

#     def is_exact_match(self, ngram, sentences):
#         """
#         Check if the given n-gram has an exact match in all sentences

#         Args:
#             ngram (str): The n-gram to search for
#             sentences (list): List of sentences to search in

#         Returns:
#             bool: True if n-gram has exact match in all sentences, False otherwise
#         """
#         return all(ngram in sentence for sentence in sentences)

#     def is_substring_of_any(self, ngram, common_ngrams):
#         """
#         Check if the given n-gram is an exact substring of any previously found common n-grams

#         Args:
#             ngram (str): The n-gram to check
#             common_ngrams (list): List of previously found common n-grams

#         Returns:
#             bool: True if ngram is a substring of any common_ngrams, False otherwise
#         """
#         return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)

#     def find_filtered_ngrams(self, sentences):
#         """
#         Find all n-grams that have exact matches across all sentences,
#         excluding those that are part of larger common n-grams

#         Args:
#             sentences (list): List of sentences to analyze

#         Returns:
#             list: List of all common n-grams in order of their appearance in the first sentence
#         """
#         sentences = [self.remove_stopwords(sentence) for sentence in sentences]
#         ngram_lengths = [4, 3, 2, 1]  # Quadgram, trigram, bigram, unigram
#         common_ngrams = []

#         for n in ngram_lengths:
#             ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
#             ngrams_counter = Counter(ngrams_list[0])

#             for ngram in ngrams_counter:
#                 ngram_str = ' '.join(ngram)
#                 if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams):
#                     common_ngrams.append(ngram_str)

#         return common_ngrams

#     def find_relative_order(self, sentence, common_ngrams):
#         """
#         Find the relative order of the common n-grams in the sentence

#         Args:
#             sentence (str): Sentence in which to find the relative order
#             common_ngrams (list): List of common n-grams

#         Returns:
#             list: List of tuples with the relative position and the n-gram
#         """
#         relative_order = []
#         for ngram in common_ngrams:
#             index = sentence.find(ngram)
#             if index != -1:
#                 relative_order.append((index, ngram))

#         return sorted(relative_order)

# # Example usage
# if __name__ == "__main__":
#     sentences = [
#         "The quick brown fox jumps over the lazy dog.",
#         "A quick brown dog outpaces a lazy fox.",
#         "Quick brown animals leap over lazy obstacles."
#     ]

#     processor = NgramProcessor()
#     common_ngrams = processor.find_filtered_ngrams(sentences)
#     print("Common n-grams:", common_ngrams)

#     for sentence in sentences:
#         relative_order = processor.find_relative_order(sentence, common_ngrams)
#         print(f"Relative order in sentence '{sentence}':", relative_order)