import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams


class NgramProcessor:
    """Find n-grams that appear verbatim in every sentence of a set, ignoring stopwords."""

    def __init__(self):
        # Download the stopword corpus lazily if it is not already available.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's English stopword list.

        Args:
            text (str): Input text

        Returns:
            str: Lowercased text with punctuation and stopwords removed
        """
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)
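
    # Illustrative behaviour, assuming NLTK's default English stopword list:
    #   remove_stopwords("The quick brown fox jumps over the lazy dog.")
    #   -> "quick brown fox jumps lazy dog"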
    def is_exact_match(self, ngram, sentences):
        """
        Check whether the given n-gram occurs, on word boundaries, in every sentence.

        Args:
            ngram (str): The n-gram to search for
            sentences (list): List of (stopword-free, lowercased) sentences to search in

        Returns:
            bool: True if the n-gram occurs in all sentences, False otherwise
        """
        # A word-boundary search avoids false positives such as "own fox" matching
        # inside "brown fox", which a plain substring check would accept.
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        return all(pattern.search(sentence) for sentence in sentences)
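
    # Illustrative behaviour (sentences are expected to be stopword-free and lowercased):
    #   is_exact_match("quick brown", ["quick brown fox", "quick brown dog"])  -> True
    #   is_exact_match("quick brown", ["quick brown fox", "lazy brown dog"])   -> False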
    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is a substring of any previously found common n-gram.

        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams

        Returns:
            bool: True if ngram is a substring of any entry in common_ngrams, False otherwise
        """
        return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
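
    # Illustrative behaviour:
    #   is_substring_of_any("quick", ["quick brown"])  -> True   (absorbed by the longer n-gram)
    #   is_substring_of_any("lazy", ["quick brown"])   -> False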
    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams that have exact matches across all sentences,
        excluding those that are part of larger common n-grams.

        Args:
            sentences (list): List of sentences to analyze

        Returns:
            list: List of (ngram, indices) tuples, where indices holds, for each
                  sentence, the (start, end) word-index spans of the n-gram in the
                  original (unfiltered) sentence
        """
        original_sentences = sentences[:]
        # Candidate n-grams are drawn from stopword-free, lowercased sentences.
        sentences = [self.remove_stopwords(sentence) for sentence in sentences]
        ngram_lengths = [4, 3, 2, 1]  # Quadgram, trigram, bigram, unigram
        common_ngrams = []
        for n in ngram_lengths:
            ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
            # Only n-grams from the first sentence can be common to all sentences.
            ngrams_counter = Counter(ngrams_list[0])
            for ngram in ngrams_counter:
                ngram_str = ' '.join(ngram)
                if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(
                        ngram_str, [ng[0] for ng in common_ngrams]):
                    indices = []
                    for original_sentence in original_sentences:
                        words = original_sentence.split()
                        # Record contiguous occurrences in the original sentence; each window
                        # is normalized (lowercased, punctuation stripped) so that e.g.
                        # "lazy dog." still matches the n-gram "lazy dog".
                        ngram_indices = [
                            (i, i + n - 1) for i in range(len(words) - n + 1)
                            if ' '.join(re.findall(r'\w+', ' '.join(words[i:i + n]).lower())) == ngram_str
                        ]
                        indices.append(ngram_indices)
                    common_ngrams.append((ngram_str, indices))
        return common_ngrams
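
    # Illustrative return shape (exact values depend on the stopword list in use):
    #   [
    #       ("quick brown", [[(1, 2)], [(1, 2)], [(0, 1)]]),
    #       ("lazy",        [[(7, 7)], [(6, 6)], [(5, 5)]]),
    #   ]
    # i.e. one tuple per surviving n-gram, holding its (start, end) word spans
    # in each original sentence.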
    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams within the sentence.

        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of (ngram, indices) tuples from find_filtered_ngrams

        Returns:
            list: Sorted list of (character position, ngram) tuples for the n-grams
                  that appear in the sentence
        """
        relative_order = []
        for ngram, _ in common_ngrams:
            # Compare case-insensitively, since the stored n-grams are lowercased.
            index = sentence.lower().find(ngram)
            if index != -1:
                relative_order.append((index, ngram))
        return sorted(relative_order)

# Example usage
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print("Common n-grams and their indices:")
    for ngram, indices in common_ngrams:
        print(f"{ngram}: {indices}")
    for sentence in sentences:
        relative_order = processor.find_relative_order(sentence, common_ngrams)
        print(f"Relative order in sentence '{sentence}':", relative_order)