# utils/non_melting_point.py
# N-gram "non-melting point" utilities for the AI text watermarking pipeline:
# finds n-grams common to every paraphrase of a sentence and maps them back
# to word positions in each original sentence.
import nltk
import logging
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm
# Set logging to WARNING for minimal console output.
# NOTE: the methods below log at INFO level, so those messages are suppressed
# unless a caller lowers the level.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
# Module-level logger shared by NgramProcessor and the demo in __main__.
logger = logging.getLogger(__name__)
class NgramProcessor:
    """Find n-grams shared by every sentence in a set ("non-melting points")
    and map them back to word positions in the original sentences."""

    def __init__(self):
        """Load the NLTK English stopword list, downloading it if missing."""
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        """Lowercase *text*, tokenize on word characters (\\w+), drop
        stopwords, and return the survivors as a space-joined string.

        Note that punctuation-only tokens are discarded by the regex.
        """
        words = re.findall(r'\w+', text.lower())
        return ' '.join(word for word in words if word not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """Return True iff *ngram* occurs in every sentence string.

        NOTE(review): despite the name this is a plain substring test, so
        'dog' also matches inside 'dogs' — confirm that is acceptable.
        """
        logger.info("Checking exact match for ngram: %s", ngram)
        result = all(ngram in sentence for sentence in sentences)
        logger.info("Exact match result for '%s': %s", ngram, result)
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        """Return True iff *ngram* is a proper substring of some other
        n-gram already collected in *common_ngrams*."""
        logger.info("Checking if ngram: %s is substring of any common ngram.", ngram)
        result = any(ngram in other for other in common_ngrams if ngram != other)
        logger.info("Substring check result for '%s': %s", ngram, result)
        return result

    def find_filtered_ngrams(self, sentences):
        """Find n-grams (n = 4 down to 1) common to all sentences and locate
        them in each original sentence.

        Candidates are drawn from the first cleaned sentence only (an n-gram
        absent there cannot be common to all); each candidate is kept if it
        occurs in every cleaned sentence and is not a substring of a longer
        n-gram already kept.

        Returns a dict mapping each original sentence to
        ``{ngram: [(start_idx, end_idx), ...]}`` where indices are positions
        in the whitespace-split original sentence. Returns {} for no input.
        """
        if not sentences:
            return {}
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]

        common_ngrams = []
        for n in (4, 3, 2, 1):
            # Candidates come from the first sentence only; the other
            # sentences are checked inside is_exact_match.
            candidate_counts = Counter(ngrams(sentences_cleaned[0].split(), n))
            for candidate in candidate_counts:
                ngram_str = ' '.join(candidate)
                # Defensive: cleaned sentences should already be stopword-free.
                if any(word in self.stop_words for word in ngram_str.split()):
                    continue
                if (self.is_exact_match(ngram_str, sentences_cleaned)
                        and not self.is_substring_of_any(ngram_str, common_ngrams)):
                    common_ngrams.append(ngram_str)

        result = {}
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            sentence_result = {}
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()
            # Map cleaned-word positions back to original-word positions.
            # NOTE(review): this replays only the stopword filter, not the
            # \w+ tokenization of remove_stopwords, so punctuation-only
            # tokens in the original can shift the mapping — confirm inputs
            # are space-tokenized upstream.
            index_map = {}
            cleaned_idx = 0
            for orig_idx, word in enumerate(original_words):
                if word.lower() not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1
            for ngram in common_ngrams:
                ngram_words = ngram.split()
                span = len(ngram_words)
                indices = []
                for i in range(len(cleaned_words) - span + 1):
                    if cleaned_words[i:i + span] == ngram_words:
                        if i in index_map:
                            start_idx = index_map[i]
                            end_idx = index_map.get(i + span - 1, start_idx)
                            # Keep only spans that are contiguous in the
                            # original sentence (no stopwords in between).
                            if end_idx - start_idx == span - 1:
                                indices.append((start_idx, end_idx))
                if indices:
                    sentence_result[ngram] = indices
            result[sentence] = sentence_result
        return result

    def find_relative_order(self, sentence, common_ngrams):
        """Return ``[(rank, ngram), ...]`` where rank 1 is the n-gram that
        appears earliest in *sentence* (case-insensitive search); n-grams
        not found in the sentence are omitted."""
        lowered = sentence.lower()
        positions = []
        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = lowered.find(ngram.lower())
            if index != -1:
                positions.append((index, ngram))
        return [(rank, ngram)
                for rank, (_, ngram) in enumerate(sorted(positions), start=1)]
# Example usage
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    # Maps each sentence to its common n-grams and their word-index spans.
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    # NOTE: logging is configured at WARNING above, so these INFO messages
    # are suppressed unless the level is lowered.
    logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
    for sentence in sentences:
        # Iterating the per-sentence dict yields its n-gram keys.
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info(f"Sentence: {sentence} -> Order: {order}")
"""
{
'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""