import logging
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from tqdm import tqdm

logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
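    """Find n-grams shared by every sentence in a set and map them back to
    word positions in the original sentences."""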
    def __init__(self):
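        """Make sure the NLTK stopword corpus is available and cache the English stopwords."""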
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
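        """Lowercase the text, keep only word tokens, and drop English stopwords."""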
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def is_exact_match(self, ngram, sentences):
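        """Return True if the n-gram string occurs as a substring of every sentence."""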
        logger.info(f"Checking exact match for ngram: {ngram}")
        result = all(ngram in sentence for sentence in sentences)
        logger.info(f"Exact match result for '{ngram}': {result}")
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
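        """Return True if the n-gram is a substring of any other common n-gram."""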
        logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.")
        result = any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info(f"Substring check result for '{ngram}': {result}")
        return result

    def find_filtered_ngrams(self, sentences):
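        """Return a dict mapping each original sentence to its common n-grams and
        their (start, end) word indices within that sentence."""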
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]
        ngram_lengths = [4, 3, 2, 1]
        common_ngrams = []
        result = {}

        # Candidate n-grams are taken from the first cleaned sentence, longest first,
        # and kept only if they occur in every cleaned sentence and are not already
        # covered by a previously found common n-gram.
        for n in ngram_lengths:
            ngrams_counter = Counter(ngrams(sentences_cleaned[0].split(), n))
            for ngram in ngrams_counter:
                ngram_str = ' '.join(ngram)
                # Defensive check: cleaned sentences should already contain no stopwords.
                if any(word in self.stop_words for word in ngram_str.split()):
                    continue
                if self.is_exact_match(ngram_str, sentences_cleaned) and not self.is_substring_of_any(ngram_str, common_ngrams):
                    common_ngrams.append(ngram_str)

        # Map each common n-gram back to (start, end) word indices in the original sentences.
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            sentence_result = {}
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()

            # Map positions in the cleaned sentence to positions in the original,
            # whitespace-tokenized sentence (stopwords are skipped).
            index_map = {}
            cleaned_idx = 0
            for orig_idx, word in enumerate(original_words):
                if word.lower() not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1

            for ngram in common_ngrams:
                ngram_words = ngram.split()
                indices = []
                for i in range(len(cleaned_words) - len(ngram_words) + 1):
                    if cleaned_words[i:i + len(ngram_words)] == ngram_words:
                        if i in index_map:
                            start_idx = index_map[i]
                            end_idx = index_map.get(i + len(ngram_words) - 1, start_idx)
                            # Keep the match only if the n-gram is contiguous in the original sentence.
                            if end_idx - start_idx == len(ngram_words) - 1:
                                indices.append((start_idx, end_idx))

                if indices:
                    sentence_result[ngram] = indices
            result[sentence] = sentence_result
        return result

    def find_relative_order(self, sentence, common_ngrams):
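        """Return the common n-grams that occur in the sentence, numbered by order of first appearance."""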
        sentence = sentence.lower()
        relative_order = []

        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = sentence.find(ngram.lower())
            if index != -1:
                relative_order.append((index, ngram))

        # Sort by position of first occurrence and renumber starting at 1.
        sorted_pairs = sorted(relative_order)
        return [(i + 1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]


if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)

    logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info(f"Sentence: {sentence} -> Order: {order}")

"""
Expected output (indices are word positions in the original, whitespace-tokenized sentences):

{
    'The quick brown fox jumps over the lazy dog .': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
}
"""