"""Find n-grams (4 words down to 1) common to every sentence in a set,
after lowercasing and English-stopword removal, and map each common
n-gram back to word positions in the original sentences.

NOTE(review): depends on the third-party ``nltk`` and ``tqdm`` packages.
"""

import logging
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from tqdm import tqdm

# Keep console output minimal; per-step messages log at INFO and stay
# hidden unless the level is lowered.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class NgramProcessor:
    """Extract the n-grams that occur in *every* input sentence."""

    # Single tokenization rule for the whole class: the \w+ runs of the
    # lowercased text. Compiled once instead of on every call.
    _WORD_RE = re.compile(r"\w+")

    def __init__(self):
        # Download the stopword corpus only if it is not already present.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        """Return *text* lowercased, tokenized on \\w+ runs, with English
        stopwords removed, re-joined with single spaces."""
        words = self._WORD_RE.findall(text.lower())
        return ' '.join(w for w in words if w not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """True if *ngram* occurs as a substring of every sentence."""
        logger.info(f"Checking exact match for ngram: {ngram}")
        result = all(ngram in sentence for sentence in sentences)
        logger.info(f"Exact match result for '{ngram}': {result}")
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        """True if *ngram* is a proper substring of any already-kept
        (longer) common n-gram."""
        logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.")
        result = any(
            ngram in other_ngram
            for other_ngram in common_ngrams
            if ngram != other_ngram
        )
        logger.info(f"Substring check result for '{ngram}': {result}")
        return result

    def _cleaned_to_original_index_map(self, original_words):
        """Map each cleaned-token index to the index (in *original_words*)
        of the token it came from.

        BUGFIX: the mapping is derived with the same \\w+ tokenization used
        by ``remove_stopwords``, so punctuation-only tokens (e.g. a lone
        ".") no longer consume a cleaned index and shift every later
        mapping, and tokens that split into several \\w+ runs stay aligned.
        """
        index_map = {}
        cleaned_idx = 0
        for orig_idx, word in enumerate(original_words):
            for token in self._WORD_RE.findall(word.lower()):
                if token not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1
        return index_map

    def find_filtered_ngrams(self, sentences):
        """Return ``{sentence: {ngram: [(start, end), ...]}}``.

        Candidate n-grams (lengths 4, 3, 2, 1 — longest first) are taken
        from the first cleaned sentence, kept only if they appear in every
        cleaned sentence and are not substrings of an already-kept longer
        n-gram. ``(start, end)`` are indices into ``sentence.split()`` of
        the original sentence; an occurrence is reported only where the
        n-gram's words are contiguous in the original (no stopword or
        punctuation token interleaved).
        """
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [
            self.remove_stopwords(sentence)
            for sentence in tqdm(sentences, desc="Cleaning Sentences")
        ]

        # Candidates need only come from the first cleaned sentence: an
        # n-gram common to all sentences necessarily occurs in the first.
        # (The original code built n-gram lists for every sentence and then
        # used only the first — pure wasted work.)
        first_tokens = sentences_cleaned[0].split() if sentences_cleaned else []
        common_ngrams = []
        for n in (4, 3, 2, 1):
            for ngram in Counter(ngrams(first_tokens, n)):
                ngram_str = ' '.join(ngram)
                # Cleaned sentences contain no stopwords, so no stopword
                # re-check is needed here.
                if (self.is_exact_match(ngram_str, sentences_cleaned)
                        and not self.is_substring_of_any(ngram_str, common_ngrams)):
                    common_ngrams.append(ngram_str)

        result = {}
        for sentence, cleaned_sentence in tqdm(
            zip(sentences, sentences_cleaned),
            total=len(sentences),
            desc="Mapping N-grams",
        ):
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()
            index_map = self._cleaned_to_original_index_map(original_words)

            sentence_result = {}
            for ngram_str in common_ngrams:
                ngram_words = ngram_str.split()
                span = len(ngram_words)
                indices = []
                for i in range(len(cleaned_words) - span + 1):
                    if cleaned_words[i:i + span] != ngram_words:
                        continue
                    if i not in index_map:
                        continue
                    start_idx = index_map[i]
                    end_idx = index_map.get(i + span - 1, start_idx)
                    # Keep only occurrences contiguous in the original
                    # sentence (no dropped token inside the span).
                    if end_idx - start_idx == span - 1:
                        indices.append((start_idx, end_idx))
                if indices:
                    sentence_result[ngram_str] = indices
            result[sentence] = sentence_result
        return result

    def find_relative_order(self, sentence, common_ngrams):
        """Return ``[(rank, ngram), ...]`` with rank 1 for the n-gram whose
        first (case-insensitive) occurrence in *sentence* is earliest.
        N-grams not found in the sentence are omitted."""
        lowered = sentence.lower()
        positions = []
        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = lowered.find(ngram.lower())
            if index != -1:
                positions.append((index, ngram))
        sorted_pairs = sorted(positions)
        return [(rank, ngram) for rank, (_, ngram) in enumerate(sorted_pairs, start=1)]


# Example usage
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info(f"Sentence: {sentence} -> Order: {order}")

# NOTE(review): sample output kept from the original file for reference;
# the indices shown appear to be cleaned-sentence positions and may not
# match the original-sentence indices the code now reports — verify.
"""
{
'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""