"""Find the n-grams that every sentence in a collection has in common.

Sentences are lower-cased, split into word tokens and stripped of NLTK's
English stop words before their n-grams are compared.
"""

import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

# Single word tokeniser shared by every method so they all agree on what a
# token is.
_WORD_RE = re.compile(r'\w+')


def _find_token_runs(tokens, target):
    """Return every start position at which ``target`` occurs in ``tokens``.

    Both arguments are lists of strings; the match is contiguous and
    token-exact, so "row" never matches inside "brown".
    """
    width = len(target)
    return [
        start
        for start in range(len(tokens) - width + 1)
        if tokens[start:start + width] == target
    ]


class NgramProcessor:
    """Locate n-grams (length 4 down to 1) shared by a set of sentences."""

    def __init__(self):
        # Download the stop-word corpus only when it is not installed yet.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's stopword list

        Args:
            text (str): Input text

        Returns:
            str: Lower-cased word tokens of ``text`` without stopwords,
                joined by single spaces
        """
        words = _WORD_RE.findall(text.lower())
        return ' '.join(word for word in words if word not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """
        Check if the given n-gram occurs in all sentences

        The comparison is case-sensitive and made on whole word tokens, not
        on raw characters, so "cat" does not match inside "category".

        Args:
            ngram (str): The n-gram to search for
            sentences (list): List of sentences to search in

        Returns:
            bool: True if the n-gram occurs as a contiguous token run in
                every sentence, False otherwise
        """
        target = _WORD_RE.findall(ngram)
        return all(
            _find_token_runs(_WORD_RE.findall(sentence), target)
            for sentence in sentences
        )

    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is contained in any previously found
        common n-gram

        Containment is decided on whole word tokens; an n-gram is never
        considered to be contained in an identical n-gram.

        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams

        Returns:
            bool: True if ngram is part of any other entry of common_ngrams,
                False otherwise
        """
        target = _WORD_RE.findall(ngram)
        return any(
            _find_token_runs(_WORD_RE.findall(other_ngram), target)
            for other_ngram in common_ngrams
            if ngram != other_ngram
        )

    def _content_tokens(self, sentence):
        """Return ``(token, word_index)`` pairs for the non-stopword tokens.

        ``word_index`` is the position of the whitespace-separated word the
        token came from, so results can be reported against the original
        sentence even though punctuation and stopwords were dropped.
        """
        pairs = []
        for word_index, word in enumerate(sentence.split()):
            for token in _WORD_RE.findall(word.lower()):
                if token not in self.stop_words:
                    pairs.append((token, word_index))
        return pairs

    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams that occur in every sentence, excluding those that
        are part of larger common n-grams

        Args:
            sentences (list): List of sentences to analyze

        Returns:
            list: List of ``(ngram, indices)`` tuples. ``indices`` holds one
                list per input sentence; each entry is a
                ``(first_word, last_word)`` pair of positions in that
                sentence's whitespace-split words. An empty input yields an
                empty list.
        """
        if not sentences:
            return []

        cleaned = [self.remove_stopwords(sentence) for sentence in sentences]
        indexed = [self._content_tokens(sentence) for sentence in sentences]
        token_lists = [[token for token, _ in pairs] for pairs in indexed]

        # Candidates only ever come from the first sentence: an n-gram that
        # is missing there cannot be common to all sentences.
        first_tokens = cleaned[0].split()

        common_ngrams = []
        found = []  # n-gram strings accepted so far, longest first
        for n in (4, 3, 2, 1):  # Quadgram, trigram, bigram, unigram
            # Counter de-duplicates while keeping first-appearance order.
            for ngram in Counter(ngrams(first_tokens, n)):
                ngram_str = ' '.join(ngram)
                if not self.is_exact_match(ngram_str, cleaned):
                    continue
                if self.is_substring_of_any(ngram_str, found):
                    continue

                target = list(ngram)
                indices = [
                    [
                        (pairs[start][1], pairs[start + n - 1][1])
                        for start in _find_token_runs(tokens, target)
                    ]
                    for pairs, tokens in zip(indexed, token_lists)
                ]
                common_ngrams.append((ngram_str, indices))
                found.append(ngram_str)
        return common_ngrams

    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams in the sentence

        Matching ignores case, punctuation and stopwords, mirroring how the
        n-grams were extracted.

        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of ``(ngram, indices)`` tuples as
                returned by ``find_filtered_ngrams``

        Returns:
            list: Sorted list of ``(offset, ngram)`` tuples, where ``offset``
                is the character position of the first occurrence of the
                n-gram in the lower-cased sentence. N-grams that do not occur
                are omitted.
        """
        tokens = []
        offsets = []
        for match in _WORD_RE.finditer(sentence.lower()):
            if match.group() not in self.stop_words:
                tokens.append(match.group())
                offsets.append(match.start())

        relative_order = []
        for ngram, _ in common_ngrams:
            target = ngram.split()
            if not target:
                # Same answer str.find gives for an empty needle.
                relative_order.append((0, ngram))
                continue
            starts = _find_token_runs(tokens, target)
            if starts:
                relative_order.append((offsets[starts[0]], ngram))
        return sorted(relative_order)


def main():
    """Run the example: print common n-grams and their order per sentence."""
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles.",
    ]

    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print("Common n-grams and their indices:")
    for ngram, indices in common_ngrams:
        print(f"{ngram}: {indices}")

    for sentence in sentences:
        relative_order = processor.find_relative_order(sentence, common_ngrams)
        print(f"Relative order in sentence '{sentence}':", relative_order)


# Example usage
if __name__ == "__main__":
    main()