import nltk | |
from nltk.corpus import stopwords | |
from nltk.util import ngrams | |
from collections import Counter | |
import re | |
class NgramProcessor:
    """Find n-grams of content words that occur in every sentence of a group.

    Text is normalized by lowercasing it, splitting it into runs of word
    characters, and dropping the words in ``self.stop_words``. All matching
    is done on whole tokens, never on raw character substrings, so an n-gram
    such as ``"fox"`` does not match inside ``"foxes"``.
    """

    # Tried longest first so that shorter n-grams can be filtered against the
    # longer common n-grams that were already accepted.
    NGRAM_LENGTHS = (4, 3, 2, 1)  # Quadgram, trigram, bigram, unigram

    # Compiled once; the same pattern is used for every tokenization.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self):
        """Load NLTK's English stopword list, downloading it when missing."""
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _contains_run(tokens, target):
        """Return True if the list ``target`` occurs contiguously in ``tokens``.

        An empty ``target`` is contained in every token list, mirroring the
        way the empty string is a substring of every string.
        """
        size = len(target)
        return any(
            tokens[start:start + size] == target
            for start in range(len(tokens) - size + 1)
        )

    def _content_tokens(self, text):
        """Return ``(token, word_index)`` pairs for the non-stopword tokens.

        ``word_index`` is the position, in ``text.split()``, of the
        whitespace-delimited word the token was extracted from. A word such
        as ``"dog."`` yields the token ``"dog"``; a word such as
        ``"well-known"`` yields two tokens that share one word index.
        """
        located = []
        for word_index, chunk in enumerate(text.split()):
            for token in self._WORD_RE.findall(chunk.lower()):
                if token not in self.stop_words:
                    located.append((token, word_index))
        return located

    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's stopword list
        Args:
            text (str): Input text
        Returns:
            str: Lowercased text with punctuation and stopwords removed,
                tokens joined by single spaces
        """
        words = self._WORD_RE.findall(text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def is_exact_match(self, ngram, sentences):
        """
        Check if the given n-gram occurs, as whole tokens, in all sentences
        Args:
            ngram (str): The n-gram to search for
            sentences (list): List of sentences to search in
        Returns:
            bool: True if the n-gram's tokens appear contiguously in every
                sentence, False otherwise. The comparison is case-sensitive.
        """
        target = self._WORD_RE.findall(ngram)
        return all(
            self._contains_run(self._WORD_RE.findall(sentence), target)
            for sentence in sentences
        )

    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is contained, token for token, in any
        previously found common n-gram
        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams
        Returns:
            bool: True if ngram is a contiguous token run of another entry of
                common_ngrams, False otherwise. Entries equal to ngram itself
                are ignored.
        """
        target = self._WORD_RE.findall(ngram)
        return any(
            self._contains_run(self._WORD_RE.findall(other_ngram), target)
            for other_ngram in common_ngrams
            if ngram != other_ngram
        )

    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams that occur in every sentence after stopword removal,
        excluding those that are part of larger common n-grams
        Args:
            sentences (list): List of sentences to analyze
        Returns:
            list: List of ``(ngram, indices)`` tuples, longest n-grams first
                and, within one length, in order of first appearance in the
                first sentence. ``indices`` holds one list per input sentence
                of inclusive ``(start, end)`` positions into that sentence's
                ``split()`` words. The span may cover stopwords that were
                skipped between the n-gram's tokens. An empty ``sentences``
                list yields an empty result.
        """
        if not sentences:
            return []

        # Tokenize every sentence once, remembering which original word each
        # content token came from so matches can be mapped back to the input.
        located = [self._content_tokens(sentence) for sentence in sentences]
        token_lists = [[token for token, _ in pairs] for pairs in located]
        word_positions = [[position for _, position in pairs] for pairs in located]

        common_ngrams = []
        first_tokens = token_lists[0]
        for n in self.NGRAM_LENGTHS:
            # Candidates come from the first sentence only: an n-gram shared
            # by all sentences must appear there. dict.fromkeys removes
            # duplicates while keeping first-appearance order.
            candidates = dict.fromkeys(
                tuple(first_tokens[start:start + n])
                for start in range(len(first_tokens) - n + 1)
            )
            for candidate in candidates:
                target = list(candidate)
                if not all(self._contains_run(tokens, target) for tokens in token_lists):
                    continue
                ngram_str = ' '.join(candidate)
                if self.is_substring_of_any(ngram_str, [found for found, _ in common_ngrams]):
                    continue
                indices = []
                for tokens, positions in zip(token_lists, word_positions):
                    indices.append([
                        (positions[start], positions[start + n - 1])
                        for start in range(len(tokens) - n + 1)
                        if tokens[start:start + n] == target
                    ])
                common_ngrams.append((ngram_str, indices))
        return common_ngrams

    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams in the sentence
        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of ``(ngram, indices)`` tuples; only
                the n-gram string of each tuple is used
        Returns:
            list: Sorted list of ``(position, ngram)`` tuples, where position
                is the character offset in ``sentence`` of the first token of
                the n-gram's first occurrence. N-grams that do not occur are
                omitted.
        """
        # Tokenize the raw sentence so offsets refer to the caller's string,
        # then drop stopwords exactly as find_filtered_ngrams does.
        content = [
            (match.group().lower(), match.start())
            for match in self._WORD_RE.finditer(sentence)
        ]
        content = [(token, offset) for token, offset in content
                   if token not in self.stop_words]
        tokens = [token for token, _ in content]

        relative_order = []
        for ngram, _ in common_ngrams:
            # Normalize the n-gram the same way as the sentence.
            target = [
                token for token in self._WORD_RE.findall(ngram.lower())
                if token not in self.stop_words
            ]
            if not target:
                # Nothing left to locate (empty or stopword-only n-gram).
                continue
            size = len(target)
            for start in range(len(tokens) - size + 1):
                if tokens[start:start + size] == target:
                    relative_order.append((content[start][1], ngram))
                    break
        return sorted(relative_order)
# Example usage
if __name__ == "__main__":
    # Demo input: three sentences that share a few content words.
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles.",
    ]

    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)

    # Report every shared n-gram together with its per-sentence index spans.
    print("Common n-grams and their indices:")
    for ngram, indices in common_ngrams:
        print(f"{ngram}: {indices}")

    # Report, for each sentence, the shared n-grams sorted by position.
    for sentence in sentences:
        print(
            f"Relative order in sentence '{sentence}':",
            processor.find_relative_order(sentence, common_ngrams),
        )
# import nltk | |
# from nltk.corpus import stopwords | |
# from nltk.util import ngrams | |
# from collections import Counter | |
# import re | |
# class NgramProcessor: | |
# def __init__(self): | |
# try: | |
# nltk.data.find('corpora/stopwords') | |
# except LookupError: | |
# nltk.download('stopwords') | |
# self.stop_words = set(stopwords.words('english')) | |
# def remove_stopwords(self, text): | |
# """ | |
# Remove stopwords using NLTK's stopword list | |
# Args: | |
# text (str): Input text | |
# Returns: | |
# str: Cleaned text with stopwords removed | |
# """ | |
# words = re.findall(r'\w+', text.lower()) | |
# filtered_words = [word for word in words if word not in self.stop_words] | |
# return ' '.join(filtered_words) | |
# def is_exact_match(self, ngram, sentences): | |
# """ | |
# Check if the given n-gram has an exact match in all sentences | |
# Args: | |
# ngram (str): The n-gram to search for | |
# sentences (list): List of sentences to search in | |
# Returns: | |
# bool: True if n-gram has exact match in all sentences, False otherwise | |
# """ | |
# return all(ngram in sentence for sentence in sentences) | |
# def is_substring_of_any(self, ngram, common_ngrams): | |
# """ | |
# Check if the given n-gram is an exact substring of any previously found common n-grams | |
# Args: | |
# ngram (str): The n-gram to check | |
# common_ngrams (list): List of previously found common n-grams | |
# Returns: | |
# bool: True if ngram is a substring of any common_ngrams, False otherwise | |
# """ | |
# return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram) | |
# def find_filtered_ngrams(self, sentences): | |
# """ | |
# Find all n-grams that have exact matches across all sentences, | |
# excluding those that are part of larger common n-grams | |
# Args: | |
# sentences (list): List of sentences to analyze | |
# Returns: | |
# list: List of all common n-grams in order of their appearance in the first sentence | |
# """ | |
# sentences = [self.remove_stopwords(sentence) for sentence in sentences] | |
# ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram | |
# common_ngrams = [] | |
# for n in ngram_lengths: | |
# ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences] | |
# ngrams_counter = Counter(ngrams_list[0]) | |
# for ngram in ngrams_counter: | |
# ngram_str = ' '.join(ngram) | |
# if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams): | |
# common_ngrams.append(ngram_str) | |
# return common_ngrams | |
# def find_relative_order(self, sentence, common_ngrams): | |
# """ | |
# Find the relative order of the common n-grams in the sentence | |
# Args: | |
# sentence (str): Sentence in which to find the relative order | |
# common_ngrams (list): List of common n-grams | |
# Returns: | |
# list: List of tuples with the relative position and the n-gram | |
# """ | |
# relative_order = [] | |
# for ngram in common_ngrams: | |
# index = sentence.find(ngram) | |
# if index != -1: | |
# relative_order.append((index, ngram)) | |
# return sorted(relative_order) | |
# # Example usage | |
# if __name__ == "__main__": | |
# sentences = [ | |
# "The quick brown fox jumps over the lazy dog.", | |
# "A quick brown dog outpaces a lazy fox.", | |
# "Quick brown animals leap over lazy obstacles." | |
# ] | |
# processor = NgramProcessor() | |
# common_ngrams = processor.find_filtered_ngrams(sentences) | |
# print("Common n-grams:", common_ngrams) | |
# for sentence in sentences: | |
# relative_order = processor.find_relative_order(sentence, common_ngrams) | |
# print(f"Relative order in sentence '{sentence}':", relative_order) | |