import logging
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from tqdm import tqdm

logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
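    """Find n-grams shared by every sentence in a set and map them back to
    word positions in the original sentences."""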
    def __init__(self):
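        """Make sure the NLTK stopword corpus is available and cache the English stopwords."""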
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
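        """Lowercase the text, keep only word tokens, and drop English stopwords."""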
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def is_exact_match(self, ngram, sentences):
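        """Return True if the n-gram string occurs as a substring of every sentence."""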
        logger.info(f"Checking exact match for ngram: {ngram}")
        result = all(ngram in sentence for sentence in sentences)
        logger.info(f"Exact match result for '{ngram}': {result}")
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
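        """Return True if the n-gram is a substring of any other common n-gram."""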
        logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.")
        result = any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info(f"Substring check result for '{ngram}': {result}")
        return result

    def find_filtered_ngrams(self, sentences):
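        """Return a dict mapping each original sentence to its common n-grams and
        their (start, end) word indices within that sentence."""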
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]
        ngram_lengths = [4, 3, 2, 1]
        common_ngrams = []
        result = {}

        # Candidate n-grams are taken from the first cleaned sentence, longest first,
        # and kept only if they occur in every cleaned sentence and are not already
        # covered by a previously found common n-gram.
        for n in ngram_lengths:
            ngrams_counter = Counter(ngrams(sentences_cleaned[0].split(), n))
            for ngram in ngrams_counter:
                ngram_str = ' '.join(ngram)
                # Defensive check: cleaned sentences should already contain no stopwords.
                if any(word in self.stop_words for word in ngram_str.split()):
                    continue
                if self.is_exact_match(ngram_str, sentences_cleaned) and not self.is_substring_of_any(ngram_str, common_ngrams):
                    common_ngrams.append(ngram_str)

        # Map each common n-gram back to (start, end) word indices in the original sentences.
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            sentence_result = {}
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()

            # Map positions in the cleaned sentence to positions in the original,
            # whitespace-tokenized sentence (stopwords are skipped).
            index_map = {}
            cleaned_idx = 0
            for orig_idx, word in enumerate(original_words):
                if word.lower() not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1

            for ngram in common_ngrams:
                ngram_words = ngram.split()
                indices = []
                for i in range(len(cleaned_words) - len(ngram_words) + 1):
                    if cleaned_words[i:i + len(ngram_words)] == ngram_words:
                        if i in index_map:
                            start_idx = index_map[i]
                            end_idx = index_map.get(i + len(ngram_words) - 1, start_idx)
                            # Keep the match only if the n-gram is contiguous in the original sentence.
                            if end_idx - start_idx == len(ngram_words) - 1:
                                indices.append((start_idx, end_idx))

                if indices:
                    sentence_result[ngram] = indices
            result[sentence] = sentence_result
        return result

    def find_relative_order(self, sentence, common_ngrams):
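        """Return the common n-grams that occur in the sentence, numbered by order of first appearance."""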
        sentence = sentence.lower()
        relative_order = []

        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = sentence.find(ngram.lower())
            if index != -1:
                relative_order.append((index, ngram))

        # Sort by position of first occurrence and renumber starting at 1.
        sorted_pairs = sorted(relative_order)
        return [(i + 1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]


if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)

    logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info(f"Sentence: {sentence} -> Order: {order}")

"""
Expected output (indices are word positions in the original, whitespace-tokenized sentences):

{
    'The quick brown fox jumps over the lazy dog .': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
}
"""