File size: 5,811 Bytes
060ac52 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import nltk
import logging
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm
# Set logging to WARNING for minimal console output.
# NOTE: basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
# Module-level logger; the INFO-level calls below are suppressed by the WARNING threshold.
logger = logging.getLogger(__name__)
class NgramProcessor:
    """Find n-grams shared by every sentence in a corpus and map them back to
    word positions in the original (uncleaned) sentences.

    Cleaning lowercases text, strips punctuation, and removes English
    stopwords; matching is performed on the cleaned sentences while the
    reported indices refer to the whitespace-split ORIGINAL sentences.
    """

    def __init__(self):
        # Fetch the stopword corpus lazily so a fresh environment still works.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        """Lowercase *text*, drop punctuation and English stopwords, and
        return the remaining words joined by single spaces."""
        words = re.findall(r'\w+', text.lower())
        return ' '.join(word for word in words if word not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """Return True if *ngram* occurs as a whole-word phrase in every
        sentence.

        Fix: the original used plain substring containment (``ngram in
        sentence``), which produced false positives such as 'art' matching
        inside 'start'. Word-boundary anchors remove them.
        """
        logger.info("Checking exact match for ngram: %s", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = all(pattern.search(sentence) for sentence in sentences)
        logger.info("Exact match result for '%s': %s", ngram, result)
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        """Return True if *ngram* appears as a whole-word sub-phrase of any
        OTHER n-gram already accepted in *common_ngrams*.

        Fix: word-boundary anchors (same rationale as is_exact_match), so
        e.g. 'art' is not considered contained in 'smart move'.
        """
        logger.info("Checking if ngram: %s is substring of any common ngram.", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = any(pattern.search(other_ngram)
                     for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info("Substring check result for '%s': %s", ngram, result)
        return result

    def find_filtered_ngrams(self, sentences):
        """Return ``{original_sentence: {ngram: [(start, end), ...]}}`` for
        every n-gram (n = 4 down to 1) present in ALL sentences.

        Indices are inclusive and refer to whitespace-split positions in the
        ORIGINAL sentence; an n-gram is reported for a sentence only when its
        words are contiguous there (no stopword gap in between).
        """
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]

        # Candidates come from the FIRST cleaned sentence only; is_exact_match
        # then requires each candidate to occur in every cleaned sentence.
        # (The original built n-gram lists for all sentences but only used the
        # first — that wasted work is dropped. It also re-checked candidates
        # for stopwords, which is dead code after remove_stopwords.)
        common_ngrams = []
        for n in (4, 3, 2, 1):  # longest first, so shorter duplicates are pruned
            for ngram in Counter(ngrams(sentences_cleaned[0].split(), n)):
                ngram_str = ' '.join(ngram)
                if (self.is_exact_match(ngram_str, sentences_cleaned)
                        and not self.is_substring_of_any(ngram_str, common_ngrams)):
                    common_ngrams.append(ngram_str)

        result = {}
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()

            # Map each cleaned-word position to its original-word position by
            # walking the original tokens and skipping stopwords.
            # NOTE(review): standalone punctuation tokens (e.g. '.') are not
            # stopwords, so they yield extra map entries past the cleaned
            # length; these are harmless because cleaned positions beyond
            # len(cleaned_words) are never queried by the loop below.
            index_map = {}
            cleaned_idx = 0
            for orig_idx, word in enumerate(original_words):
                if word.lower() not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1

            sentence_result = {}
            for ngram in common_ngrams:
                ngram_words = ngram.split()
                span = len(ngram_words)
                indices = []
                for i in range(len(cleaned_words) - span + 1):
                    if cleaned_words[i:i + span] == ngram_words and i in index_map:
                        start_idx = index_map[i]
                        end_idx = index_map.get(i + span - 1, start_idx)
                        # Accept only spans that are contiguous in the original
                        # sentence (no removed stopword inside the phrase).
                        if end_idx - start_idx == span - 1:
                            indices.append((start_idx, end_idx))
                if indices:
                    sentence_result[ngram] = indices
            result[sentence] = sentence_result
        return result

    def find_relative_order(self, sentence, common_ngrams):
        """Return ``[(rank, ngram), ...]`` ranking *common_ngrams* (1-based)
        by the position of their first case-insensitive occurrence in
        *sentence*; n-grams that never occur are omitted."""
        lowered = sentence.lower()
        occurrences = []
        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = lowered.find(ngram.lower())
            if index != -1:
                occurrences.append((index, ngram))
        return [(rank, ngram)
                for rank, (_, ngram) in enumerate(sorted(occurrences), start=1)]
# Example usage: find n-grams common to all three sentences, then report the
# order in which they appear in each sentence.
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    # Maps each original sentence to {ngram: [(start_idx, end_idx), ...]}.
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    logger.info("Common n-grams and their indices per sentence: %s", common_ngrams)
    for sentence in sentences:
        # Rank this sentence's common n-grams by first occurrence.
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info("Sentence: %s -> Order: %s", sentence, order)
"""
{
'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""
|