File size: 5,811 Bytes
060ac52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import nltk
import logging
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm

# Set logging to WARNING for minimal console output.
# NOTE: the logger.info(...) calls throughout this module are suppressed at
# this level; lower to logging.INFO to see the per-ngram trace.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
    """Find n-grams shared by every sentence of a corpus and map them back to
    word-index spans in the original (un-cleaned) sentences.
    """

    def __init__(self):
        # Fetch the NLTK stopword corpus only if it is not already installed.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        """Lowercase *text*, keep only word-character runs, and drop English
        stopwords. Returns the surviving tokens joined by single spaces.
        """
        words = re.findall(r'\w+', text.lower())
        return ' '.join(word for word in words if word not in self.stop_words)

    def is_exact_match(self, ngram, sentences):
        """Return True iff *ngram* occurs as whole words in every sentence.

        Fix: the previous plain-substring test (``ngram in sentence``) let
        'dog' match inside 'dogma'; a word-boundary regex prevents that.
        """
        # Lazy %-args: the message is only formatted if INFO is enabled.
        logger.info("Checking exact match for ngram: %s", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = all(pattern.search(sentence) for sentence in sentences)
        logger.info("Exact match result for '%s': %s", ngram, result)
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        """Return True iff *ngram* appears as whole words inside a different,
        already-accepted common n-gram (drops redundant sub-n-grams).

        Same word-boundary fix as is_exact_match: 'ox' no longer matches
        inside 'fox'.
        """
        logger.info("Checking if ngram: %s is substring of any common ngram.", ngram)
        pattern = re.compile(r'\b' + re.escape(ngram) + r'\b')
        result = any(pattern.search(other_ngram)
                     for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info("Substring check result for '%s': %s", ngram, result)
        return result

    def find_filtered_ngrams(self, sentences):
        """Return ``{original_sentence: {ngram: [(start, end), ...]}}`` where
        each ngram (built from stopword-cleaned text, n = 4 down to 1) occurs
        in every sentence and (start, end) are word indices into the ORIGINAL
        sentence.
        """
        if not sentences:
            # Previously sentences_cleaned[0] raised IndexError on an empty corpus.
            return {}
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]
        # Longest n first so shorter n-grams nested inside an accepted longer
        # one are rejected by is_substring_of_any.
        common_ngrams = []
        for n in (4, 3, 2, 1):
            # Candidates come from the first sentence only (the old code built
            # n-gram lists for every sentence and then discarded all but [0]);
            # is_exact_match verifies each candidate against the whole corpus.
            # The old per-token stopword check was dead code: cleaned
            # sentences cannot contain stopwords.
            for ngram in Counter(ngrams(sentences_cleaned[0].split(), n)):
                ngram_str = ' '.join(ngram)
                if (self.is_exact_match(ngram_str, sentences_cleaned)
                        and not self.is_substring_of_any(ngram_str, common_ngrams)):
                    common_ngrams.append(ngram_str)

        result = {}
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            result[sentence] = self._map_ngrams_to_original(
                sentence, cleaned_sentence, common_ngrams)
        return result

    def _map_ngrams_to_original(self, sentence, cleaned_sentence, common_ngrams):
        """Locate each common n-gram in *cleaned_sentence* and translate the
        match to (start, end) word indices of the original *sentence*.
        """
        original_words = sentence.split()
        cleaned_words = cleaned_sentence.split()
        # Map cleaned-token index -> original-token index. Each original token
        # is expanded exactly the way remove_stopwords tokenizes it, so
        # punctuation-only tokens (which yield NO cleaned token) and tokens
        # that split into several cleaned tokens stay aligned. The old
        # one-slot-per-original-word mapping drifted after such tokens
        # (e.g. a lone "." consumed a cleaned index it never produced).
        index_map = {}
        cleaned_idx = 0
        for orig_idx, word in enumerate(original_words):
            for piece in re.findall(r'\w+', word.lower()):
                if piece not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1
        sentence_result = {}
        for ngram in common_ngrams:
            ngram_words = ngram.split()
            span = len(ngram_words)
            indices = []
            for i in range(len(cleaned_words) - span + 1):
                if cleaned_words[i:i + span] == ngram_words and i in index_map:
                    start_idx = index_map[i]
                    end_idx = index_map.get(i + span - 1, start_idx)
                    # Accept only spans contiguous in the original sentence
                    # (no stopwords were removed from inside the span).
                    if end_idx - start_idx == span - 1:
                        indices.append((start_idx, end_idx))
            if indices:
                sentence_result[ngram] = indices
        return sentence_result

    def find_relative_order(self, sentence, common_ngrams):
        """Return ``[(rank, ngram), ...]`` with rank starting at 1, ordered by
        each n-gram's first case-insensitive occurrence in *sentence*;
        n-grams not found in the sentence are omitted.
        """
        sentence = sentence.lower()
        relative_order = []
        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = sentence.find(ngram.lower())
            if index != -1:
                relative_order.append((index, ngram))
        sorted_pairs = sorted(relative_order)
        return [(i + 1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]

# Example usage: find the n-grams common to three paraphrases and print the
# per-sentence index spans.
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    # Lazy %-formatting: the f-strings used before were built eagerly even
    # though INFO is suppressed by the WARNING log level.
    logger.info("Common n-grams and their indices per sentence: %s", common_ngrams)
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info("Sentence: %s -> Order: %s", sentence, order)


"""

{
'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, 
'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, 
'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""