Spaces:

ss567uhg
/

nlpWeb

Sleeping

App Files Files Community

Sophia Koehler committed on Nov 11, 2024

Commit

2fa43bc

1 Parent(s): b91726c

fix3

Browse files

Files changed (2) hide show

app.py +270 -64
nlp4web-codebase +1 -0

app.py CHANGED Viewed

@@ -1,49 +1,50 @@
 # -*- coding: utf-8 -*-
 from dataclasses import dataclass
-import os
 import pickle
-from typing import List, Dict, Optional, Type, TypeVar, TypedDict
-import re
-import math
 from collections import Counter
-import gradio as gr
 import nltk
-from nlp4web_codebase.ir.data_loaders.dm import Document
-from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
-from nlp4web_codebase.ir.models import BaseRetriever
 from nltk.corpus import stopwords as nltk_stopwords
-# Check nltk stopwords data
-try:
-    nltk.data.find("corpora/stopwords")
-except LookupError:
-    nltk.download("stopwords", quiet=True)
-# Tokenization and helper functions
 LANGUAGE = "english"
-stopwords = set(nltk_stopwords.words(LANGUAGE))
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 def simple_tokenize(text: str) -> List[str]:
-    words = word_splitter(text.lower())
-    tokenized = [word for word in words if word not in stopwords]
     return tokenized
 @dataclass
 class PostingList:
-    term: str
-    docid_postings: List[int]
-    tweight_postings: List[float]
-T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class InvertedIndex:
-    posting_lists: List[PostingList]
     vocab: Dict[str, int]
-    cid2docid: Dict[str, int]
-    collection_ids: List[str]
-    doc_texts: Optional[List[str]] = None
     def save(self, output_dir: str) -> None:
         os.makedirs(output_dir, exist_ok=True)
@@ -52,28 +53,138 @@ class InvertedIndex:
     @classmethod
     def from_saved(cls: Type[T], saved_dir: str) -> T:
         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
-            return pickle.load(f)
 @dataclass
 class BM25Index(InvertedIndex):
     @staticmethod
     def cache_term_weights(
-        posting_lists: List[PostingList], total_docs: int, avgdl: float, dfs: List[int], dls: List[int], k1: float, b: float,
     ) -> None:
         N = total_docs
-        for tid, posting_list in enumerate(posting_lists):
             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
-            for i, docid in enumerate(posting_list.docid_postings):
                 tf = posting_list.tweight_postings[i]
                 dl = dls[docid]
-                posting_list.tweight_postings[i] = BM25Index.calc_regularized_tf(
                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
-                ) * idf
     @staticmethod
-    def calc_regularized_tf(tf: int, dl: float, avgdl: float, k1: float, b: float) -> float:
         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
     @staticmethod
@@ -82,54 +193,149 @@ class BM25Index(InvertedIndex):
     @classmethod
     def build_from_documents(
-        cls: Type["BM25Index"], documents: List[Document], avgdl: float, total_docs: int, k1: float = 0.9, b: float = 0.4
-    ) -> "BM25Index":
-        # Assume run_counting() is defined to return counting object with relevant data
-        counting = run_counting(documents, simple_tokenize)
-        BM25Index.cache_term_weights(counting.posting_lists, total_docs, avgdl, counting.dfs, counting.dls, k1, b)
-        return cls(counting.posting_lists, counting.vocab, counting.cid2docid, counting.collection_ids, counting.doc_texts)
-class BM25Retriever(BaseRetriever):
     def __init__(self, index_dir: str) -> None:
-        self.index = BM25Index.from_saved(index_dir)
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
-        toks = simple_tokenize(query)
-        docid2score = Counter()
         for tok in toks:
-            if tok in self.index.vocab:
-                tid = self.index.vocab[tok]
-                posting_list = self.index.posting_lists[tid]
-                for docid, weight in zip(posting_list.docid_postings, posting_list.tweight_postings):
-                    docid2score[docid] += weight
         return {
-            self.index.collection_ids[docid]: score for docid, score in docid2score.most_common(topk)
         }
-# Gradio app setup
 class Hit(TypedDict):
-    cid: str
-    score: float
-    text: str
 def search_sciq(query: str) -> List[Hit]:
     results = bm25_retriever.retrieve(query)
-    hits = []
     for cid, score in results.items():
-        docid = bm25_retriever.index.cid2docid[cid]
-        text = bm25_retriever.index.doc_texts[docid]
-        hits.append(Hit(cid=cid, score=score, text=text))
-    return hits
-bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
 demo = gr.Interface(
     fn=search_sciq,
     inputs="textbox",
-    outputs="json",
     description="BM25 Search Engine Demo on SciQ Dataset"
 )
-if __name__ == "__main__":
-    demo.launch()

 # -*- coding: utf-8 -*-
 from dataclasses import dataclass
 import pickle
+import os
+from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
+from nlp4web_codebase.ir.data_loaders.dm import Document
 from collections import Counter
+import tqdm
+import re
 import nltk
+nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
+stopwords = set(nltk_stopwords.words(LANGUAGE))
+def word_splitting(text: str) -> List[str]:
+    """Lowercase ``text`` and return the word tokens matched by ``word_splitter``."""
+    lowered = text.lower()
+    return word_splitter(lowered)
+def lemmatization(words: List[str]) -> List[str]:
+    """Return ``words`` unchanged; lemmatization is deliberately skipped for simplicity."""
+    return words
 def simple_tokenize(text: str) -> List[str]:
+    """Tokenize ``text``: split into lowercased words, drop stopwords, then lemmatize.
+
+    The lemmatization step currently returns its input unchanged.
+    """
+    words = word_splitting(text)
+    tokenized = list(filter(lambda w: w not in stopwords, words))
+    tokenized = lemmatization(tokenized)
     return tokenized
+T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class PostingList:
+    """Postings of a single term, stored as two parallel lists (docids and term weights)."""
+    term: str  # The term
+    docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
+    tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
 class InvertedIndex:
+    # Inverted index: vocabulary, per-term posting lists and the docid <-> collection_id mappings.
+    posting_lists: List[PostingList]  # tid -> posting_list
     vocab: Dict[str, int]  # term -> tid
+    cid2docid: Dict[str, int]  # collection_id -> docid
+    collection_ids: List[str]  # docid -> collection_id
+    doc_texts: Optional[List[str]] = None  # docid -> document text
     def save(self, output_dir: str) -> None:
+        # Ensure the target directory exists; the remainder of this method lies outside the shown hunk.
         os.makedirs(output_dir, exist_ok=True)
     @classmethod
     def from_saved(cls: Type[T], saved_dir: str) -> T:
+        """Load the index pickled at ``<saved_dir>/index.pkl``.
+
+        Args:
+            saved_dir: Directory that contains ``index.pkl``.
+
+        Returns:
+            The object unpickled from ``index.pkl``.
+
+        Raises:
+            FileNotFoundError: If ``index.pkl`` does not exist in ``saved_dir``.
+        """
+        # No placeholder instance is constructed first: it was overwritten
+        # immediately and would fail for subclasses with other required fields.
         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+            # NOTE(security): pickle.load can execute arbitrary code; only load
+            # index files that come from a trusted source.
+            return pickle.load(f)
+# The output of the counting function:
+@dataclass
+class Counting:
+    """Raw corpus statistics produced by ``run_counting``."""
+    posting_lists: List[PostingList]  # tid -> posting list holding raw term frequencies
+    vocab: Dict[str, int]  # term -> tid
+    cid2docid: Dict[str, int]  # collection_id -> docid
+    collection_ids: List[str]  # docid -> collection_id
+    dfs: List[int]  # tid -> df
+    dls: List[int]  # docid -> doc length
+    avgdl: float  # mean of dls
+    nterms: int  # sum of all term frequencies over all counted documents
+    doc_texts: Optional[List[str]] = None  # docid -> document text, or None when texts are not stored
+def run_counting(
+    documents: Iterable[Document],
+    tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
+    store_raw: bool = True,  # store the document text in doc_texts
+    ndocs: Optional[int] = None,
+    show_progress_bar: bool = True,
+) -> Counting:
+    """Count TFs, DFs and document lengths over ``documents``.
+
+    Documents whose ``collection_id`` has already been seen are skipped.
+
+    Args:
+        documents: Documents providing ``collection_id`` and ``text``.
+        tokenize_fn: Maps a document text to its list of tokens.
+        store_raw: Keep every document text in ``doc_texts``; when False,
+            ``doc_texts`` is None.
+        ndocs: Expected number of documents; only used as the progress-bar total.
+        show_progress_bar: Whether to display the tqdm progress bar.
+
+    Returns:
+        A ``Counting`` whose posting lists hold raw term frequencies.
+    """
+    posting_lists: List[PostingList] = []
+    vocab: Dict[str, int] = {}
+    cid2docid: Dict[str, int] = {}
+    collection_ids: List[str] = []
+    dfs: List[int] = []  # tid -> df
+    dls: List[int] = []  # docid -> doc length
+    nterms: int = 0
+    # Decided once up front so that an empty corpus with store_raw=False
+    # also yields None instead of an empty list.
+    doc_texts: Optional[List[str]] = [] if store_raw else None
+    for doc in tqdm.tqdm(
+        documents,
+        desc="Counting",
+        total=ndocs,
+        disable=not show_progress_bar,
+    ):
+        if doc.collection_id in cid2docid:
+            continue
+        docid = len(collection_ids)
+        cid2docid[doc.collection_id] = docid
+        collection_ids.append(doc.collection_id)
+        tok2tf = Counter(tokenize_fn(doc.text))
+        doc_length = sum(tok2tf.values())
+        dls.append(doc_length)
+        nterms += doc_length
+        for tok, tf in tok2tf.items():
+            tid = vocab.get(tok)
+            if tid is None:
+                # First sighting of this term: allocate its tid, posting list
+                # and document-frequency slot together so they stay aligned.
+                tid = len(vocab)
+                vocab[tok] = tid
+                posting_lists.append(
+                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
+                )
+                dfs.append(0)
+            posting_lists[tid].docid_postings.append(docid)
+            posting_lists[tid].tweight_postings.append(tf)
+            # Every document containing the term counts towards its df,
+            # including the first one (previously recorded as 0).
+            dfs[tid] += 1
+        if doc_texts is not None:
+            doc_texts.append(doc.text)
+    return Counting(
+        posting_lists=posting_lists,
+        vocab=vocab,
+        cid2docid=cid2docid,
+        collection_ids=collection_ids,
+        dfs=dfs,
+        dls=dls,
+        # An empty corpus has no meaningful average length; avoid dividing by zero.
+        avgdl=sum(dls) / len(dls) if dls else 0.0,
+        nterms=nterms,
+        doc_texts=doc_texts,
+    )
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+# Load the dataset via load_sciq() and count term statistics over its corpus at import time.
+sciq = load_sciq()
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+"""### BM25 Index"""
+# NOTE(review): Python only accepts `from __future__` imports at the very top of a
+# module (after the module docstring and comments). At this position, after other
+# statements, the file fails to compile with a SyntaxError; this line needs to move
+# above the first import of the file.
+from __future__ import annotations
+from dataclasses import asdict, dataclass
+import math
+import os
+from typing import Iterable, List, Optional, Type
+import tqdm
+from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
 class BM25Index(InvertedIndex):
+    @staticmethod
+    def tokenize(text: str) -> List[str]:
+        """Tokenize ``text`` with the module-level ``simple_tokenize``."""
+        return simple_tokenize(text)
     @staticmethod
     def cache_term_weights(
+        posting_lists: List[PostingList],
+        total_docs: int,
+        avgdl: float,
+        dfs: List[int],
+        dls: List[int],
+        k1: float,
+        b: float,
     ) -> None:
+        """Replace the raw TFs stored in ``posting_lists`` with cached term weights, in place.
+
+        Each posting's weight becomes ``calc_regularized_tf(...) * calc_idf(...)``.
+
+        Args:
+            posting_lists: Posting lists indexed by tid; their ``tweight_postings`` are overwritten.
+            total_docs: Number of documents, passed to ``calc_idf`` as ``N``.
+            avgdl: Average document length.
+            dfs: Document frequency per tid.
+            dls: Document length per docid.
+            k1: Passed through to ``calc_regularized_tf``.
+            b: Passed through to ``calc_regularized_tf``.
+        """
         N = total_docs
+        for tid, posting_list in enumerate(
+            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+        ):
             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
+            for i in range(len(posting_list.docid_postings)):
+                docid = posting_list.docid_postings[i]
                 tf = posting_list.tweight_postings[i]
                 dl = dls[docid]
+                regularized_tf = BM25Index.calc_regularized_tf(
                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                )
+                # Overwrite the raw TF with the final weight so retrieval only has to sum weights.
+                posting_list.tweight_postings[i] = regularized_tf * idf
     @staticmethod
+    def calc_regularized_tf(
+        tf: int, dl: float, avgdl: float, k1: float, b: float
+    ) -> float:
+        """Return ``tf / (tf + k1 * (1 - b + b * dl / avgdl))``.
+
+        ``dl / avgdl`` scales the term frequency by the document length relative to the average.
+        """
         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
     @staticmethod
     @classmethod
     def build_from_documents(
+        cls: Type[BM25Index],
+        documents: Iterable[Document],
+        store_raw: bool = True,
+        output_dir: Optional[str] = None,
+        ndocs: Optional[int] = None,
+        show_progress_bar: bool = True,
+        k1: float = 0.9,
+        b: float = 0.4,
+    ) -> BM25Index:
+        """Build a ``BM25Index`` from ``documents``.
+
+        Runs ``run_counting`` with ``BM25Index.tokenize``, converts the raw TFs into
+        cached term weights via ``cache_term_weights`` and assembles the index.
+
+        Args:
+            documents: Documents to index.
+            store_raw: Forwarded to ``run_counting``; keeps the document texts when True.
+            output_dir: NOTE(review): accepted but not used in this body - the index is
+                not saved here; callers invoke ``save`` themselves.
+            ndocs: Forwarded to ``run_counting`` as the progress-bar total.
+            show_progress_bar: Forwarded to ``run_counting``.
+            k1: Forwarded to ``cache_term_weights``.
+            b: Forwarded to ``cache_term_weights``.
+
+        Returns:
+            The assembled ``BM25Index``.
+        """
+        # Counting TFs, DFs, doc_lengths, etc.:
+        counting = run_counting(
+            documents=documents,
+            tokenize_fn=BM25Index.tokenize,
+            store_raw=store_raw,
+            ndocs=ndocs,
+            show_progress_bar=show_progress_bar,
+        )
+        # Compute term weights and caching:
+        posting_lists = counting.posting_lists
+        total_docs = len(counting.cid2docid)
+        BM25Index.cache_term_weights(
+            posting_lists=posting_lists,
+            total_docs=total_docs,
+            avgdl=counting.avgdl,
+            dfs=counting.dfs,
+            dls=counting.dls,
+            k1=k1,
+            b=b,
+        )
+        # Assembly and save:
+        index = BM25Index(
+            posting_lists=posting_lists,
+            vocab=counting.vocab,
+            cid2docid=counting.cid2docid,
+            collection_ids=counting.collection_ids,
+            doc_texts=counting.doc_texts,
+        )
+        return index
+"""### BM25 Retriever"""
+from nlp4web_codebase.ir.models import BaseRetriever
+from typing import Type
+from abc import abstractmethod
+class BaseInvertedIndexRetriever(BaseRetriever):
+    """Retriever that scores documents by summing cached term weights from an inverted index.
+
+    Subclasses provide ``index_class``; the index is loaded with its ``from_saved``.
+    NOTE(review): the methods below call ``self.index.tokenize``, so the index class
+    must provide ``tokenize`` (``BM25Index`` does).
+    """
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[InvertedIndex]:
+        """Index class used to load the saved index."""
+        pass
     def __init__(self, index_dir: str) -> None:
+        # Load the previously saved index from ``index_dir``.
+        self.index = self.index_class.from_saved(index_dir)
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        """Return, per query token, its cached weight in the document ``cid``.
+
+        Tokens missing from the vocabulary or from the document are omitted.
+        Raises ``KeyError`` if ``cid`` is not in ``cid2docid``.
+        """
+        toks = self.index.tokenize(query)
+        target_docid = self.index.cid2docid[cid]
+        term_weights = {}
+        for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            posting_list = self.index.posting_lists[tid]
+            # Linear scan of the term's postings for the target document.
+            for docid, tweight in zip(
+                posting_list.docid_postings, posting_list.tweight_postings
+            ):
+                if docid == target_docid:
+                    term_weights[tok] = tweight
+                    break
+        return term_weights
+    def score(self, query: str, cid: str) -> float:
+        """Return the sum of the term weights from ``get_term_weights`` for ``query`` and ``cid``."""
+        return sum(self.get_term_weights(query=query, cid=cid).values())
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        """Return the ``topk`` highest-scoring documents as ``collection_id -> score``.
+
+        The returned dict is ordered by descending score.
+        """
+        toks = self.index.tokenize(query)
+        docid2score: Dict[int, float] = {}
         for tok in toks:
+            if tok not in self.index.vocab:
+                continue
+            tid = self.index.vocab[tok]
+            posting_list = self.index.posting_lists[tid]
+            # Accumulate this token's weight into every document that contains it.
+            for docid, tweight in zip(
+                posting_list.docid_postings, posting_list.tweight_postings
+            ):
+                docid2score.setdefault(docid, 0)
+                docid2score[docid] += tweight
+        # Keep only the topk entries, highest score first.
+        docid2score = dict(
+            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
+        )
         return {
+            self.index.collection_ids[docid]: score
+            for docid, score in docid2score.items()
         }
+class BM25Retriever(BaseInvertedIndexRetriever):
+    """Inverted-index retriever whose index class is ``BM25Index``."""
+    @property
+    def index_class(self) -> Type[BM25Index]:
+        """Return ``BM25Index``."""
+        return BM25Index
+import gradio as gr
+from typing import TypedDict
 class Hit(TypedDict):
+  """One search result as built by ``search_sciq``."""
+  cid: str  # collection id of the retrieved document
+  score: float  # retrieval score of the document
+  text: str  # stored text of the document
+demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
+return_type = List[Hit]
+## YOUR_CODE_STARTS_HERE
+# Build the BM25 index over the SciQ corpus, save it, and load it back through the retriever.
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    # Derive the progress-bar total from the corpus itself (as the earlier
+    # run_counting call does) instead of hard-coding the corpus size.
+    ndocs=len(sciq.corpus),
+    show_progress_bar=True,
+)
+bm25_index.save("output/bm25_index")
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
 def search_sciq(query: str) -> List[Hit]:
+    """Retrieve documents for ``query`` with ``bm25_retriever`` and return them as ``Hit`` records.
+
+    Each hit carries the collection id, the retrieval score and the stored document text.
+    """
     results = bm25_retriever.retrieve(query)
+    hitlist = []
     for cid, score in results.items():
+        # ``index`` is the integer docid used to look up the stored document text.
+        index = bm25_retriever.index.cid2docid[cid]
+        text = bm25_retriever.index.doc_texts[index]
+        hitlist.append(Hit(cid=cid, score=score, text=text))
+    return hitlist
+# Gradio UI: one text input wired to search_sciq.
+# NOTE(review): search_sciq returns a list of Hit dicts; confirm the "textbox" output
+# renders it as intended (the previous revision used "json").
 demo = gr.Interface(
     fn=search_sciq,
     inputs="textbox",
+    outputs="textbox",
     description="BM25 Search Engine Demo on SciQ Dataset"
 )
+## YOUR_CODE_ENDS_HERE
+# Launched unconditionally at import time (no __main__ guard).
+demo.launch()

nlp4web-codebase ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 83f9afbbf7e372c116fdd04997a96449007f861f