Spaces:

ss567uhg
/

nlpWeb

Sleeping

App Files Files Community

Sophia Koehler committed on Nov 11, 2024

Commit

e39c176

1 Parent(s): e8df6fa

fix2

Browse files

Files changed (1) hide show

app.py +61 -431

app.py CHANGED Viewed

@@ -1,58 +1,49 @@
 # -*- coding: utf-8 -*-
-"""## Pre-requisite code
-The code within this section will be used in the tasks. Please do not change these code lines.
-### SciQ loading and counting
-"""
 from dataclasses import dataclass
-import pickle
 import os
-from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
-from nlp4web_codebase.ir.data_loaders.dm import Document
-from collections import Counter
-import tqdm
 import re
 import nltk
-nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
 LANGUAGE = "english"
-word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 stopwords = set(nltk_stopwords.words(LANGUAGE))
-def word_splitting(text: str) -> List[str]:
-    return word_splitter(text.lower())
-def lemmatization(words: List[str]) -> List[str]:
-    return words  # We ignore lemmatization here for simplicity
 def simple_tokenize(text: str) -> List[str]:
-    words = word_splitting(text)
-    tokenized = list(filter(lambda w: w not in stopwords, words))
-    tokenized = lemmatization(tokenized)
     return tokenized
-T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class PostingList:
-    term: str  # The term
-    docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
-    tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
 class InvertedIndex:
-    posting_lists: List[PostingList]  # docid -> posting_list
     vocab: Dict[str, int]
-    cid2docid: Dict[str, int]  # collection_id -> docid
-    collection_ids: List[str]  # docid -> collection_id
-    doc_texts: Optional[List[str]] = None  # docid -> document text
     def save(self, output_dir: str) -> None:
         os.makedirs(output_dir, exist_ok=True)
@@ -61,138 +52,28 @@ class InvertedIndex:
     @classmethod
     def from_saved(cls: Type[T], saved_dir: str) -> T:
-        index = cls(
-            posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
-        )
         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
-            index = pickle.load(f)
-        return index
-# The output of the counting function:
-@dataclass
-class Counting:
-    posting_lists: List[PostingList]
-    vocab: Dict[str, int]
-    cid2docid: Dict[str, int]
-    collection_ids: List[str]
-    dfs: List[int]  # tid -> df
-    dls: List[int]  # docid -> doc length
-    avgdl: float
-    nterms: int
-    doc_texts: Optional[List[str]] = None
-def run_counting(
-    documents: Iterable[Document],
-    tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
-    store_raw: bool = True,  # store the document text in doc_texts
-    ndocs: Optional[int] = None,
-    show_progress_bar: bool = True,
-) -> Counting:
-    """Counting TFs, DFs, doc_lengths, etc."""
-    posting_lists: List[PostingList] = []
-    vocab: Dict[str, int] = {}
-    cid2docid: Dict[str, int] = {}
-    collection_ids: List[str] = []
-    dfs: List[int] = []  # tid -> df
-    dls: List[int] = []  # docid -> doc length
-    nterms: int = 0
-    doc_texts: Optional[List[str]] = []
-    for doc in tqdm.tqdm(
-        documents,
-        desc="Counting",
-        total=ndocs,
-        disable=not show_progress_bar,
-    ):
-        if doc.collection_id in cid2docid:
-            continue
-        collection_ids.append(doc.collection_id)
-        docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
-        toks = tokenize_fn(doc.text)
-        tok2tf = Counter(toks)
-        dls.append(sum(tok2tf.values()))
-        for tok, tf in tok2tf.items():
-            nterms += tf
-            tid = vocab.get(tok, None)
-            if tid is None:
-                posting_lists.append(
-                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
-                )
-                tid = vocab.setdefault(tok, len(vocab))
-            posting_lists[tid].docid_postings.append(docid)
-            posting_lists[tid].tweight_postings.append(tf)
-            if tid < len(dfs):
-                dfs[tid] += 1
-            else:
-                dfs.append(0)
-        if store_raw:
-            doc_texts.append(doc.text)
-        else:
-            doc_texts = None
-    return Counting(
-        posting_lists=posting_lists,
-        vocab=vocab,
-        cid2docid=cid2docid,
-        collection_ids=collection_ids,
-        dfs=dfs,
-        dls=dls,
-        avgdl=sum(dls) / len(dls),
-        nterms=nterms,
-        doc_texts=doc_texts,
-    )
-from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
-sciq = load_sciq()
-counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
-"""### BM25 Index"""
-from __future__ import annotations
-from dataclasses import asdict, dataclass
-import math
-import os
-from typing import Iterable, List, Optional, Type
-import tqdm
-from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
 class BM25Index(InvertedIndex):
-    @staticmethod
-    def tokenize(text: str) -> List[str]:
-        return simple_tokenize(text)
     @staticmethod
     def cache_term_weights(
-        posting_lists: List[PostingList],
-        total_docs: int,
-        avgdl: float,
-        dfs: List[int],
-        dls: List[int],
-        k1: float,
-        b: float,
     ) -> None:
-        """Compute term weights and caching"""
         N = total_docs
-        for tid, posting_list in enumerate(
-            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
-        ):
             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
-            for i in range(len(posting_list.docid_postings)):
-                docid = posting_list.docid_postings[i]
                 tf = posting_list.tweight_postings[i]
                 dl = dls[docid]
-                regularized_tf = BM25Index.calc_regularized_tf(
                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
-                )
-                posting_list.tweight_postings[i] = regularized_tf * idf
     @staticmethod
-    def calc_regularized_tf(
-        tf: int, dl: float, avgdl: float, k1: float, b: float
-    ) -> float:
         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
     @staticmethod
@@ -201,305 +82,54 @@ class BM25Index(InvertedIndex):
     @classmethod
     def build_from_documents(
-        cls: Type[BM25Index],
-        documents: Iterable[Document],
-        store_raw: bool = True,
-        output_dir: Optional[str] = None,
-        ndocs: Optional[int] = None,
-        show_progress_bar: bool = True,
-        k1: float = 0.9,
-        b: float = 0.4,
     ) -> BM25Index:
-        # Counting TFs, DFs, doc_lengths, etc.:
-        counting = run_counting(
-            documents=documents,
-            tokenize_fn=BM25Index.tokenize,
-            store_raw=store_raw,
-            ndocs=ndocs,
-            show_progress_bar=show_progress_bar,
-        )
-        # Compute term weights and caching:
-        posting_lists = counting.posting_lists
-        total_docs = len(counting.cid2docid)
-        BM25Index.cache_term_weights(
-            posting_lists=posting_lists,
-            total_docs=total_docs,
-            avgdl=counting.avgdl,
-            dfs=counting.dfs,
-            dls=counting.dls,
-            k1=k1,
-            b=b,
-        )
-        # Assembly and save:
-        index = BM25Index(
-            posting_lists=posting_lists,
-            vocab=counting.vocab,
-            cid2docid=counting.cid2docid,
-            collection_ids=counting.collection_ids,
-            doc_texts=counting.doc_texts,
-        )
-        return index
-bm25_index = BM25Index.build_from_documents(
-    documents=iter(sciq.corpus),
-    ndocs=12160,
-    show_progress_bar=True,
-)
-bm25_index.save("output/bm25_index")
-!ls
-"""### BM25 Retriever"""
-from nlp4web_codebase.ir.models import BaseRetriever
-from typing import Type
-from abc import abstractmethod
-class BaseInvertedIndexRetriever(BaseRetriever):
-    @property
-    @abstractmethod
-    def index_class(self) -> Type[InvertedIndex]:
-        pass
     def __init__(self, index_dir: str) -> None:
-        self.index = self.index_class.from_saved(index_dir)
-    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
-        toks = self.index.tokenize(query)
-        target_docid = self.index.cid2docid[cid]
-        term_weights = {}
-        for tok in toks:
-            if tok not in self.index.vocab:
-                continue
-            tid = self.index.vocab[tok]
-            posting_list = self.index.posting_lists[tid]
-            for docid, tweight in zip(
-                posting_list.docid_postings, posting_list.tweight_postings
-            ):
-                if docid == target_docid:
-                    term_weights[tok] = tweight
-                    break
-        return term_weights
-    def score(self, query: str, cid: str) -> float:
-        return sum(self.get_term_weights(query=query, cid=cid).values())
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
-        toks = self.index.tokenize(query)
-        docid2score: Dict[int, float] = {}
         for tok in toks:
-            if tok not in self.index.vocab:
-                continue
-            tid = self.index.vocab[tok]
-            posting_list = self.index.posting_lists[tid]
-            for docid, tweight in zip(
-                posting_list.docid_postings, posting_list.tweight_postings
-            ):
-                docid2score.setdefault(docid, 0)
-                docid2score[docid] += tweight
-        docid2score = dict(
-            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
-        )
         return {
-            self.index.collection_ids[docid]: score
-            for docid, score in docid2score.items()
         }
-class BM25Retriever(BaseInvertedIndexRetriever):
-    @property
-    def index_class(self) -> Type[BM25Index]:
-        return BM25Index
-bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")
-"""# TASK1: tune b and k1 (4 points)
-Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluation function (`evaluate_map`) is provided. Record the values in `plots_k1` and `plots_b`. Do it in a greedy manner: as the influence from b is larger, please first tune b (with k1 fixed to the default value 0.9) and use the best value of b to further tune k1.
-$${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$
-"""
-from nlp4web_codebase.ir.data_loaders import Split
-import pytrec_eval
-def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
-  metric = "map_cut_10"
-  qrels = sciq.get_qrels_dict(split)
-  evaluator = pytrec_eval.RelevanceEvaluator(sciq.get_qrels_dict(split), (metric,))
-  qps = evaluator.evaluate(rankings)
-  return float(np.mean([qp[metric] for qp in qps.values()]))
-"""Example of using the pre-requisite code:"""
-# Loading dataset:
-from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
-sciq = load_sciq()
-counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
-# Building BM25 index and save:
-bm25_index = BM25Index.build_from_documents(
-    documents=iter(sciq.corpus),
-    ndocs=12160,
-    show_progress_bar=True
-)
-bm25_index.save("output/bm25_index")
-# Loading index and use BM25 retriever to retrieve:
-bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?"))  # the ranking
-plots_b: Dict[str, List[float]] = {
-    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
-    "Y": []
-}
-plots_k1: Dict[str, List[float]] = {
-    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
-    "Y": []
-}
-## YOUR_CODE_STARTS_HERE
-class MyBMIndex(BM25Index):
-    @staticmethod
-    def calc_regularized_tf(
-        tf: int, dl: float, avgdl: float, k1: float, b: float
-    ) -> float:
-        return tf * (k1 + 1) / (tf + k1 * (1 - b + b * (dl / avgdl)**1.5))
-    @staticmethod
-    def calc_idf(df: int, N: int):
-        return math.log((N + 1) / (df + 0.5)) + 1
-import numpy as np
-# Two steps should be involved:
-# Step 1. Fix k1 value to the default one 0.9,
-# go through all the candidate b values (0, 0.1, ..., 1.0),
-# and record in plots_b["Y"] the corresponding performances obtained via evaluate_map;
-# Step 2. Fix b to the best one in step 1. and do the same for k1.
-# Hint (on using the pre-requisite code):
-# - One can use the loaded sciq dataset directly (loaded in the pre-requisite code);
-# - One can build bm25_index with `BM25Index.build_from_documents`;
-# - One can use BM25Retriever to load the index and perform retrieval on the dev queries
-# (dev queries can be obtained via sciq.get_split_queries(Split.dev))
-counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
-def get_ranking(k1, b, counting) -> Dict[str, Dict[str, float]]:
-  # Building BM25 index and save:
-  bm25_index = MyBMIndex.build_from_documents(
-      documents=iter(sciq.corpus),
-      ndocs=12160,
-      show_progress_bar=True,
-      k1=k1,
-      b=b
-  )
-  bm25_index.save("output/bm25_index")
-  # Loading index and use BM25 retriever to retrieve:
-  bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-  query_terms = sciq.get_split_queries(split= Split.dev)
-  rankings = {}
-  for query in query_terms:
-    ranking = bm25_retriever.retrieve(query=query.text)
-    rankings[query.query_id] = ranking
-  return rankings
-for b in plots_b["X"]:
-  ranking = get_ranking(0.9, b, counting)
-  plots_b["Y"].append(evaluate_map(rankings=ranking))
-max_b = np.max(plots_b["Y"])
-for k1 in plots_k1["X"]:
-  ranking = get_ranking(k1, max_b, counting)
-  plots_k1["Y"].append(evaluate_map(rankings=ranking))
-## YOU_CODE_ENDS_HERE
-## TEST_CASES (should be close to 0.8135637188208616 and 0.7512916099773244)
-print(plots_k1["Y"][9])
-print(plots_b["Y"][1])
-## RESULT_CHECKING_POINT
-print(plots_k1)
-print(plots_b)
-from matplotlib import pyplot as plt
-plt.plot(plots_b["X"], plots_b["Y"], label="b")
-plt.plot(plots_k1["X"], plots_k1["Y"], label="k1")
-plt.ylabel("MAP")
-plt.legend()
-plt.grid()
-plt.show()
-"""Let's check the effectiveness gain on test after this tuning on dev"""
-default_map = 0.7849
-best_b = plots_b["X"][np.argmax(plots_b["Y"])]
-best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]
-bm25_index = BM25Index.build_from_documents(
-    documents=iter(sciq.corpus),
-    ndocs=12160,
-    show_progress_bar=True,
-    k1=best_k1,
-    b=best_b
-)
-bm25_index.save("output/bm25_index")
-bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-rankings = {}
-for query in sciq.get_split_queries(Split.test):  # note this is now on test
-  ranking = bm25_retriever.retrieve(query=query.text)
-  rankings[query.query_id] = ranking
-optimized_map = evaluate_map(rankings, split=Split.test)  # note this is now on test
-print(default_map, optimized_map)
-"""# TASK3: a search-engine demo based on Huggingface space (4 points)
-## TASK3.1: create the gradio app (2 points)
-Create a gradio app to demo the BM25 search engine index on SciQ. The app should have a single input variable for the query (of type `str`) and a single output variable for the returned ranking (of type `List[Hit]` in the code below). Please use the BM25 system with default k1 and b values.
-Hint: it should use a "search" function of signature:
-```python
-def search(query: str) -> List[Hit]:
-  ...
-```
-"""
-import gradio as gr
-from typing import TypedDict
 class Hit(TypedDict):
-  cid: str
-  score: float
-  text: str
-demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
-return_type = List[Hit]
-## YOUR_CODE_STARTS_HERE
 def search_sciq(query: str) -> List[Hit]:
     results = bm25_retriever.retrieve(query)
-    hitlist = []
     for cid, score in results.items():
-        index = bm25_retriever.index.cid2docid[cid]
-        text = bm25_retriever.index.doc_texts[index]
-        hitlist.append(Hit(cid=cid, score=score, text=text))
-    return hitlist
 demo = gr.Interface(
     fn=search_sciq,
     inputs="textbox",
-    outputs="textbox",
     description="BM25 Search Engine Demo on SciQ Dataset"
 )
-## YOUR_CODE_ENDS_HERE
-demo.launch()

 # -*- coding: utf-8 -*-
 from dataclasses import dataclass
 import os
+import pickle
+from typing import List, Dict, Optional, Type, TypeVar, TypedDict
 import re
+import math
+from collections import Counter
+import gradio as gr
 import nltk
+from nlp4web_codebase.ir.data_loaders.dm import Document
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+from nlp4web_codebase.ir.models import BaseRetriever
 from nltk.corpus import stopwords as nltk_stopwords
+# Check nltk stopwords data
+try:
+    nltk.data.find("corpora/stopwords")
+except LookupError:
+    nltk.download("stopwords", quiet=True)
+# Tokenization and helper functions
 LANGUAGE = "english"
 stopwords = set(nltk_stopwords.words(LANGUAGE))
+word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 def simple_tokenize(text: str) -> List[str]:
+    words = word_splitter(text.lower())
+    tokenized = [word for word in words if word not in stopwords]
     return tokenized
 @dataclass
 class PostingList:
+    term: str
+    docid_postings: List[int]
+    tweight_postings: List[float]
+T = TypeVar("T", bound="InvertedIndex")
 @dataclass
 class InvertedIndex:
+    posting_lists: List[PostingList]
     vocab: Dict[str, int]
+    cid2docid: Dict[str, int]
+    collection_ids: List[str]
+    doc_texts: Optional[List[str]] = None
     def save(self, output_dir: str) -> None:
         os.makedirs(output_dir, exist_ok=True)
     @classmethod
     def from_saved(cls: Type[T], saved_dir: str) -> T:
         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+            return pickle.load(f)
 @dataclass
 class BM25Index(InvertedIndex):
     @staticmethod
     def cache_term_weights(
+        posting_lists: List[PostingList], total_docs: int, avgdl: float, dfs: List[int], dls: List[int], k1: float, b: float,
     ) -> None:
         N = total_docs
+        for tid, posting_list in enumerate(posting_lists):
             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
+            for i, docid in enumerate(posting_list.docid_postings):
                 tf = posting_list.tweight_postings[i]
                 dl = dls[docid]
+                posting_list.tweight_postings[i] = BM25Index.calc_regularized_tf(
                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                ) * idf
     @staticmethod
+    def calc_regularized_tf(tf: int, dl: float, avgdl: float, k1: float, b: float) -> float:
         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
     @staticmethod
     @classmethod
     def build_from_documents(
+        cls: Type[BM25Index], documents: List[Document], avgdl: float, total_docs: int, k1: float = 0.9, b: float = 0.4
     ) -> BM25Index:
+        # Assume run_counting() is defined to return counting object with relevant data
+        counting = run_counting(documents, simple_tokenize)
+        BM25Index.cache_term_weights(counting.posting_lists, total_docs, avgdl, counting.dfs, counting.dls, k1, b)
+        return cls(counting.posting_lists, counting.vocab, counting.cid2docid, counting.collection_ids, counting.doc_texts)
+class BM25Retriever(BaseRetriever):
     def __init__(self, index_dir: str) -> None:
+        self.index = BM25Index.from_saved(index_dir)
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        toks = simple_tokenize(query)
+        docid2score = Counter()
         for tok in toks:
+            if tok in self.index.vocab:
+                tid = self.index.vocab[tok]
+                posting_list = self.index.posting_lists[tid]
+                for docid, weight in zip(posting_list.docid_postings, posting_list.tweight_postings):
+                    docid2score[docid] += weight
         return {
+            self.index.collection_ids[docid]: score for docid, score in docid2score.most_common(topk)
         }
+# Gradio app setup
 class Hit(TypedDict):
+    cid: str
+    score: float
+    text: str
 def search_sciq(query: str) -> List[Hit]:
     results = bm25_retriever.retrieve(query)
+    hits = []
     for cid, score in results.items():
+        docid = bm25_retriever.index.cid2docid[cid]
+        text = bm25_retriever.index.doc_texts[docid]
+        hits.append(Hit(cid=cid, score=score, text=text))
+    return hits
+bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
 demo = gr.Interface(
     fn=search_sciq,
     inputs="textbox",
+    outputs="json",
     description="BM25 Search Engine Demo on SciQ Dataset"
 )
+if __name__ == "__main__":
+    demo.launch()