|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
from typing import List, Dict |
|
|
|
from pyserini.pyclass import autoclass |
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Handles to the Java classes used by this module, looked up by their fully
# qualified class names through ``autoclass``.

# Anserini indexer that ``LuceneIndexer`` wraps.
JLuceneIndexer = autoclass('io.anserini.index.SimpleIndexer')

# Nested ``Document`` class of Anserini's ``JsonCollection``; used to build
# documents from an (id, contents) pair.
JsonCollectionDocument = autoclass('io.anserini.collection.JsonCollection$Document')

# Jackson ``ObjectMapper``; one instance is created per ``LuceneIndexer``.
JacksonObjectMapper = autoclass('com.fasterxml.jackson.databind.ObjectMapper')

# Jackson ``JsonNode``; referenced in this module only as a type annotation.
JacksonJsonNode = autoclass('com.fasterxml.jackson.databind.JsonNode')
|
|
|
|
|
class LuceneIndexer:
    """Wrapper class for ``SimpleIndexer`` in Anserini. Provides basic functionality for on-the-fly indexing via a
    programmatic API, i.e., indexing in-process objects as opposed to on-file documents.

    Parameters
    ----------
    index_dir : str
        Path to Lucene index directory. Stored on the instance as ``index_dir``; it is only forwarded to
        ``SimpleIndexer`` when ``args`` is empty or not supplied.
    args : List[str]
        List of arguments to pass to ``SimpleIndexer``. When non-empty, the indexer is constructed from these
        arguments (plus ``-input``, ``-collection``, ``-threads`` and, if requested, ``-append``) instead of from
        ``index_dir``. The list passed in is never modified.
    append : bool
        Append to existing index.
    threads : int
        Number of indexing threads.
    """

    def __init__(self, index_dir: str = None, args: List[str] = None, append: bool = False, threads: int = 8):
        self.index_dir = index_dir

        if args:
            # Build the final argument list on a private copy: extending the
            # caller's list in place would leak the extra flags back to the
            # caller and duplicate them if the same list were reused to
            # construct another indexer.
            full_args = list(args)
            full_args.extend(['-input', '', '-collection', 'JsonCollection', '-threads', str(threads)])
            if append:
                full_args.append('-append')
            # ``self.args`` holds the complete argument list handed to Java.
            self.args = full_args
            self.object = JLuceneIndexer(full_args)
        else:
            # No explicit arguments: keep whatever was passed (``None`` or an
            # empty list) and use the (index_dir, append, threads) constructor.
            self.args = args
            self.object = JLuceneIndexer(index_dir, append, int(threads))

        # Jackson object mapper instance; not used by the methods of this
        # class, kept as a public attribute.
        self.mapper = JacksonObjectMapper()

    def add_doc_raw(self, doc: str):
        """Add a raw document (in the form of a JSON string) to the index.

        Parameters
        ----------
        doc : str
            Document to add.
        """
        self.object.addRawDocument(doc)

    def add_doc_dict(self, doc: Dict[str, str]):
        """Add a document (in the form of a Python dictionary) to the index.

        Only the ``'id'`` and ``'contents'`` entries of the dictionary are used.

        Parameters
        ----------
        doc : Dict[str, str]
            Document to add. Must contain the keys ``'id'`` and ``'contents'``.

        Raises
        ------
        KeyError
            If ``doc`` lacks an ``'id'`` or ``'contents'`` key.
        """
        self.object.addJsonDocument(JsonCollectionDocument.fromFields(doc['id'], doc['contents']))

    def add_doc_json(self, node: JacksonJsonNode):
        """Add a document (in the form of a Jackson JSON node object) to the index.

        Parameters
        ----------
        node : JacksonJsonNode
            Document to add.
        """
        self.object.addJsonNode(node)

    def add_batch_raw(self, docs: List[str]):
        """Add a batch of raw documents (in the form of JSON strings) to the index.

        Parameters
        ----------
        docs : List[str]
            Documents to add.
        """
        self.object.addRawDocuments(docs)

    def add_batch_dict(self, docs: List[Dict[str, str]]):
        """Add a batch of documents (in the form of Python dictionaries) to the index.

        Only the ``'id'`` and ``'contents'`` entries of each dictionary are used.

        Parameters
        ----------
        docs : List[Dict[str, str]]
            Documents to add. Each must contain the keys ``'id'`` and ``'contents'``.

        Raises
        ------
        KeyError
            If any dictionary lacks an ``'id'`` or ``'contents'`` key.
        """
        # Convert every dictionary up front, without rebinding the ``docs``
        # parameter, then hand the whole batch over in a single call.
        json_docs = [JsonCollectionDocument.fromFields(d['id'], d['contents']) for d in docs]
        self.object.addJsonDocuments(json_docs)

    def add_batch_json(self, nodes: List[JacksonJsonNode]):
        """Add a batch of documents (in the form of Jackson JSON node objects) to the index.

        Parameters
        ----------
        nodes : List[JacksonJsonNode]
            Documents to add.
        """
        self.object.addJsonNodes(nodes)

    def close(self):
        """Close this indexer, committing all in-memory data to disk."""
        self.object.close()
|
|