Spaces:

castorini
/

ONNX-Demo

Build error

App Files Files Community

ONNX-Demo / pyserini /fusion /_base.py

ArthurChen189

upload pyserini

62977bb over 1 year ago

raw

history blame contribute delete

4.4 kB

	#
	# Pyserini: Reproducible IR research with sparse and dense representations
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	from enum import Enum
	from pyserini.trectools import AggregationMethod, RescoreMethod, TrecRun
	from typing import List


	class FusionMethod(Enum):
	    """Identifiers for the fusion strategies; each member mirrors a fusion function in this module."""

	    # Reciprocal rank fusion (see ``reciprocal_rank_fusion``).
	    RRF = 'rrf'
	    # Weighted interpolation of exactly two runs (see ``interpolation``).
	    INTERPOLATION = 'interpolation'
	    # Equal-weight averaging of all runs (see ``average``).
	    AVERAGE = 'average'


	def reciprocal_rank_fusion(runs: List[TrecRun], rrf_k: int = 60, depth: int = None, k: int = None):
	    """Fuse a list of ``TrecRun`` objects with reciprocal rank fusion (RRF).

	    The approach follows Cormack et al. (SIGIR 2009), "Reciprocal Rank Fusion Outperforms Condorcet and
	    Individual Rank Learning Methods." Each input run is cloned and rescored with ``RescoreMethod.RRF``, and the
	    rescored runs are then merged with ``AggregationMethod.SUM``.

	    Parameters
	    ----------
	    runs : List[TrecRun]
	        The ``TrecRun`` objects to fuse.
	    rrf_k : int
	        Constant that keeps lower-ranked documents from losing all importance. This is not the k of top-k
	        retrieval; the default of 60 is the value used by Cormack et al.
	    depth : int
	        Maximum number of results taken from each input run. ``None`` (the default) means the complete list of
	        results is considered.
	    k : int
	        Length of the final results list. ``None`` (the default) means the union of all input documents is
	        ranked.

	    Returns
	    -------
	    TrecRun
	        A ``TrecRun`` combining the input runs via reciprocal rank fusion.
	    """

	    # Rescoring is applied to a clone of each run rather than to the run passed in by the caller.
	    # TODO: Add option to not clone runs, thus making the method destructive, but also more efficient.
	    rescored = []
	    for run in runs:
	        duplicate = run.clone()
	        rescored.append(duplicate.rescore(method=RescoreMethod.RRF, rrf_k=rrf_k))

	    return TrecRun.merge(rescored, AggregationMethod.SUM, depth=depth, k=k)


	def interpolation(runs: List[TrecRun], alpha: float = 0.5, depth: int = None, k: int = None):
	    """Perform fusion by interpolation on a list of exactly two ``TrecRun`` objects.

	    new_score = alpha * first_run_score + (1 - alpha) * second_run_score.

	    Each run is cloned and rescaled with ``RescoreMethod.SCALE``; the two scaled runs are then merged with
	    ``AggregationMethod.SUM``.

	    Parameters
	    ----------
	    runs : List[TrecRun]
	        List of ``TrecRun`` objects. Exactly two runs.
	    alpha : float
	        Weight applied to the scores of the first run; (1 - alpha) is applied to the second run.
	        Set to 0.5 by default.
	    depth : int
	        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates
	        that the complete list of results is considered.
	    k : int
	        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
	        documents is ranked.

	    Returns
	    -------
	    TrecRun
	        Output ``TrecRun`` that combines input runs via interpolation.

	    Raises
	    ------
	    ValueError
	        If ``runs`` does not contain exactly two runs.
	    """

	    if len(runs) != 2:
	        # ValueError is a subclass of Exception, so callers that catch the generic Exception keep working.
	        raise ValueError('Interpolation must be performed on exactly two runs.')

	    # Scale clones of the inputs so the runs passed in by the caller are not the objects being rescored.
	    scaled_runs = [
	        runs[0].clone().rescore(method=RescoreMethod.SCALE, scale=alpha),
	        runs[1].clone().rescore(method=RescoreMethod.SCALE, scale=(1 - alpha)),
	    ]

	    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)


	def average(runs: List[TrecRun], depth: int = None, k: int = None):
	    """Fuse a list of ``TrecRun`` objects by averaging.

	    Each run is cloned and rescaled by ``1 / len(runs)`` with ``RescoreMethod.SCALE``; the scaled runs are then
	    merged with ``AggregationMethod.SUM``.

	    Parameters
	    ----------
	    runs : List[TrecRun]
	        The ``TrecRun`` objects to fuse.
	    depth : int
	        Maximum number of results taken from each input run. ``None`` (the default) means the complete list of
	        results is considered.
	    k : int
	        Length of the final results list. ``None`` (the default) means the union of all input documents is
	        ranked.

	    Returns
	    -------
	    TrecRun
	        A ``TrecRun`` combining the input runs via averaging.
	    """

	    weighted = []
	    for run in runs:
	        # The weight is computed inside the loop on purpose: with an empty ``runs`` the division is never
	        # evaluated and the empty list is handed to ``TrecRun.merge`` unchanged.
	        weight = 1 / len(runs)
	        weighted.append(run.clone().rescore(method=RescoreMethod.SCALE, scale=weight))

	    return TrecRun.merge(weighted, AggregationMethod.SUM, depth=depth, k=k)