# Hugging Face Space: svenwey / logmetric (status: Sleeping)
# File size: 12,915 Bytes

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TODO: Add a description here."""

import evaluate
import datasets
import re
import dateutil.parser
import numpy as np
from typing import List, Dict, Any

# Regex for a timestamp at the beginning of a log line. It accepts optional
# leading whitespace and an optional opening "[", then a date written as
# YYYY<sep>MM<sep>DD with "-", "/" or "." as separator. The date may be followed
# by a time part (" " or "T", then HH:MM, optionally :SS and a fractional part
# introduced by "." or ",") and an optional zone suffix ("Z" or +HH:MM / -HH:MM).
# An optional closing "]" and trailing whitespace are consumed as well.
# Capture group 1 holds the timestamp text without brackets or padding.
timestamp_regex = r'^\s*\[?\s*(\d{4}[-/.]\d{2}[-/.]\d{2}(?:[ T]\d{2}[:]\d{2}(?:[:]\d{2}(?:[.,]\d+)?)?(?:Z|[+-]\d{2}[:]\d{2})?)?)\s*\]?\s*'
# Compiled once at import time; re.MULTILINE lets "^" also match after a newline.
TIMESTAMP_PATTERN = re.compile(timestamp_regex, re.MULTILINE)

# Optionally signed run of digits. Note that this also matches the digit runs
# on either side of the "." in a decimal number.
INT_PATTERN = re.compile(r'(-?\d+)')
# Optionally signed decimal number with digits on both sides of the ".".
FLOAT_PATTERN = re.compile(r'(-?\d+\.\d+)')
# sacreBLEU metric object, loaded once when this module is imported and reused
# for every BLEU-based score.
SACREBLEU_METRIC = evaluate.load("evaluate-metric/sacrebleu")

# TODO: Add BibTeX citation
# The entry below is still the placeholder from the module template. It is
# passed as `citation` to evaluate.MetricInfo in LogMetric._info.
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# TODO: Add description of the module here
# Placeholder text from the module template. It is used as the `description`
# of the metric and is also prepended to the LogMetric class docstring.
_DESCRIPTION = """\
This new module is designed to solve this great ML task and is crafted with a lot of care.
"""


# TODO: Add description of the arguments of the module here
# NOTE: the text below still describes the template's example module
# ("my_new_module", an "accuracy" score); it does not describe the score names
# that this file actually computes. It is used as `inputs_description`.
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
Returns:
    accuracy: description of the first score,
    another_score: description of the second score,
Examples:
    Examples should be written in doctest format, and should illustrate how
    to use the function.

    >>> my_new_module = evaluate.load("my_new_module")
    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
    >>> print(results)
    {'accuracy': 1.0}
"""

# TODO: Define external resources urls if needed
# Placeholder URL from the module template; it is not referenced anywhere else
# in the visible part of this file.
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class LogMetric(evaluate.Metric):
    """Metric that scores predicted log text against reference log text.

    Every prediction/reference pair is scored by ``PredRefScore``; the
    per-pair scores are then averaged key by key over all pairs.
    """

    def _info(self):
        """Return the ``evaluate.MetricInfo`` describing this module.

        Both ``predictions`` and ``references`` are declared as plain strings.
        """
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            # Both prediction and reference are strings
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _compute(self, predictions, references):
        """Average the per-pair log scores over all prediction/reference pairs.

        Args:
            predictions: iterable of predicted log texts (strings).
            references: iterable of reference log texts (strings), paired with
                ``predictions`` by position.

        Returns:
            A dict mapping each score name to its mean over all pairs, or an
            empty dict when there are no pairs to score.
        """
        # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
        # Take a snapshot (dict(...)) of every result: if the scorer hands back
        # a dictionary object that is reused between runs, later runs would
        # otherwise overwrite the earlier results collected here.
        metric_dicts = [dict(PredRefScore(p, r).run()) for p, r in zip(predictions, references)]

        # Nothing to average; avoids indexing into an empty list below.
        if not metric_dicts:
            return {}

        # Extract keys (assuming all dictionaries have the same keys)
        keys = list(metric_dicts[0])

        # One row per pair, one column per score; values are looked up by key
        # so the column order never depends on dict insertion order.
        values = np.array([[scores[key] for key in keys] for scores in metric_dicts])

        # Calculate the mean along the vertical axis (axis=0)
        mean_values = np.mean(values, axis=0)

        # a dictionary, matching the keys with their corresponding mean values
        return dict(zip(keys, mean_values))
    

class PredRefScore:
    """Computes all log-quality scores for one prediction/reference pair.

    The scores are stored in ``self.scores`` under these keys:
    ``linecount_difference_SMAPE_score``, ``linecontent_sacrebleu_score``,
    ``linecontentlength_difference_SMAPE_score``,
    ``linecontent_sacrebleu_withoutExplicitNumbers_score``,
    ``timestamps_SMAPE_difference_score``,
    ``timestamps_formatConsistency_score`` and
    ``timestamps_monotinicity_score``.
    """

    # Score name -> value. Only annotated here; the dict itself is created per
    # instance in __init__ so that two instances never write into one dict.
    scores: Dict[str, float]

    def __init__(self, prediction: str, reference: str) -> None:
        """Store both texts with surrounding spaces, tabs and newlines removed."""
        self.reference = reference.strip(' \t\n\r')
        self.prediction = prediction.strip(' \t\n\r')
        self.scores = {}

    def run(self) -> Dict[str, float]:
        """Compute every score and return the score dictionary of this instance."""
        self.getLogMetric()
        return self.scores

    ##### Convenience Methods #####

    # TODO: also set pred_ts, ref_ts, pred_msgs and ref_msgs as fields

    def get_length_score(self, preds_split: List[Any], refs_split: List[Any]) -> float:
        """Return the SMAPE-based score of the element-wise lengths.

        Both lists must have the same number of elements.
        """
        pred_content_lengths = [len(item) for item in preds_split]
        ref_content_lengths = [len(item) for item in refs_split]
        return self.smapeScore(pred_content_lengths, ref_content_lengths)

    def smapeScore(self, P, R) -> float:
        """Return ``1 - SMAPE`` between two numbers or two equally long sequences.

        Args:
            P: predicted value(s); a scalar or a sequence of numbers.
            R: reference value(s); must be a scalar if ``P`` is one, otherwise
                a sequence of the same length as ``P``.

        Returns:
            1.0 for identical inputs (including two zeros or two empty
            sequences), decreasing towards 0.0 as the values diverge.

        Raises:
            ValueError: if one argument is a scalar and the other is not, or
                if the sequences differ in length.
        """
        # np.ndim also recognises NumPy scalars (e.g. np.int64), not only
        # the builtin int/float types.
        if (np.ndim(P) == 0) != (np.ndim(R) == 0):
            raise ValueError("P and R must either both be numbers or both be sequences")

        p_values = np.atleast_1d(np.asarray(P, dtype=float))
        r_values = np.atleast_1d(np.asarray(R, dtype=float))
        if p_values.shape != r_values.shape:
            raise ValueError("P and R must have the same length")

        n = len(p_values)
        if n == 0:
            return 1.0     # since this leads to n = 0

        denominator = np.abs(r_values) + np.abs(p_values)
        # denominator[i] == 0 is only possible if R[i] == P[i] == 0; the
        # numerator is 0 there as well, so dividing by 1 yields the wanted 0.
        denominator[denominator == 0] = 1

        return float(1 - (1.0 / n * np.sum(np.abs(r_values - p_values) / denominator)))

    def replaceNumbers(self, text: str) -> str:
        """Replace numbers in ``text`` with ``<|FLOAT|>`` / ``<|INT|>`` placeholders."""
        # Decimal numbers have to be replaced first: the integer pattern also
        # matches the digits around the ".", so running it first would leave
        # nothing for the float pattern to match.
        text = FLOAT_PATTERN.sub(r'<|FLOAT|>', text)
        return INT_PATTERN.sub(r'<|INT|>', text)

    @staticmethod
    def _split_timestamp(line: str):
        """Return ``(timestamp, message)`` for one line; timestamp is None if absent."""
        match = TIMESTAMP_PATTERN.match(line)
        if match is None:
            return None, line
        return match.group(1), line[match.end():]

    def split_log_entry(self, pred: str, ref: str):
        """Split both logs line by line into timestamps and log messages.

        Prediction lines without a leading timestamp are kept whole as a
        message. Every reference line must start with a timestamp. The shorter
        message list is padded with ``" "`` so both message lists end up with
        the same length; the timestamp lists are not padded.

        Returns:
            ``(pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages)``

        Raises:
            ValueError: if a reference line does not start with a timestamp.
        """
        # One logentry always consists of timestamp + log-message
        pred_timestamps, pred_logMessages = [], []
        ref_timestamps, ref_logMessages = [], []

        for line in pred.splitlines():
            pred_ts, pred_msg = self._split_timestamp(line)
            if pred_ts is not None:
                pred_timestamps.append(pred_ts)
            pred_logMessages.append(pred_msg)

        for line in ref.splitlines():
            ref_ts, ref_msg = self._split_timestamp(line)
            if ref_ts is None:
                raise ValueError("The provided regex can't parse a timestamp in a reference log. Please make sure that the regex can parse a provided reference log format. Line: " + line)
            ref_timestamps.append(ref_ts)
            ref_logMessages.append(ref_msg)

        # We extend the shorter list to the length of the longer one
        max_logentries = max(len(pred_logMessages), len(ref_logMessages))
        pred_logMessages += (max_logentries - len(pred_logMessages)) * [" "]
        ref_logMessages += (max_logentries - len(ref_logMessages)) * [" "]

        return pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages

    ##### Individual Setter Methods for Scores #####

    def set_linecount_score(self, pred: str, ref: str) -> None:
        """Store the SMAPE-based score of the two line counts."""
        pred_lines_amt = len(pred.splitlines())
        ref_lines_amt = len(ref.splitlines())
        self.scores["linecount_difference_SMAPE_score"] = self.smapeScore(pred_lines_amt, ref_lines_amt)

    def set_sacrebleu_score(self, pred_log_messages: List[str], ref_log_messages: List[str]) -> None:
        """Store the sacreBLEU score of the log messages, scaled to 0..1."""
        sacrebleu_score = SACREBLEU_METRIC.compute(predictions=pred_log_messages, references=ref_log_messages)["score"] / 100.0
        self.scores["linecontent_sacrebleu_score"] = sacrebleu_score

    def set_smape_length_score(self, pred_log_messages: List[str], ref_log_messages: List[str]) -> None:
        """Store the SMAPE-based score of the per-message lengths."""
        smape_length_score = self.get_length_score(pred_log_messages, ref_log_messages)
        self.scores["linecontentlength_difference_SMAPE_score"] = smape_length_score

    def set_sacrebleu_withoutexplnumbers_score(self, pred_log_messages: List[str], ref_log_messages: List[str]) -> None:
        """Store the sacreBLEU score after replacing numbers with placeholders."""
        # Plain lists of str are passed on, same as in set_sacrebleu_score.
        cleaned_pred_logMessages = [self.replaceNumbers(msg) for msg in pred_log_messages]
        cleaned_ref_logMessages = [self.replaceNumbers(msg) for msg in ref_log_messages]
        sacrebleu_withoutExplicitNumbers_score = SACREBLEU_METRIC.compute(predictions=cleaned_pred_logMessages, references=cleaned_ref_logMessages)["score"] / 100.0
        self.scores["linecontent_sacrebleu_withoutExplicitNumbers_score"] = sacrebleu_withoutExplicitNumbers_score

    def all_linecontent_scores(self, pred_logMessages: List[str], ref_logMessages: List[str]) -> None:
        """Compute all scores that look at the content of the log messages."""
        # Two empty logs are scored as one pair of empty messages.
        if not pred_logMessages and not ref_logMessages:
            pred_logMessages = [""]
            ref_logMessages = [""]

        self.set_sacrebleu_score(pred_logMessages, ref_logMessages)
        self.set_smape_length_score(pred_logMessages, ref_logMessages)
        self.set_sacrebleu_withoutexplnumbers_score(pred_logMessages, ref_logMessages)

    def set_timestamp_amt_score(self, pred_timestamps: List[str], ref_timestamps: List[str]) -> None:
        """Store the SMAPE-based score of the two timestamp counts."""
        timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
        self.scores["timestamps_SMAPE_difference_score"] = timestamp_amt_score

    def set_timestamp_format_consistency_score(self, pred_timestamps, ref_timestamps) -> None:
        """Store 1.0 if the timestamp format check passes, else 0.0.

        The format is derived from the first predicted timestamp by replacing
        each digit with a digit wildcard; every reference timestamp must fully
        match it. Without predicted timestamps the score is 1.0.
        """
        if not pred_timestamps:
            self.scores["timestamps_formatConsistency_score"] = 1.0
            return

        # NOTE(review): the pattern comes from the prediction but is matched
        # against the reference timestamps; confirm that this is intended and
        # that the remaining predicted timestamps need not be checked.
        pred_timestring_pattern = re.compile(re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0])).strip())
        all_consistent = all(pred_timestring_pattern.fullmatch(ts.strip()) is not None for ts in ref_timestamps)

        self.scores["timestamps_formatConsistency_score"] = 1.0 if all_consistent else 0.0

    def set_timestamp_monotonicity_score(self, pred_timestamps) -> None:
        """Store 1.0 if the predicted timestamps never decrease, else 0.0.

        Timestamps that cannot be parsed or cannot be compared with each other
        give a score of 0.0.
        """
        try:
            parsed_times = [dateutil.parser.parse(ts) for ts in pred_timestamps]  # Parse all timestamps
            # Check if the timestamps are monotonically increasing
            all_monotone = all(t1 <= t2 for t1, t2 in zip(parsed_times, parsed_times[1:]))
        except (ValueError, OverflowError, TypeError):
            # ValueError covers dateutil's ParserError; OverflowError is raised
            # for out-of-range values; TypeError is raised when a timestamp
            # with a zone offset is compared with one that has none.
            all_monotone = False

        self.scores["timestamps_monotinicity_score"] = 1.0 if all_monotone else 0.0

    def all_timestamp_scores(self, pred_timestamps, ref_timestamps) -> None:
        """Compute all scores that look at the timestamps."""
        self.set_timestamp_amt_score(pred_timestamps, ref_timestamps)
        self.set_timestamp_format_consistency_score(pred_timestamps, ref_timestamps)
        self.set_timestamp_monotonicity_score(pred_timestamps)

    def getLogMetric(self) -> None:
        """Driver: fill ``self.scores`` with every score for this pair."""
        self.set_linecount_score(self.prediction, self.reference)
        # Split log on timestamps
        pred_timestamps, pred_logMessages, ref_timestamps, ref_logMessages = self.split_log_entry(self.prediction, self.reference)
        self.all_linecontent_scores(pred_logMessages, ref_logMessages)
        self.all_timestamp_scores(pred_timestamps, ref_timestamps)