File size: 1,414 Bytes
e55a136
2c2081e
 
 
dbfaf91
e55a136
 
66fcc65
2c2081e
 
66fcc65
2c2081e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66fcc65
2c2081e
 
 
 
 
 
 
66fcc65
2c2081e
 
 
e55a136
2c2081e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Define mlbee."""
# pylint: disable=invalid-name
from typing import List

from cmat2aset310 import cmat2aset
from logzero import logger

from st_mlbee.gen_cmat import gen_cmat


def st_mlbee(
    text1: List[str],
    text2: List[str],
    eps: float = 10,
    min_samples: int = 6,
    # show_plot: bool = True,
):
    """Align multilingual texts.

    Computes ``cmat = gen_cmat(text1, text2)`` and then
    ``aset = cmat2aset(cmat, eps=eps, min_samples=min_samples)``.
    Both intermediate results are also stored as attributes on this
    function (``st_mlbee.cmat`` and ``st_mlbee.aset``) so callers can
    inspect them after the call.

    Args:
        text1: list of strings
        text2: list of strings
        eps: epsilon, forwarded unchanged to ``cmat2aset``
        min_samples: minimum number of points to be considered as a
            cluster, forwarded unchanged to ``cmat2aset``

    Note:
        A former ``show_plot`` option was refactored to cmat2html's
        ``show_plot`` and is no longer accepted here.

    Returns:
        The value returned by ``cmat2aset`` (``aset``), not paired texts.
        NOTE(review): presumably an alignment set of index pairs -- confirm
        against cmat2aset.

    Raises:
        ValueError: if ``text1`` or ``text2`` contains no string with
            non-whitespace content.
        Exception: anything raised by ``gen_cmat`` or ``cmat2aset`` is
            logged with its traceback and re-raised unchanged.
    """
    # Each side needs at least one non-blank entry; any() stops at the first
    # hit instead of building a filtered copy of each list.
    has_text1 = any(elm.strip() for elm in text1)
    has_text2 = any(elm.strip() for elm in text2)
    if not (has_text1 and has_text2):
        logger.warning("One or both inputs are empty")
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working while new callers can be more specific.
        raise ValueError("Nothing to do...exiting")

    try:
        cmat = gen_cmat(text1, text2)
        # logger.level is reset to 20 in fastlid
        # Kept on the function object so callers can read it after the call.
        st_mlbee.cmat = cmat
    except Exception as e:
        # Log the traceback here, then let the caller decide what to do.
        logger.exception(e)
        raise

    try:
        aset = cmat2aset(cmat, eps=eps, min_samples=min_samples)
        st_mlbee.aset = aset
    except Exception as e:
        logger.exception(e)
        raise

    # paired texts or aset?
    return aset