ffreemt commited on
Commit
4913387
·
1 Parent(s): 6f69137

Update split_text.py

Browse files
Files changed (1) hide show
  1. st_mlbee/split_text.py +105 -0
st_mlbee/split_text.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Split text to sentences.
3
+
4
+ Modified from seg_text.seg_text.py, vtext sentence_splitter removed, use pysbd.
5
+
6
+ Use sentence_splitter if supported,
7
+ else use polyglot.text.Text
8
+
9
+ !apt install libicu-dev
10
+ !pip install pyicu pycld2
11
+ !pip install polyglot sentence_splitter
12
+
13
+ Use vtext and fastlid to get rid of polyglot?
14
+
15
+ from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
16
+ tok = UnicodeSentenceTokenizer()
17
+ seg = tok.tokenize(''' Text ''') for langs not in LANG_S
18
+
19
+ """
20
+ # pylint: disable=invalid-name
21
+
22
+ import re
23
+ from typing import List, Optional, Union
24
+
25
+ import pysbd
26
+ from fastlid import fastlid
27
+ from loguru import logger
28
+ from tqdm.auto import tqdm
29
+
30
+
31
+ def _seg_text(
32
+ text: str,
33
+ lang: Optional[str] = None,
34
+ ) -> List[str]:
35
+ """
36
+ Split text to sentences.
37
+
38
+ Switched to pysbd
39
+
40
+ Args:
41
+ ----
42
+ text: string to split
43
+ lang: language, two-letter ISO (22 languages)
44
+
45
+ Returns:
46
+ -------
47
+ List of segmented sentences
48
+
49
+ """
50
+ if lang is None:
51
+ try:
52
+ lang, _ = fastlid(text)
53
+ except Exception as exc:
54
+ logger.warning(" fastlid: %s, setting lang='en'", exc)
55
+ lang = "en"
56
+
57
+ if not text.strip():
58
+ return []
59
+
60
+ seg = pysbd.Segmenter(language=lang, clean=True)
61
+
62
+ try:
63
+ # _ = tok.tokenize(text)
64
+ _ = seg.segment(text)
65
+ except Exception as exc:
66
+ logger.exception(f"pysbd.Segmenter, {exc=}")
67
+ raise
68
+ return _
69
+
70
+
71
def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Split a text or a list of texts into sentences.

    Arguments:
        lst: text or text list
        lang: optional lang code, passed on to _seg_text for every element
        maxlines: (default 1000) kept so existing callers keep working;
            it is not forwarded because _seg_text takes no such parameter
        extra: optional regex; a newline is inserted after every match
            in each text before it is segmented
    Returns:
        list of split sentences, in the order of the input texts.
    """
    if isinstance(lst, str):
        lst = [lst]

    if extra:
        # Compile once; the capture group keeps the matched text and the
        # replacement appends a newline right after it.
        pattern = re.compile(rf"({extra})")
        lst = [pattern.sub(r"\1\n", elm) for elm in lst]

    res: List[str] = []
    for elm in lst:
        # Only pass arguments _seg_text accepts; forwarding maxlines
        # raised TypeError on every non-empty input.
        res.extend(_seg_text(elm, lang=lang))

    return res