|
import re
|
|
from typing import TypedDict
|
|
|
|
from style_bert_vits2.constants import Languages
|
|
from style_bert_vits2.logging import logger
|
|
from style_bert_vits2.nlp import bert_models
|
|
from style_bert_vits2.nlp.japanese import pyopenjtalk_worker as pyopenjtalk
|
|
from style_bert_vits2.nlp.japanese.mora_list import MORA_KATA_TO_MORA_PHONEMES, VOWELS
|
|
from style_bert_vits2.nlp.japanese.normalizer import replace_punctuation
|
|
from style_bert_vits2.nlp.symbols import PUNCTUATIONS
|
|
|
|
|
|
def g2p(
    norm_text: str, use_jp_extra: bool = True, raise_yomi_error: bool = False
) -> tuple[list[str], list[int], list[int]]:
    """
    Main entry point: convert normalized Japanese text to phonemes, tones and word2ph.

    The text is analysed along two paths. `__g2phone_tone_wo_punct()` produces
    phonemes paired with accent values but without punctuation, whereas
    `text_to_sep_kata()` produces per-word readings in which punctuation is kept.
    `__align_tones()` then transfers the accent values onto the
    punctuation-preserving phoneme sequence.

    The returned tuple holds:
    - phones: phoneme symbols (punctuation such as `!`, `,`, `.` may be included).
    - tones: accent values, 0 (low) or 1 (high), same length as `phones`.
    - word2ph: phoneme counts, one entry per tokenizer token of every
      non-punctuation word and one entry per punctuation word.

    A `_` phoneme with tone 0 is added at both ends of `phones` / `tones`, and a
    matching `1` is added at both ends of `word2ph`.

    Args:
        norm_text (str): Text already normalized by `normalize_text()`.
        use_jp_extra (bool, optional): When False, every "N" phoneme is replaced
            by "n". Defaults to True.
        raise_yomi_error (bool, optional): Forwarded to `text_to_sep_kata()`;
            when False, unreadable characters are voiced as "'". Defaults to False.

    Returns:
        tuple[list[str], list[int], list[int]]: `(phones, tones, word2ph)`.

    Raises:
        AssertionError: If `len(phones)` differs from `sum(word2ph)`.
    """

    # Accent information first; this path loses every punctuation mark.
    toned_phones_no_punct = __g2phone_tone_wo_punct(norm_text)

    # Word surfaces and their readings; punctuation survives on this path.
    words, readings = text_to_sep_kata(norm_text, raise_yomi_error=raise_yomi_error)

    # One phoneme list per word, with long-vowel marks already resolved.
    phonemes_per_word = __handle_long(
        [__kata_to_phoneme_list(reading) for reading in readings]
    )

    # Flatten to a single phoneme sequence that still contains punctuation.
    flat_phonemes: list[str] = [
        phoneme for group in phonemes_per_word for phoneme in group
    ]

    # Attach the accent values to the punctuation-preserving sequence.
    toned_phones = __align_tones(flat_phonemes, toned_phones_no_punct)

    # Tokenize every non-punctuation word; a punctuation word is its own token.
    tokens_per_word: list[list[str]] = [
        [word]
        if word in PUNCTUATIONS
        else bert_models.load_tokenizer(Languages.JP).tokenize(word)
        for word in words
    ]

    # Spread each word's phonemes over that word's tokens.
    word2ph: list[int] = []
    for tokens, phonemes in zip(tokens_per_word, phonemes_per_word):
        word2ph.extend(__distribute_phone(len(phonemes), len(tokens)))

    # Sentinels at both ends.
    toned_phones = [("_", 0), *toned_phones, ("_", 0)]
    word2ph = [1, *word2ph, 1]

    phones = [pair[0] for pair in toned_phones]
    tones = [pair[1] for pair in toned_phones]

    assert len(phones) == sum(word2ph), f"{len(phones)} != {sum(word2ph)}"

    if not use_jp_extra:
        phones = ["n" if phone == "N" else phone for phone in phones]

    return phones, tones, word2ph
|
|
|
|
|
|
def text_to_sep_kata(
    norm_text: str, raise_yomi_error: bool = False
) -> tuple[list[str], list[str]]:
    """
    Split normalized text into words and return the words with their readings.

    Each node returned by `pyopenjtalk.run_frontend()` contributes one word
    (its `string` passed through `replace_punctuation()`) and one reading
    (its `pron` with every "’" removed). `g2p()` uses the word list to decide how
    phoneme counts are spread when building `word2ph`.

    Example (from the original documentation):
        `私はそう思う!って感じ?` →
        ["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]

    Args:
        norm_text (str): Text already normalized by `normalize_text()`.
        raise_yomi_error (bool, optional): When True, a word that cannot be read
            raises `YomiError`; when False, a warning is logged and the word is
            read as one "'" per character. Defaults to False.

    Returns:
        tuple[list[str], list[str]]: The words, and for each word its reading
        (katakana, or the punctuation itself).

    Raises:
        YomiError: For an unreadable word when `raise_yomi_error` is True.
        AssertionError: If a reading is empty, or a "?" reading belongs to a
            word other than "?".
    """

    sep_text: list[str] = []
    sep_kata: list[str] = []

    for node in pyopenjtalk.run_frontend(norm_text):
        word = replace_punctuation(node["string"])
        yomi = node["pron"].replace("’", "")

        # According to the original author, `yomi` is expected to be one of:
        # - ordinary word          -> its katakana reading (may contain `ー`, e.g. `アー`)
        # - word starting with `ー` -> something like `ーラー` or `ーーー`
        # - punctuation / blanks   -> `、`
        # - repeated punctuation   -> the full-width form of that punctuation
        # Punctuation normally arrives one character at a time, but longer runs
        # are sometimes merged into a single node. Unreadable scripts (Cyrillic,
        # Arabic, ...) would also yield `、`, although normalization is expected
        # to have removed them, and an empty reading is not expected at all.
        # So `、` is the only reading that needs real handling.
        assert yomi != "", f"Empty yomi: {word}"

        if yomi == "、":
            if set(word).issubset(set(PUNCTUATIONS)):
                # Pure punctuation: the "reading" is the punctuation itself.
                yomi = word
            else:
                # Something other than punctuation that OpenJTalk could not read.
                if raise_yomi_error:
                    raise YomiError(f"Cannot read: {word} in:\n{norm_text}")
                logger.warning(
                    f'Cannot read: {word} in:\n{norm_text}, replaced with "\'"'
                )
                yomi = "'" * len(word)
        elif yomi == "?":
            # NOTE(review): as written this re-assigns the value it just matched;
            # presumably a full-width/half-width normalisation — confirm.
            assert word == "?", f"yomi `?` comes from: {word}"
            yomi = "?"

        sep_text.append(word)
        sep_kata.append(yomi)

    return sep_text, sep_kata
|
|
|
|
|
|
def adjust_word2ph(
    word2ph: list[int],
    generated_phone: list[str],
    given_phone: list[str],
) -> list[int]:
    """
    Re-fit a `word2ph` produced by `g2p()` to a caller-supplied phoneme sequence.

    `generated_phone` is the phoneme sequence `word2ph` was built for, and
    `given_phone` is an alternative reading of the same text. When the two differ
    in length, `sum(word2ph)` no longer matches `len(given_phone)`. This function
    aligns the two sequences on their longest common subsequence and, for every
    run where they differ, changes only the `word2ph` entry that covers the start
    of that run, so the rest of the alignment is disturbed as little as possible.

    The first and last element of all three inputs are treated as the `_`
    sentinels added by `g2p()`: they are stripped before processing and `1` is
    put back at both ends of the result.

    After the adjustment every entry is pushed into the range 1..6 by moving the
    surplus or deficit to neighbouring entries (entries to the right first, then
    entries to the left), which keeps the total unchanged whenever the range
    allows it. The bound 6 is a hard-coded limit; its rationale is not visible
    in this file.

    Args:
        word2ph (list[int]): Phoneme counts, including the two sentinel entries.
        generated_phone (list[str]): Phonemes `word2ph` was generated for.
        given_phone (list[str]): Phonemes the result must add up to.

    Returns:
        list[int]: The adjusted `word2ph`, including the two sentinel entries.

    Raises:
        AssertionError: If the adjusted counts do not add up to the number of
            given phonemes (for example when `word2ph` and `generated_phone`
            are inconsistent with each other).
    """

    lower_bound, upper_bound = 1, 6

    # Work on the payload only; the sentinels are re-attached on return.
    word2ph = word2ph[1:-1]
    generated_phone = generated_phone[1:-1]
    given_phone = given_phone[1:-1]

    def longest_common_subsequence(
        X: list[str], Y: list[str]
    ) -> list[tuple[int, int]]:
        """
        Return the index pairs `(i, j)` with `X[i] == Y[j]` that form a longest
        common subsequence of X and Y, in increasing order.
        """
        m, n = len(X), len(Y)
        # table[i][j] = LCS length of X[:i] and Y[:j]
        table = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if X[i - 1] == Y[j - 1]:
                    table[i][j] = table[i - 1][j - 1] + 1
                else:
                    table[i][j] = max(table[i - 1][j], table[i][j - 1])

        # Walk back from the bottom-right corner to recover one optimal alignment.
        index_pairs: list[tuple[int, int]] = []
        i, j = m, n
        while i > 0 and j > 0:
            if X[i - 1] == Y[j - 1]:
                index_pairs.append((i - 1, j - 1))
                i -= 1
                j -= 1
            elif table[i - 1][j] >= table[i][j - 1]:
                i -= 1
            else:
                j -= 1
        index_pairs.reverse()
        return index_pairs

    def length_deltas(generated: list[str], given: list[str]) -> dict[int, int]:
        """
        Map the start index (in `generated`) of every differing run to the
        change in phoneme count that replacing it with the given run causes.
        """
        anchors = longest_common_subsequence(generated, given)
        # A virtual common element one past both ends closes the trailing run,
        # so the tail is measured exactly like every run between two matches.
        anchors.append((len(generated), len(given)))

        deltas: dict[int, int] = {}
        prev_x, prev_y = -1, -1
        for x, y in anchors:
            removed = x - prev_x - 1  # generated phonemes between the matches
            added = y - prev_y - 1  # given phonemes between the matches
            if removed or added:
                deltas[prev_x + 1] = added - removed
            prev_x, prev_y = x, y
        return deltas

    deltas = length_deltas(generated_phone, given_phone)

    adjusted_word2ph: list[int] = [0] * len(word2ph)
    current_generated_index = 0

    # Walk the generated phonemes entry by entry. Each phoneme contributes 1 to
    # its entry; a differing run that starts at this phoneme additionally
    # contributes its whole length change to the same entry.
    for entry_index, phoneme_count in enumerate(word2ph):
        for _ in range(phoneme_count):
            adjusted_word2ph[entry_index] += 1 + deltas.get(
                current_generated_index, 0
            )
            current_generated_index += 1

    # Phonemes appended after the last generated phoneme start one past the end
    # and are therefore never met by the loop above; credit them to the last entry.
    if adjusted_word2ph:
        adjusted_word2ph[-1] += deltas.get(current_generated_index, 0)

    assert len(given_phone) == sum(adjusted_word2ph), f"{len(given_phone)} != {sum(adjusted_word2ph)}"

    entry_total = len(adjusted_word2ph)

    # Every entry needs at least one phoneme: raise entries below the lower
    # bound and take the difference back from neighbours.
    for index in range(entry_total):
        if adjusted_word2ph[index] >= lower_bound:
            continue
        deficit = lower_bound - adjusted_word2ph[index]
        adjusted_word2ph[index] = lower_bound
        # Entries to the right first, nearest first.
        for neighbour in range(index + 1, entry_total):
            if adjusted_word2ph[neighbour] - deficit >= lower_bound:
                adjusted_word2ph[neighbour] -= deficit
                deficit = 0
                break
            deficit -= adjusted_word2ph[neighbour] - lower_bound
            adjusted_word2ph[neighbour] = lower_bound
        # Then entries to the left; these are already >= lower_bound.
        for neighbour in range(index - 1, -1, -1):
            if deficit == 0:
                break
            taken = min(deficit, adjusted_word2ph[neighbour] - lower_bound)
            adjusted_word2ph[neighbour] -= taken
            deficit -= taken

    # No entry may exceed the upper bound: cap it and hand the excess to neighbours.
    for index in range(entry_total):
        if adjusted_word2ph[index] <= upper_bound:
            continue
        excess = adjusted_word2ph[index] - upper_bound
        adjusted_word2ph[index] = upper_bound
        # Entries to the right first, nearest first.
        for neighbour in range(index + 1, entry_total):
            if adjusted_word2ph[neighbour] + excess <= upper_bound:
                adjusted_word2ph[neighbour] += excess
                excess = 0
                break
            excess -= upper_bound - adjusted_word2ph[neighbour]
            adjusted_word2ph[neighbour] = upper_bound
        # Then entries to the left; these are already <= upper_bound.
        for neighbour in range(index - 1, -1, -1):
            if excess == 0:
                break
            given_away = min(excess, upper_bound - adjusted_word2ph[neighbour])
            adjusted_word2ph[neighbour] += given_away
            excess -= given_away

    return [1] + adjusted_word2ph + [1]
|
|
|
|
|
|
def __g2phone_tone_wo_punct(text: str) -> list[tuple[str, int]]:
    """
    Return (phoneme, accent) pairs for `text`, with every punctuation mark dropped.

    The prosody symbol sequence from `__pyopenjtalk_g2p_prosody()` is consumed
    one symbol at a time:
    - `^` is only accepted as the very first symbol;
    - `$`, `?`, `_` and `#` close the current phrase: its tones are normalised by
      `__fix_phone_tone()` and the tone counter restarts at 0 (`$` and `?` are
      only accepted as the very last symbol);
    - `[` raises and `]` lowers the running tone by one;
    - anything else is a phoneme and is recorded with the running tone, with
      "cl" rewritten to "q".

    No pause symbol is emitted for punctuation; `__align_tones()` is what puts
    punctuation back. The moraic nasal is left as "N".

    Example (from the original documentation):
        "こんにちは、世界ー。。元気?!" →
        [('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]

    Args:
        text (str): Input text.

    Returns:
        list[tuple[str, int]]: Phoneme / accent pairs without punctuation.

    Raises:
        AssertionError: If `^`, `$` or `?` appears at an unexpected position.
    """

    symbols = __pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True)

    collected: list[tuple[str, int]] = []
    phrase: list[tuple[str, int]] = []
    tone = 0

    for position, symbol in enumerate(symbols):
        if symbol == "^":
            # Start-of-utterance marker.
            assert position == 0, "Unexpected ^"
            continue

        if symbol in ("$", "?", "_", "#"):
            # Phrase boundary: flush what has been gathered so far.
            collected.extend(__fix_phone_tone(phrase))
            if symbol in ("$", "?"):
                # End-of-utterance markers must be the final symbol.
                assert position == len(symbols) - 1, f"Unexpected {symbol}"
            phrase = []
            tone = 0
            continue

        if symbol == "[":
            tone += 1
            continue

        if symbol == "]":
            tone -= 1
            continue

        # An actual phoneme.
        phoneme = "q" if symbol == "cl" else symbol
        phrase.append((phoneme, tone))

    return collected
|
|
|
|
|
|
# Pre-compiled regexes used by `__pyopenjtalk_g2p_prosody()` to read single
# fields out of one full-context label string returned by `pyopenjtalk.make_label()`.
# A1: the (possibly negative) number between "/A:" and the following "+".
__PYOPENJTALK_G2P_PROSODY_A1_PATTERN = re.compile(r"/A:([0-9\-]+)\+")
# A2: the first run of digits enclosed by "+" on both sides.
__PYOPENJTALK_G2P_PROSODY_A2_PATTERN = re.compile(r"\+(\d+)\+")
# A3: the first run of digits between "+" and "/".
__PYOPENJTALK_G2P_PROSODY_A3_PATTERN = re.compile(r"\+(\d+)/")
# E3: the first run of digits between "!" and "_".
__PYOPENJTALK_G2P_PROSODY_E3_PATTERN = re.compile(r"!(\d+)_")
# F1: the run of digits between "/F:" and "_".
__PYOPENJTALK_G2P_PROSODY_F1_PATTERN = re.compile(r"/F:(\d+)_")
# P3: the shortest text between the first "-" and the next "+" (the current phoneme).
__PYOPENJTALK_G2P_PROSODY_P3_PATTERN = re.compile(r"\-(.*?)\+")
|
|
|
|
|
|
def __pyopenjtalk_g2p_prosody(
    text: str, drop_unvoiced_vowels: bool = True
) -> list[str]:
    """
    Extract a phoneme + prosody symbol sequence from full-context labels.

    Taken from ESPnet with essentially no changes; note that the moraic nasal
    is emitted as "N".
    ref: https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Symbols produced besides phonemes:
    - `^`  for a "sil" label at the first position;
    - `$` / `?` for a "sil" label at the last position, when its E3 field is 0 / 1;
    - `_`  for a "pau" label;
    - `#`, `]`, `[` after a phoneme, chosen from the A1/A2/A3/F1 fields of the
      current label and the A2 field of the next label (accent phrase border,
      pitch fall and pitch rise in the ESPnet reference implementation).

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): When True, the upper-case vowels
            A/E/I/O/U are lower-cased, i.e. treated as ordinary vowels.

    Returns:
        list[str]: Phoneme and prosody symbols in order.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
    """

    def read_int(pattern: re.Pattern[str], label: str) -> int:
        # Numeric value of the pattern's first group, or -50 when the label
        # does not contain the field.
        found = pattern.search(label)
        return -50 if found is None else int(found.group(1))

    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    last_position = len(labels) - 1

    symbols = []
    for position, label in enumerate(labels):
        # Current phoneme.
        phoneme = __PYOPENJTALK_G2P_PROSODY_P3_PATTERN.search(label).group(1)

        # Upper-case vowels are folded to their lower-case form on request.
        if drop_unvoiced_vowels and phoneme in "AEIOU":
            phoneme = phoneme.lower()

        if phoneme == "sil":
            # Silence is only expected at the two ends of the label list.
            assert position == 0 or position == last_position
            if position == 0:
                symbols.append("^")
            elif position == last_position:
                # E3 selects the closing symbol; other values add nothing.
                e3 = read_int(__PYOPENJTALK_G2P_PROSODY_E3_PATTERN, label)
                if e3 == 0:
                    symbols.append("$")
                elif e3 == 1:
                    symbols.append("?")
            continue

        if phoneme == "pau":
            symbols.append("_")
            continue

        symbols.append(phoneme)

        # Fields of the current label.
        a1 = read_int(__PYOPENJTALK_G2P_PROSODY_A1_PATTERN, label)
        a2 = read_int(__PYOPENJTALK_G2P_PROSODY_A2_PATTERN, label)
        a3 = read_int(__PYOPENJTALK_G2P_PROSODY_A3_PATTERN, label)
        f1 = read_int(__PYOPENJTALK_G2P_PROSODY_F1_PATTERN, label)

        # A2 of the label that follows.
        a2_next = read_int(
            __PYOPENJTALK_G2P_PROSODY_A2_PATTERN, labels[position + 1]
        )

        if a3 == 1 and a2_next == 1 and phoneme in "aeiouAEIOUNcl":
            symbols.append("#")
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            symbols.append("]")
        elif a2 == 1 and a2_next == 2:
            symbols.append("[")

    return symbols
|
|
|
|
|
|
def __fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
    """
    Normalise the tone values of one phrase so that they are only 0 or 1.

    Accepted inputs:
    - an empty phrase, returned unchanged (there is nothing to normalise);
    - a phrase whose tones are all 0, returned unchanged;
    - a phrase using exactly the tones {0, 1}, returned unchanged;
    - a phrase using exactly the tones {-1, 0}, shifted up by one.

    Example: [(a, 0), (i, -1), (u, -1)] → [(a, 1), (i, 0), (u, 0)]

    Args:
        phone_tone_list (list[tuple[str, int]]): Phoneme / tone pairs of a phrase.

    Returns:
        list[tuple[str, int]]: The pairs with tones restricted to 0 and 1.

    Raises:
        AssertionError: If the phrase uses a single tone value other than 0.
        ValueError: If the phrase uses any other combination of tone values.
    """

    # A phrase without phonemes (e.g. two boundaries in a row) carries no tones.
    if not phone_tone_list:
        return phone_tone_list

    tone_values = {tone for _, tone in phone_tone_list}

    if len(tone_values) == 1:
        assert tone_values == {0}, tone_values
        return phone_tone_list

    if tone_values == {0, 1}:
        return phone_tone_list

    if tone_values == {-1, 0}:
        # The phrase only ever went down: -1 becomes low, 0 becomes high.
        return [(letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list]

    raise ValueError(f"Unexpected tone values: {tone_values}")
|
|
|
|
|
|
def __handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
    """
    Resolve the long-vowel marks (`ー`) left in per-word phoneme lists.

    A `ー` at the start of a word takes the last phoneme of the nearest preceding
    non-empty word when that phoneme is in `VOWELS`. Otherwise (no preceding
    phoneme, or a non-vowel) the mark is assumed to be a dash that was mistaken
    for a long-vowel mark and becomes the phoneme `-`.

    Every other `ー` is replaced by the last character of the phoneme right
    before it in the same word.

    The input lists are modified in place and the same outer list is returned.
    Empty words are left untouched and are skipped when looking for the
    preceding phoneme.

    Args:
        sep_phonemes (list[list[str]]): Phoneme lists, one per word, that may
            still contain `ー`.

    Returns:
        list[list[str]]: The same lists with every `ー` resolved.
    """

    # Last phoneme of the most recent non-empty word, once one has been seen.
    preceding_phoneme = None

    for phonemes in sep_phonemes:
        if not phonemes:
            # Nothing to resolve, and nothing that could precede a later `ー`.
            continue

        if phonemes[0] == "ー":
            if preceding_phoneme is not None and preceding_phoneme in VOWELS:
                # Lengthen the vowel the previous word ended with.
                phonemes[0] = preceding_phoneme
            else:
                # Start of the text or after a non-vowel: treat it as a dash.
                phonemes[0] = "-"

        # Index 0 is resolved above, so every remaining mark has a predecessor.
        for position in range(1, len(phonemes)):
            if phonemes[position] == "ー":
                phonemes[position] = phonemes[position - 1][-1]

        preceding_phoneme = phonemes[-1]

    return sep_phonemes
|
|
|
|
|
|
# One or more characters from the Unicode range U+30A0-U+30FF (the Katakana block).
__KATAKANA_PATTERN = re.compile(r"[\u30A0-\u30FF]+")
# Alternation of every key of MORA_KATA_TO_MORA_PHONEMES, longest keys first so
# that a multi-character mora is matched before any shorter key it starts with.
__MORA_PATTERN = re.compile(
    "|".join(
        map(re.escape, sorted(MORA_KATA_TO_MORA_PHONEMES.keys(), key=len, reverse=True))
    )
)
# One word character followed by any number of long-vowel marks `ー`.
__LONG_PATTERN = re.compile(r"(\w)(ー*)")
|
|
|
|
|
|
def __kata_to_phoneme_list(text: str) -> list[str]:
    """
    Convert a (mostly) katakana string into a list of phoneme symbols.

    Behaviour:
    - If `text` consists only of characters found in `PUNCTUATIONS`, those
      characters are returned one per list element.
    - Otherwise `text` must match `__KATAKANA_PATTERN` in full.
    - Every mora found by `__MORA_PATTERN` is replaced by its consonant (if any)
      and vowel from `MORA_KATA_TO_MORA_PHONEMES`.
    - A character followed by `ー` marks is repeated once per mark. Marks at the
      very beginning have nothing to repeat and stay as `ー`; `__handle_long()`
      deals with them afterwards.

    Examples (from the original documentation):
        `ーーソーナノカーー` → ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
        `?` → ["?"]
        `!?!?!?!?!` → ["!", "?", "!", "?", "!", "?", "!", "?", "!"]

    Args:
        text (str): Katakana text, or punctuation.

    Returns:
        list[str]: Phoneme symbols.

    Raises:
        ValueError: If `text` is neither pure punctuation nor pure katakana.
    """

    # Punctuation (or a run of it) passes through character by character.
    if set(text) <= set(PUNCTUATIONS):
        return list(text)

    if __KATAKANA_PATTERN.fullmatch(text) is None:
        raise ValueError(f"Input must be katakana only: {text}")

    def expand_mora(match: re.Match[str]) -> str:
        # " consonant vowel", or just " vowel" for a mora without a consonant.
        consonant, vowel = MORA_KATA_TO_MORA_PHONEMES[match.group()]
        return f" {vowel}" if consonant is None else f" {consonant} {vowel}"

    def expand_long_marks(match: re.Match[str]) -> str:
        # The character itself plus one space-separated copy per `ー` after it.
        repeated = match.group(1)
        return " ".join([repeated] * (len(match.group(2)) + 1))

    spaced = __MORA_PATTERN.sub(expand_mora, text)
    spaced = __LONG_PATTERN.sub(expand_long_marks, spaced)

    return spaced.strip().split(" ")
|
|
|
|
|
|
def __align_tones(
    phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
) -> list[tuple[str, int]]:
    """
    Give every phoneme of a punctuation-preserving sequence its tone.

    `phones_with_punct` is walked from left to right while a cursor moves
    through `phone_tone_list`:
    - once the cursor has passed the end of `phone_tone_list`, every remaining
      phoneme gets tone 0;
    - a phoneme equal to the one under the cursor takes that tone and advances
      the cursor;
    - a phoneme found in `PUNCTUATIONS` gets tone 0 and leaves the cursor alone;
    - anything else is an error.

    Example:
        phones_with_punct:
            [".", "w", "a", ",", "s", "o", "."]
        phone_tone_list:
            [("w", 0), ("a", 1), ("s", 1), ("o", 0)]
        result:
            [(".", 0), ("w", 0), ("a", 1), (",", 0), ("s", 1), ("o", 0), (".", 0)]

    Args:
        phones_with_punct (list[str]): Phonemes including punctuation.
        phone_tone_list (list[tuple[str, int]]): Phoneme / tone pairs without
            punctuation.

    Returns:
        list[tuple[str, int]]: Phoneme / tone pairs including punctuation.

    Raises:
        ValueError: If a phoneme neither matches the pair under the cursor nor
            is punctuation.
    """

    aligned: list[tuple[str, int]] = []
    cursor = 0
    pair_total = len(phone_tone_list)

    for phone in phones_with_punct:
        if cursor >= pair_total:
            # No tone information left: everything that remains is tone 0.
            tone = 0
        elif phone == phone_tone_list[cursor][0]:
            # Same phoneme on both sides: take its tone and move on.
            tone = phone_tone_list[cursor][1]
            cursor += 1
        elif phone in PUNCTUATIONS:
            # Punctuation only exists on this side.
            tone = 0
        else:
            logger.debug(f"phones: {phones_with_punct}")
            logger.debug(f"phone_tone_list: {phone_tone_list}")
            logger.debug(f"result: {aligned}")
            logger.debug(f"tone_index: {cursor}")
            logger.debug(f"phone: {phone}")
            raise ValueError(f"Unexpected phone: {phone}")
        aligned.append((phone, tone))

    return aligned
|
|
|
|
|
|
def __distribute_phone(n_phone: int, n_word: int) -> list[int]:
    """
    Spread `n_phone` phonemes over `n_word` slots as evenly as possible.

    Slots are filled one phoneme at a time from left to right, round after
    round, so the counts differ by at most one and the larger counts come first.
    The result is computed directly: every slot gets `n_phone // n_word`
    phonemes and the first `n_phone % n_word` slots get one more.

    Args:
        n_phone (int): Number of phonemes to hand out.
        n_word (int): Number of slots.

    Returns:
        list[int]: Phoneme count per slot; `n_word` entries summing to `n_phone`.

    Raises:
        ValueError: If there are phonemes to hand out but no slot to put them in.
    """

    if n_phone <= 0:
        # Nothing to hand out: every slot stays at zero.
        return [0] * n_word

    if n_word <= 0:
        raise ValueError(f"Cannot distribute {n_phone} phonemes over {n_word} words")

    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
|
|
|
|
|
|
class YomiError(Exception):
    """
    Raised when OpenJTalk cannot provide a correct reading for part of the text.

    In this file it is raised by `text_to_sep_kata()` when a word that is not
    pure punctuation comes back with the reading `、` and `raise_yomi_error`
    is True.

    Guidance from the original author: let it be raised during the text
    processing step of training preprocessing, and suppress it elsewhere
    (the original text names an `ignore_yomi_error=True` option for that;
    NOTE(review): no such option is visible in this file — presumably it is
    the inverse of `raise_yomi_error`, confirm against the callers).
    """
|
|
|