File size: 1,223 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from typing import List
from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
from marker.settings import settings
def langs_to_ids(langs: List[str]):
    """Return the language tokens for the distinct entries of ``langs``.

    Duplicates are dropped through a ``set``, so the order in which the
    languages reach the tokenizer is not guaranteed to match ``langs``.

    Args:
        langs: Language identifiers; repeated entries are allowed.

    Returns:
        The second element of the pair returned by the Surya recognition
        tokenizer when called with an empty text and the de-duplicated
        languages.
    """
    deduped = list(set(langs))
    # Only the language part of the tokenizer output is needed; the text
    # part (tokenization of the empty string) is discarded.
    _text_tokens, lang_ids = lang_tokenize("", deduped)
    return lang_ids
def replace_langs_with_codes(langs):
    """Replace language names in ``langs`` with the active OCR engine's codes.

    The list is modified in place and is also returned. Entries that are not
    found in the engine's name-to-code table are left untouched, so values
    that are already codes (or are unknown) pass through unchanged.

    When ``settings.OCR_ENGINE`` is ``"surya"`` the Surya table is used and
    names are matched after title-casing; for any other engine value the
    Tesseract table is used and names are matched exactly as given.

    Args:
        langs: Mutable list of language names and/or codes.

    Returns:
        The same list object, with recognised names swapped for codes.
    """
    if settings.OCR_ENGINE == "surya":
        for i, lang in enumerate(langs):
            name = lang.title()
            if name in LANGUAGE_TO_CODE:
                langs[i] = LANGUAGE_TO_CODE[name]
    else:
        for i, lang in enumerate(langs):
            # Check the same table that is indexed below. Testing against
            # Surya's LANGUAGE_TO_CODE here raised KeyError for names only
            # Surya knows and skipped names only Tesseract knows.
            if lang in LANGUAGE_TO_TESSERACT_CODE:
                langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
    return langs
def validate_langs(langs):
    """Check that every entry of ``langs`` is a code the OCR engine knows.

    When ``settings.OCR_ENGINE`` is ``"surya"`` the codes are checked against
    the Surya code table; for any other engine value they are checked against
    the Tesseract code table.

    Args:
        langs: Iterable of language codes to check.

    Raises:
        ValueError: On the first entry that is missing from the active
            engine's code table.
    """
    if settings.OCR_ENGINE != "surya":
        # Every non-Surya engine setting is validated as Tesseract.
        for lang in langs:
            if lang in TESSERACT_CODE_TO_LANGUAGE:
                continue
            raise ValueError(f"Invalid language code {lang} for Tesseract")
        return

    for lang in langs:
        if lang in CODE_TO_LANGUAGE:
            continue
        raise ValueError(f"Invalid language code {lang} for Surya OCR")