"""
File: lang_codes.py

Description: Language codes (e.g. used by Tesseract for OCR)

Author: Didier Guillevic

Date: 2024-11-23
"""

# Mapping from a human-readable language (or module) name to the
# corresponding Tesseract OCR code.
#
# Notes:
# - 'equ' and 'osd' are detection modules rather than languages, as their
#   display names state.
# - Keys are kept exactly as originally written (including the spelling
#   "Cyrilic"), since callers may look entries up by these exact strings.
# - Insertion order follows the alphabetical order of the codes.
tesseract_lang_codes = {
    'Afrikaans': 'afr',
    'Amharic': 'amh',
    'Arabic': 'ara',
    'Assamese': 'asm',
    'Azerbaijani': 'aze',
    'Azerbaijani - Cyrilic': 'aze_cyrl',
    'Belarusian': 'bel',
    'Bengali': 'ben',
    'Tibetan': 'bod',
    'Bosnian': 'bos',
    'Breton': 'bre',
    'Bulgarian': 'bul',
    'Catalan; Valencian': 'cat',
    'Cebuano': 'ceb',
    'Czech': 'ces',
    'Chinese - Simplified': 'chi_sim',
    'Chinese - Traditional': 'chi_tra',
    'Cherokee': 'chr',
    'Corsican': 'cos',
    'Welsh': 'cym',
    'Danish': 'dan',
    'Danish - Fraktur (contrib)': 'dan_frak',
    'German': 'deu',
    'German - Fraktur (contrib)': 'deu_frak',
    'German (Fraktur Latin)': 'deu_latf',
    'Dzongkha': 'dzo',
    'Greek, Modern (1453-)': 'ell',
    'English': 'eng',
    'English, Middle (1100-1500)': 'enm',
    'Esperanto': 'epo',
    'Math / equation detection module': 'equ',
    'Estonian': 'est',
    'Basque': 'eus',
    'Faroese': 'fao',
    'Persian': 'fas',
    'Filipino (old - Tagalog)': 'fil',
    'Finnish': 'fin',
    'French': 'fra',
    'German - Fraktur (now deu_latf)': 'frk',
    'French, Middle (ca.1400-1600)': 'frm',
    'Western Frisian': 'fry',
    'Scottish Gaelic': 'gla',
    'Irish': 'gle',
    'Galician': 'glg',
    'Greek, Ancient (to 1453) (contrib)': 'grc',
    'Gujarati': 'guj',
    'Haitian; Haitian Creole': 'hat',
    'Hebrew': 'heb',
    'Hindi': 'hin',
    'Croatian': 'hrv',
    'Hungarian': 'hun',
    'Armenian': 'hye',
    'Inuktitut': 'iku',
    'Indonesian': 'ind',
    'Icelandic': 'isl',
    'Italian': 'ita',
    'Italian - Old': 'ita_old',
    'Javanese': 'jav',
    'Japanese': 'jpn',
    'Kannada': 'kan',
    'Georgian': 'kat',
    'Georgian - Old': 'kat_old',
    'Kazakh': 'kaz',
    'Central Khmer': 'khm',
    'Kirghiz; Kyrgyz': 'kir',
    'Kurmanji (Kurdish - Latin Script)': 'kmr',
    'Korean': 'kor',
    'Korean (vertical)': 'kor_vert',
    'Kurdish (Arabic Script)': 'kur',
    'Lao': 'lao',
    'Latin': 'lat',
    'Latvian': 'lav',
    'Lithuanian': 'lit',
    'Luxembourgish': 'ltz',
    'Malayalam': 'mal',
    'Marathi': 'mar',
    'Macedonian': 'mkd',
    'Maltese': 'mlt',
    'Mongolian': 'mon',
    'Maori': 'mri',
    'Malay': 'msa',
    'Burmese': 'mya',
    'Nepali': 'nep',
    'Dutch; Flemish': 'nld',
    'Norwegian': 'nor',
    'Occitan (post 1500)': 'oci',
    'Oriya': 'ori',
    'Orientation and script detection module': 'osd',
    'Panjabi; Punjabi': 'pan',
    'Polish': 'pol',
    'Portuguese': 'por',
    'Pushto; Pashto': 'pus',
    'Quechua': 'que',
    'Romanian; Moldavian; Moldovan': 'ron',
    'Russian': 'rus',
    'Sanskrit': 'san',
    'Sinhala; Sinhalese': 'sin',
    'Slovak': 'slk',
    'Slovak - Fraktur (contrib)': 'slk_frak',
    'Slovenian': 'slv',
    'Sindhi': 'snd',
    'Spanish; Castilian': 'spa',
    'Spanish; Castilian - Old': 'spa_old',
    'Albanian': 'sqi',
    'Serbian': 'srp',
    'Serbian - Latin': 'srp_latn',
    'Sundanese': 'sun',
    'Swahili': 'swa',
    'Swedish': 'swe',
    'Syriac': 'syr',
    'Tamil': 'tam',
    'Tatar': 'tat',
    'Telugu': 'tel',
    'Tajik': 'tgk',
    'Tagalog (new - Filipino)': 'tgl',
    'Thai': 'tha',
    'Tigrinya': 'tir',
    'Tonga': 'ton',
    'Turkish': 'tur',
    'Uighur; Uyghur': 'uig',
    'Ukrainian': 'ukr',
    'Urdu': 'urd',
    'Uzbek': 'uzb',
    'Uzbek - Cyrilic': 'uzb_cyrl',
    'Vietnamese': 'vie',
    'Yiddish': 'yid',
    'Yoruba': 'yor',
}