""" File: lang_codes.py Description: Language codes (e.g. used by tessearct for OCR) Author: Didier Guillevic Date: 2024-11-23 """ # Tesseract language codes: # - https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html # - https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes tesseract_lang_codes = { 'Afrikaans': 'afr', 'Amharic': 'amh', 'Arabic': 'ara', 'Assamese': 'asm', 'Azerbaijani': 'aze', 'Azerbaijani - Cyrilic': 'aze_cyrl', 'Belarusian': 'bel', 'Bengali': 'ben', 'Tibetan': 'bod', 'Bosnian': 'bos', 'Breton': 'bre', 'Bulgarian': 'bul', 'Catalan; Valencian': 'cat', 'Cebuano': 'ceb', 'Czech': 'ces', 'Chinese - Simplified': 'chi_sim', 'Chinese - Traditional': 'chi_tra', 'Cherokee': 'chr', 'Corsican': 'cos', 'Welsh': 'cym', 'Danish': 'dan', 'Danish - Fraktur (contrib)': 'dan_frak', 'German': 'deu', 'German - Fraktur (contrib)': 'deu_frak', 'German (Fraktur Latin)': 'deu_latf', 'Dzongkha': 'dzo', 'Greek, Modern (1453-)': 'ell', 'English': 'eng', 'English, Middle (1100-1500)': 'enm', 'Esperanto': 'epo', 'Math / equation detection module': 'equ', 'Estonian': 'est', 'Basque': 'eus', 'Faroese': 'fao', 'Persian': 'fas', 'Filipino (old - Tagalog)': 'fil', 'Finnish': 'fin', 'French': 'fra', 'German - Fraktur (now deu_latf)': 'frk', 'French, Middle (ca.1400-1600)': 'frm', 'Western Frisian': 'fry', 'Scottish Gaelic': 'gla', 'Irish': 'gle', 'Galician': 'glg', 'Greek, Ancient (to 1453) (contrib)': 'grc', 'Gujarati': 'guj', 'Haitian; Haitian Creole': 'hat', 'Hebrew': 'heb', 'Hindi': 'hin', 'Croatian': 'hrv', 'Hungarian': 'hun', 'Armenian': 'hye', 'Inuktitut': 'iku', 'Indonesian': 'ind', 'Icelandic': 'isl', 'Italian': 'ita', 'Italian - Old': 'ita_old', 'Javanese': 'jav', 'Japanese': 'jpn', 'Kannada': 'kan', 'Georgian': 'kat', 'Georgian - Old': 'kat_old', 'Kazakh': 'kaz', 'Central Khmer': 'khm', 'Kirghiz; Kyrgyz': 'kir', 'Kurmanji (Kurdish - Latin Script)': 'kmr', 'Korean': 'kor', 'Korean (vertical)': 'kor_vert', 'Kurdish (Arabic Script)': 'kur', 'Lao': 'lao', 'Latin': 'lat', 'Latvian': 'lav', 'Lithuanian': 'lit', 'Luxembourgish': 'ltz', 'Malayalam': 'mal', 'Marathi': 'mar', 'Macedonian': 'mkd', 'Maltese': 'mlt', 'Mongolian': 'mon', 'Maori': 'mri', 'Malay': 'msa', 'Burmese': 'mya', 'Nepali': 'nep', 'Dutch; Flemish': 'nld', 'Norwegian': 'nor', 'Occitan (post 1500)': 'oci', 'Oriya': 'ori', 'Orientation and script detection module': 'osd', 'Panjabi; Punjabi': 'pan', 'Polish': 'pol', 'Portuguese': 'por', 'Pushto; Pashto': 'pus', 'Quechua': 'que', 'Romanian; Moldavian; Moldovan': 'ron', 'Russian': 'rus', 'Sanskrit': 'san', 'Sinhala; Sinhalese': 'sin', 'Slovak': 'slk', 'Slovak - Fraktur (contrib)': 'slk_frak', 'Slovenian': 'slv', 'Sindhi': 'snd', 'Spanish; Castilian': 'spa', 'Spanish; Castilian - Old': 'spa_old', 'Albanian': 'sqi', 'Serbian': 'srp', 'Serbian - Latin': 'srp_latn', 'Sundanese': 'sun', 'Swahili': 'swa', 'Swedish': 'swe', 'Syriac': 'syr', 'Tamil': 'tam', 'Tatar': 'tat', 'Telugu': 'tel', 'Tajik': 'tgk', 'Tagalog (new - Filipino)': 'tgl', 'Thai': 'tha', 'Tigrinya': 'tir', 'Tonga': 'ton', 'Turkish': 'tur', 'Uighur; Uyghur': 'uig', 'Ukrainian': 'ukr', 'Urdu': 'urd', 'Uzbek': 'uzb', 'Uzbek - Cyrilic': 'uzb_cyrl', 'Vietnamese': 'vie', 'Yiddish': 'yid', 'Yoruba': 'yor' }