# Didier's picture
# Upload lang_codes.py
# 2cad216 verified
"""
File: lang_codes.py
Description: Language codes (e.g. used by Tesseract for OCR)
Author: Didier Guillevic
Date: 2024-11-23
"""
# Mapping from a human-readable language label to the code Tesseract expects.
#
# References:
# - https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
# - https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
#
# Notes:
# - Entries are listed in alphabetical order of the code (the value), not of
#   the label (the key).
# - "equ" and "osd" are labelled as detection modules rather than languages.
# - Labels are runtime strings and are kept exactly as originally spelled
#   (e.g. "Cyrilic"), since other code may look entries up by label.
tesseract_lang_codes = {
    "Afrikaans": "afr",
    "Amharic": "amh",
    "Arabic": "ara",
    "Assamese": "asm",
    "Azerbaijani": "aze",
    "Azerbaijani - Cyrilic": "aze_cyrl",
    "Belarusian": "bel",
    "Bengali": "ben",
    "Tibetan": "bod",
    "Bosnian": "bos",
    "Breton": "bre",
    "Bulgarian": "bul",
    "Catalan; Valencian": "cat",
    "Cebuano": "ceb",
    "Czech": "ces",
    "Chinese - Simplified": "chi_sim",
    "Chinese - Traditional": "chi_tra",
    "Cherokee": "chr",
    "Corsican": "cos",
    "Welsh": "cym",
    "Danish": "dan",
    "Danish - Fraktur (contrib)": "dan_frak",
    "German": "deu",
    "German - Fraktur (contrib)": "deu_frak",
    "German (Fraktur Latin)": "deu_latf",
    "Dzongkha": "dzo",
    "Greek, Modern (1453-)": "ell",
    "English": "eng",
    "English, Middle (1100-1500)": "enm",
    "Esperanto": "epo",
    "Math / equation detection module": "equ",
    "Estonian": "est",
    "Basque": "eus",
    "Faroese": "fao",
    "Persian": "fas",
    "Filipino (old - Tagalog)": "fil",
    "Finnish": "fin",
    "French": "fra",
    "German - Fraktur (now deu_latf)": "frk",
    "French, Middle (ca.1400-1600)": "frm",
    "Western Frisian": "fry",
    "Scottish Gaelic": "gla",
    "Irish": "gle",
    "Galician": "glg",
    "Greek, Ancient (to 1453) (contrib)": "grc",
    "Gujarati": "guj",
    "Haitian; Haitian Creole": "hat",
    "Hebrew": "heb",
    "Hindi": "hin",
    "Croatian": "hrv",
    "Hungarian": "hun",
    "Armenian": "hye",
    "Inuktitut": "iku",
    "Indonesian": "ind",
    "Icelandic": "isl",
    "Italian": "ita",
    "Italian - Old": "ita_old",
    "Javanese": "jav",
    "Japanese": "jpn",
    "Kannada": "kan",
    "Georgian": "kat",
    "Georgian - Old": "kat_old",
    "Kazakh": "kaz",
    "Central Khmer": "khm",
    "Kirghiz; Kyrgyz": "kir",
    "Kurmanji (Kurdish - Latin Script)": "kmr",
    "Korean": "kor",
    "Korean (vertical)": "kor_vert",
    "Kurdish (Arabic Script)": "kur",
    "Lao": "lao",
    "Latin": "lat",
    "Latvian": "lav",
    "Lithuanian": "lit",
    "Luxembourgish": "ltz",
    "Malayalam": "mal",
    "Marathi": "mar",
    "Macedonian": "mkd",
    "Maltese": "mlt",
    "Mongolian": "mon",
    "Maori": "mri",
    "Malay": "msa",
    "Burmese": "mya",
    "Nepali": "nep",
    "Dutch; Flemish": "nld",
    "Norwegian": "nor",
    "Occitan (post 1500)": "oci",
    "Oriya": "ori",
    "Orientation and script detection module": "osd",
    "Panjabi; Punjabi": "pan",
    "Polish": "pol",
    "Portuguese": "por",
    "Pushto; Pashto": "pus",
    "Quechua": "que",
    "Romanian; Moldavian; Moldovan": "ron",
    "Russian": "rus",
    "Sanskrit": "san",
    "Sinhala; Sinhalese": "sin",
    "Slovak": "slk",
    "Slovak - Fraktur (contrib)": "slk_frak",
    "Slovenian": "slv",
    "Sindhi": "snd",
    "Spanish; Castilian": "spa",
    "Spanish; Castilian - Old": "spa_old",
    "Albanian": "sqi",
    "Serbian": "srp",
    "Serbian - Latin": "srp_latn",
    "Sundanese": "sun",
    "Swahili": "swa",
    "Swedish": "swe",
    "Syriac": "syr",
    "Tamil": "tam",
    "Tatar": "tat",
    "Telugu": "tel",
    "Tajik": "tgk",
    "Tagalog (new - Filipino)": "tgl",
    "Thai": "tha",
    "Tigrinya": "tir",
    "Tonga": "ton",
    "Turkish": "tur",
    "Uighur; Uyghur": "uig",
    "Ukrainian": "ukr",
    "Urdu": "urd",
    "Uzbek": "uzb",
    "Uzbek - Cyrilic": "uzb_cyrl",
    "Vietnamese": "vie",
    "Yiddish": "yid",
    "Yoruba": "yor",
}