# -*- coding: utf-8 -*-
from enum import Enum
from typing import List

from fastapi import APIRouter
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pythainlp.tokenize import (
    word_tokenize as py_word_tokenize,
    subword_tokenize as py_subword_tokenize,
    sent_tokenize as py_sent_tokenize,
)

router = APIRouter()


class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"


class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []


@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Word tokenize engine (default is newmm).
    """
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine.value)},
        media_type="application/json; charset=utf-8",
    )


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Subword tokenize engine (default is tcc).
    """
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine.value)},
        media_type="application/json; charset=utf-8",
    )


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Sentence segmentation for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Sentence tokenize engine (default is crfcut).
    """
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine.value)},
        media_type="application/json; charset=utf-8",
    )
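

# ---------------------------------------------------------------------------
# Usage sketch (assumption: this router lives in a module importable as
# ``routers.tokenize`` inside a larger FastAPI project; adjust the import
# path and prefix to match the actual layout). Because ``text`` and
# ``engine`` are declared as simple parameters, FastAPI exposes them as
# query parameters on the POST endpoints, so they are passed via ``params``
# when testing with FastAPI's TestClient:
#
#   from fastapi import FastAPI
#   from fastapi.testclient import TestClient
#   from routers.tokenize import router
#
#   app = FastAPI()
#   app.include_router(router, prefix="/tokenize")
#
#   client = TestClient(app)
#   resp = client.post(
#       "/tokenize/word_tokenize",
#       params={"text": "ผมรักภาษาไทย", "engine": "newmm"},
#   )
#   print(resp.json())  # e.g. {"words": ["ผม", "รัก", "ภาษาไทย"]}
# ---------------------------------------------------------------------------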