# -*- coding: utf-8 -*-
from enum import Enum
from typing import List

from fastapi import APIRouter
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pythainlp.tokenize import (
    sent_tokenize as py_sent_tokenize,
    subword_tokenize as py_subword_tokenize,
    word_tokenize as py_word_tokenize,
)

router = APIRouter()
class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"
class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []
@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Word tokenization engine (default: newmm).
    """
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
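

# Illustrative request/response for the endpoint above (the sample text and
# its segmentation are assumptions for documentation, not captured output):
#   POST /word_tokenize?text=ผมรักคุณ&engine=newmm
#   -> {"words": ["ผม", "รัก", "คุณ"]}
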
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Subword tokenization engine (default: tcc).
    """
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
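

# Illustrative request/response (the TCC segmentation shown is an assumption
# based on PyThaiNLP's documented behavior, not captured output):
#   POST /subword_tokenize?text=ประเทศไทย&engine=tcc
#   -> {"subwords": ["ป", "ระ", "เท", "ศ", "ไท", "ย"]}
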
@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Sentence segmentation for Thai text.

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Sentence tokenization engine (default: crfcut).
    """
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
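

# A minimal sketch of exercising the router end to end, assuming the optional
# `httpx` dependency of fastapi.testclient is installed; the FastAPI app below
# exists only for this demo and is not part of the service wiring.
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)

    # `text` is a plain `str` parameter on a POST route, so FastAPI reads it
    # from the query string rather than the request body.
    resp = client.post("/sent_tokenize", params={"text": "สวัสดีครับ วันนี้อากาศดีมาก"})
    print(resp.json())  # e.g. {"sents": [...]} as segmented by crfcut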