# -*- coding: utf-8 -*-
from fastapi import APIRouter
from fastapi.responses import JSONResponse
from pythainlp.tokenize import (
    word_tokenize as py_word_tokenize,
    subword_tokenize as py_subword_tokenize,
    sent_tokenize as py_sent_tokenize
)
from enum import Enum
from typing import List
from pydantic import BaseModel

router = APIRouter()


class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"


class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []

@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for Thai text.

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Word tokenization engine (default: newmm)
    """
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
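
# Illustrative example (an assumption, not asserted by this module): with the
# default "newmm" engine, POST /word_tokenize?text=ผมรักคุณ is expected to
# return {"words": ["ผม", "รัก", "คุณ"]}; the exact segmentation depends on
# the installed PyThaiNLP version.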


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for Thai text.

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Subword tokenization engine (default: tcc)
    """
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
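
# Illustrative example (an assumption, not asserted by this module): with the
# default "tcc" engine, POST /subword_tokenize?text=ประเทศไทย is expected to
# return {"subwords": ["ป", "ระ", "เท", "ศ", "ไท", "ย"]}; the exact character
# clusters depend on the installed PyThaiNLP version.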


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Thai sentence segmentation

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Sentence tokenization engine (default: crfcut)
    """
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
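

# ---------------------------------------------------------------------------
# Illustrative usage (a minimal sketch, not part of the original router): this
# mounts the router on a throwaway FastAPI app and exercises the three
# endpoints with Starlette's TestClient. The sample Thai inputs and the exact
# tokens returned depend on the installed PyThaiNLP version and models.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)

    # Word segmentation with the default "newmm" engine.
    print(client.post("/word_tokenize", params={"text": "ผมรักคุณ"}).json())

    # Subword segmentation with the default "tcc" engine.
    print(client.post("/subword_tokenize", params={"text": "ประเทศไทย"}).json())

    # Sentence segmentation with the default "crfcut" engine.
    print(client.post("/sent_tokenize", params={"text": "ฉันไปโรงเรียน วันนี้อากาศดีมาก"}).json())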