From ab4922b5292a1f0bb582f1875c4643ea32bc79e6 Mon Sep 17 00:00:00 2001
From: Wannaphong
Date: Sun, 17 Aug 2025 19:00:34 +0700
Subject: [PATCH 1/2] Add B-K/umt5-thai-g2p-v2-0.5k

---
 pythainlp/transliterate/core.py         |  5 ++++
 pythainlp/transliterate/umt5_thaig2p.py | 34 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 pythainlp/transliterate/umt5_thaig2p.py

diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py
index 315343661..c5ffa55e8 100644
--- a/pythainlp/transliterate/core.py
+++ b/pythainlp/transliterate/core.py
@@ -122,6 +122,9 @@ def transliterate(
         * *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
         * *thaig2p_v2* - Thai Grapheme-to-Phoneme,
           output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0
+        * *umt5_thaig2p* - Thai Grapheme-to-Phoneme,
+          output is IPA, powered by UMT5.\
+          https://huggingface.co/B-K/umt5-thai-g2p-v2-0.5k
 
     :Example:
     ::
@@ -174,6 +177,8 @@ def transliterate(
         from pythainlp.transliterate.iso_11940 import transliterate
     elif engine == "thaig2p_v2":
         from pythainlp.transliterate.thaig2p_v2 import transliterate
+    elif engine == "umt5_thaig2p":
+        from pythainlp.transliterate.umt5_thaig2p import transliterate
     else:  # use default engine: "thaig2p"
         from pythainlp.transliterate.thaig2p import transliterate
 
diff --git a/pythainlp/transliterate/umt5_thaig2p.py b/pythainlp/transliterate/umt5_thaig2p.py
new file mode 100644
index 000000000..f17479658
--- /dev/null
+++ b/pythainlp/transliterate/umt5_thaig2p.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+umt5-thai-g2p-v2-0.5k
+
+huggingface: https://huggingface.co/B-K/umt5-thai-g2p-v2-0.5k
+"""
+
+# Use a pipeline as a high-level helper
+from transformers import pipeline
+
+
+class Umt5ThaiG2P:
+    """
+    Latin transliteration of Thai words, using International Phonetic Alphabet
+    """
+
+    def __init__(self, device: str = "cpu"):
+        self.pipe = pipeline("text2text-generation", model="B-K/umt5-thai-g2p-v2-0.5k", device=device)
+
+    def g2p(self, text: str) -> str:
+        return self.pipe(text)[0]["generated_text"]
+
+
+_THAI_G2P = None
+
+
+def transliterate(text: str, device="cpu") -> str:
+    global _THAI_G2P
+    if _THAI_G2P is None:
+        _THAI_G2P = Umt5ThaiG2P(device=device)
+    return _THAI_G2P.g2p(text)

From c8d912a88ac3cfabb9c1a5cf7faf185841784c5f Mon Sep 17 00:00:00 2001
From: Wannaphong
Date: Sun, 17 Aug 2025 19:07:04 +0700
Subject: [PATCH 2/2] Add umt5_thaig2p test

---
 tests/extra/testx_transliterate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/extra/testx_transliterate.py b/tests/extra/testx_transliterate.py
index eb34e96bc..b9bba6f6f 100644
--- a/tests/extra/testx_transliterate.py
+++ b/tests/extra/testx_transliterate.py
@@ -140,6 +140,8 @@ def test_transliterate(self):
         self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
         self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
         self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
+        self.assertIsNotNone(transliterate("คน", engine="umt5_thaig2p"))
+        self.assertIsNotNone(transliterate("แมว", engine="umt5_thaig2p"))
         self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
         self.assertIsNotNone(transliterate("แมว", engine="tltk_g2p"))
         self.assertIsNotNone(transliterate("คน", engine="tltk_ipa"))