From 2f9814cc96f2da58ea4f60d1ad9621a256186580 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:16:17 +0700 Subject: [PATCH 1/9] Add Thai character mapping and analysis function Added a comprehensive mapping of Thai characters to their descriptive names, including consonants, vowels, tone marks, punctuation, and digits. Implemented the analyze_thai_text function to analyze Thai text and return a list of classified characters. --- pythainlp/util/thai.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index aea0bfbb2..7349ff7a3 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -8,6 +8,7 @@ import string from typing import Tuple +from collections import defaultdict from pythainlp import ( thai_above_vowels, @@ -26,6 +27,53 @@ _TH_FIRST_CHAR_ASCII = 3584 _TH_LAST_CHAR_ASCII = 3711 +# A comprehensive map of Thai characters to their descriptive names. 
+THAI_CHAR_NAMES = { + # Consonants + **{char: char for char in thai_consonants}, + # Vowels and Signs + "\u0e24": "ฤ", + "\u0e26": "ฦ", + "\u0e30": "สระ อะ", + "\u0e31": "ไม้หันอากาศ", + "\u0e32": "สระ อา", + "\u0e33": "สระ อำ", + "\u0e34": "สระ อิ", + "\u0e35": "สระ อี", + "\u0e36": "สระ อึ", + "\u0e37": "สระ อือ", + "\u0e38": "สระ อุ", + "\u0e39": "สระ อู", + "\u0e40": "สระ เอ", + "\u0e41": "สระ แอ", + "\u0e42": "สระ โอ", + "\u0e43": "สระ ใอ", + "\u0e44": "สระ ไอ", + "\u0e45": "ลากข้างยาว", + "\u0e4d": "นฤคหิต", + "\u0e47": "ไม้ไต่คู้", + # Tone Marks + "\u0e48": "ไม้เอก", + "\u0e49": "ไม้โท", + "\u0e4a": "ไม้ตรี", + "\u0e4b": "ไม้จัตวา", + # Other Signs + "\u0e2f": "ไปยาลน้อย", + "\u0e3a": "พินทุ", + "\u0e46": "ไม้ยมก", + "\u0e4c": "การันต์", + "\u0e4d": "นฤคหิต", + "\u0e4e": "ยามักการ", + # Punctuation + "\u0e4f": "ฟองมัน", + "\u0e5a": "อังคั่นคู่", + "\u0e5b": "โคมุต", + # Digits + **{char: char for char in thai_digits}, + # Symbol + "\u0e3f": "฿", +} + def isthaichar(ch: str) -> bool: """Check if a character is a Thai character. @@ -269,3 +317,39 @@ def count_thai_chars(text: str) -> dict: else: _dict["non_thai"] += 1 return _dict + + +def analyze_thai_text(text: str) -> list[dict]: + """ + Analyzes a string of Thai text and returns a list of dictionaries, + where each dictionary represents a single classified character from the text. + + The function processes the text character by character and maps each Thai + character to its descriptive name or itself (for consonants and digits). + Non-Thai characters are categorized as "other not Thai". + + :param str text: The Thai text string to be analyzed. + :rtype: list[dict] + :return: A list of dictionaries, with each item containing + a single character and a count of 1. 
+ + Examples: + >>> analyze_thai_text("คนดี") + [{'ค': 1}, {'น': 1}, {'ด': 1}, {'สระ อี': 1}] + + >>> analyze_thai_text("เล่น") + [{'สระ เอ': 1}, {'ล': 1}, {'ไม้เอก': 1}, {'น': 1}] + """ + results = defaultdict(int) + + # Iterate over each character in the input string + for char in text: + # Check if the character is in our mapping + if char in THAI_CHAR_NAMES: + name = THAI_CHAR_NAMES[char] + results[name]+=1 + else: + # If the character is not a known Thai character, classify it as character + results[char]+=1 + + return dict(results) From f7c97b3983ddddf34aad747d4b77e95074e612ee Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:17:41 +0700 Subject: [PATCH 2/9] Document analyze_thai_text function Added documentation for analyze_thai_text function. --- docs/api/util.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/api/util.rst b/docs/api/util.rst index 9a1554707..1b9ee1bdb 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -7,6 +7,11 @@ The :mod:`pythainlp.util` module serves as a treasure trove of utility functions Modules ------- +.. autofunction:: analyze_thai_text + :noindex: + + Analyzes a string of Thai text and returns a list of dictionaries, where each dictionary represents a single classified character from the text. + .. 
autofunction:: abbreviation_to_full_text :noindex: From 2e66c6379ab72304f8a0b1812981043d489784b4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:19:00 +0700 Subject: [PATCH 3/9] Add analyze_thai_text to module exports --- pythainlp/util/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 7613257a6..d9f832c3f 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -69,6 +69,7 @@ "tone_detector", "tone_to_spelling", "words_to_num", + "analyze_thai_text", ] from pythainlp.util import spell_words @@ -121,6 +122,7 @@ isthai, isthaichar, thai_word_tone_detector, + analyze_thai_text, ) from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date from pythainlp.util.thaiwordcheck import is_native_thai From 251501f0dc580bf99483ca24d0a9ea9295211f1c Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:21:47 +0700 Subject: [PATCH 4/9] Add tests for analyze_thai_text function Added unit tests for analyze_thai_text function. 
--- tests/core/test_util.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index 9c46ad2aa..cf1de3048 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -67,6 +67,7 @@ tone_detector, words_to_num, spelling, + analyze_thai_text, ) from pythainlp.util.morse import morse_decode, morse_encode @@ -874,3 +875,13 @@ def test_longest_common_subsequence(self): self.assertEqual(longest_common_subsequence("", "ABC"), "") self.assertEqual(longest_common_subsequence("ABC", ""), "") self.assertEqual(longest_common_subsequence("", ""), "") + + def test_analyze_thai_text(self): + self.assertEqual( + analyze_thai_text("คนดี"), + [{"ค": 1}, {"น": 1}, {"ด": 1}, {"สระ อี": 1}] + ) + self.assertEqual( + analyze_thai_text("เล่น"), + [{'สระ เอ': 1}, {'ล': 1}, {'ไม้เอก': 1}, {'น': 1}] + ) From 711b4081fe5ce90ccb787fdf7345a85992653f0e Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:26:28 +0700 Subject: [PATCH 5/9] Change analyze_thai_text return type to dict Updated the analyze_thai_text function to return a single dictionary instead of a list of dictionaries, reflecting changes in the return type and documentation. --- pythainlp/util/thai.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index 7349ff7a3..3564245ac 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -319,26 +319,25 @@ def count_thai_chars(text: str) -> dict: return _dict -def analyze_thai_text(text: str) -> list[dict]: +def analyze_thai_text(text: str) -> dict: """ - Analyzes a string of Thai text and returns a list of dictionaries, + Analyzes a string of Thai text and returns a dictionaries, where each dictionary represents a single classified character from the text. 
The function processes the text character by character and maps each Thai character to its descriptive name or itself (for consonants and digits). - Non-Thai characters are categorized as "other not Thai". :param str text: The Thai text string to be analyzed. :rtype: list[dict] - :return: A list of dictionaries, with each item containing + :return: A dictionary, with each item containing a single character and a count of 1. Examples: >>> analyze_thai_text("คนดี") - [{'ค': 1}, {'น': 1}, {'ด': 1}, {'สระ อี': 1}] + {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1} >>> analyze_thai_text("เล่น") - [{'สระ เอ': 1}, {'ล': 1}, {'ไม้เอก': 1}, {'น': 1}] + {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1} """ results = defaultdict(int) From d5a3fdac2ff196b35ec1835eda31b2d7a93f02f3 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:27:58 +0700 Subject: [PATCH 6/9] Update util.rst --- docs/api/util.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index 1b9ee1bdb..ef38bcd44 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -10,7 +10,7 @@ Modules .. autofunction:: analyze_thai_text :noindex: - Analyzes a string of Thai text and returns a list of dictionaries, where each dictionary represents a single classified character from the text. + Analyzes a string of Thai text and returns a dictionary that maps each classified character (its descriptive name, or the character itself) to the number of times it occurs in the text. .. autofunction:: abbreviation_to_full_text :noindex: From 5d8651caa31c4fe88d3f8dd8f3e9389b2e2cb15e Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:28:20 +0700 Subject: [PATCH 7/9] Fix typo in analyze_thai_text docstring Corrected the description in the analyze_thai_text function. 
--- pythainlp/util/thai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index 3564245ac..9ad5f3e38 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -322,7 +322,7 @@ def count_thai_chars(text: str) -> dict: def analyze_thai_text(text: str) -> dict: """ Analyzes a string of Thai text and returns a dictionaries, - where each dictionary represents a single classified character from the text. + where each key is a classified character and each value is its count in the text. The function processes the text character by character and maps each Thai character to its descriptive name or itself (for consonants and digits). From 769db1b79889e31a32ad0622bc56501eba8a4081 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:29:24 +0700 Subject: [PATCH 8/9] Fix expected output format in Thai text analysis tests --- tests/core/test_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/test_util.py b/tests/core/test_util.py index cf1de3048..7d01fb174 100644 --- a/tests/core/test_util.py +++ b/tests/core/test_util.py @@ -879,9 +879,9 @@ def test_longest_common_subsequence(self): def test_analyze_thai_text(self): self.assertEqual( analyze_thai_text("คนดี"), - [{"ค": 1}, {"น": 1}, {"ด": 1}, {"สระ อี": 1}] + {"ค": 1, "น": 1, "ด": 1, "สระ อี": 1} ) self.assertEqual( analyze_thai_text("เล่น"), - [{'สระ เอ': 1}, {'ล': 1}, {'ไม้เอก': 1}, {'น': 1}] + {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1} ) From 8b779c88d2ea2bbc015aeef99a90a63b84c451db Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 29 Sep 2025 00:47:26 +0700 Subject: [PATCH 9/9] Update thai.py --- pythainlp/util/thai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index 9ad5f3e38..a0fa2a5b1 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -62,7 +62,6 @@ "\u0e3a": "พินทุ", "\u0e46": "ไม้ยมก", 
"\u0e4c": "การันต์", - "\u0e4d": "นฤคหิต", "\u0e4e": "ยามักการ", # Punctuation "\u0e4f": "ฟองมัน",