Skip to content

Add Thai word list from ICU BreakIterator dictionary #879

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Dec 6, 2023
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"thai_dict",
"thai_family_names",
"thai_female_names",
"thai_icu",
"thai_male_names",
"thai_negations",
"thai_orst_words",
Expand Down Expand Up @@ -110,5 +111,6 @@ def corpus_db_path() -> str:
thai_words,
thai_wsd_dict,
)
from pythainlp.corpus.thai_icu import thai_icu
from pythainlp.corpus.volubilis import volubilis
from pythainlp.corpus.wikipedia_titles import wikipedia_titles
30 changes: 30 additions & 0 deletions pythainlp/corpus/thai_icu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Provides an optional word list from the International Components for Unicode (ICU) dictionary.
"""
from typing import FrozenSet

from pythainlp.corpus.common import get_corpus

# Module-level cache for the word list; stays None until thai_icu() loads it.
_THAI_ICU = None
# Corpus file name that thai_icu() passes to get_corpus().
_THAI_ICU_FILENAME = "thai_icu.txt"


def thai_icu() -> FrozenSet[str]:
    """
    Return a frozenset of words from the International Components for
    Unicode (ICU) dictionary.

    The data is at pythainlp/corpus/thai_icu.txt
    The word list has been prepared from the ICU Thai dictionary at:
    https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/thaidict.txt

    The corpus is read through ``get_corpus`` on the first call and the
    result is kept in the module-level ``_THAI_ICU`` variable; later calls
    return that same cached object.

    :return: :class:`frozenset` containing words in the Thai ICU dictionary.
    :rtype: :class:`frozenset`
    """
    global _THAI_ICU
    # Compare against None instead of relying on truthiness: an empty
    # collection is falsy, so ``if not _THAI_ICU`` would reload the corpus
    # on every call if the loaded word list happened to be empty.
    if _THAI_ICU is None:
        _THAI_ICU = get_corpus(_THAI_ICU_FILENAME)

    return _THAI_ICU
Loading