Skip to content

Commit de098f3

Browse files
authored
Merge pull request #870 from konbraphat51/corpus_volubilis
Add Volubilis corpus
2 parents abfbf02 + 57b9e37 commit de098f3

File tree

5 files changed

+121221
-32
lines changed

5 files changed

+121221
-32
lines changed

pythainlp/corpus/__init__.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,32 @@
2020
"""
2121

2222
__all__ = [
23-
"corpus_path",
2423
"corpus_db_path",
2524
"corpus_db_url",
25+
"corpus_path",
2626
"countries",
2727
"download",
2828
"get_corpus",
2929
"get_corpus_db",
3030
"get_corpus_db_detail",
3131
"get_corpus_default_db",
3232
"get_corpus_path",
33+
"get_path_folder_corpus",
34+
"path_pythainlp_corpus",
3335
"provinces",
3436
"remove",
3537
"thai_dict",
3638
"thai_family_names",
3739
"thai_female_names",
3840
"thai_male_names",
3941
"thai_negations",
40-
"thai_synonym",
42+
"thai_orst_words",
4143
"thai_stopwords",
4244
"thai_syllables",
45+
"thai_synonym",
4346
"thai_words",
4447
"thai_wsd_dict",
45-
"thai_orst_words",
46-
"path_pythainlp_corpus",
47-
"get_path_folder_corpus",
48+
"volubilis",
4849
]
4950

5051
import os
@@ -119,3 +120,4 @@ def corpus_db_path() -> str:
119120
thai_dict,
120121
thai_wsd_dict
121122
)
123+
from pythainlp.corpus.volubilis import volubilis

pythainlp/corpus/corpus_license.md

+48-27
Original file line numberDiff line numberDiff line change
@@ -10,31 +10,30 @@ The following word lists are created by the PyThaiNLP project and released under
1010
**Creative Commons Zero 1.0 Universal Public Domain Dedication License**
1111
https://creativecommons.org/publicdomain/zero/1.0/
1212

13-
Filename | Description
14-
---------|------------
15-
countries_th.txt | List of countries in Thai
16-
etcc.txt List of | Enhanced Thai Character Clusters
17-
negations_th.txt | Negation word list
18-
stopwords_th.txt | Stop word list
19-
syllables_th.txt | List of Thai syllables
20-
thailand_provinces_th.csv | List of Thailand provinces in Thai
21-
tnc_freq.txt | Words and their frequencies, from Thai National Corpus
22-
ttc_freq.txt | Words and their frequencies, from Thai Textbook Corpus
23-
words_th.txt | List of Thai words
24-
words_th_thai2fit_201810.txt | List of Thai words (frozen for thai2fit)
13+
| Filename | Description |
14+
| ---------------------------- | ------------------------------------------------------ |
15+
| countries_th.txt | List of countries in Thai |
16+
| etcc.txt | List of Enhanced Thai Character Clusters |
17+
| negations_th.txt | Negation word list |
18+
| stopwords_th.txt | Stop word list |
19+
| syllables_th.txt | List of Thai syllables |
20+
| thailand_provinces_th.csv | List of Thailand provinces in Thai |
21+
| tnc_freq.txt | Words and their frequencies, from Thai National Corpus |
22+
| ttc_freq.txt | Words and their frequencies, from Thai Textbook Corpus |
23+
| words_th.txt | List of Thai words |
24+
| words_th_thai2fit_201810.txt | List of Thai words (frozen for thai2fit) |
2525

2626
The following word lists are from **Thai Male and Female Names Corpus**
2727
https://github.com/korkeatw/thai-names-corpus/ by Korkeat Wannapat
2828
and released under their original licenses which are
2929
**Creative Commons Attribution-ShareAlike 4.0 International Public License**
3030
https://creativecommons.org/licenses/by-sa/4.0/
3131

32-
Filename | Description
33-
---------|------------
34-
family_names_th.txt | List of family names in Thailand
35-
person_names_female_th.txt | List of female names in Thailand
36-
person_names_male_th.txt | List of male names in Thailand
37-
32+
| Filename | Description |
33+
| -------------------------- | -------------------------------- |
34+
| family_names_th.txt | List of family names in Thailand |
35+
| person_names_female_th.txt | List of female names in Thailand |
36+
| person_names_male_th.txt | List of male names in Thailand |
3837

3938
## Models
4039

@@ -43,14 +42,13 @@ and released under
4342
**Creative Commons Attribution 4.0 International Public License**
4443
https://creativecommons.org/licenses/by/4.0/
4544

46-
Filename | Description
47-
---------|------------
48-
pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron
49-
pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram
50-
pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron
51-
pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram
52-
sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF
53-
45+
| Filename | Description |
46+
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
47+
| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
48+
| pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram |
49+
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
50+
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
51+
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
5452

5553
## Thai WordNet
5654

@@ -100,4 +98,27 @@ For more information about Thai WordNet, see
10098
S. Thoongsup et al., ‘Thai WordNet construction’,
10199
in Proceedings of the 7th Workshop on Asian Language Resources,
102100
Suntec, Singapore, Aug. 2009, pp. 139–144.
103-
https://www.aclweb.org/anthology/W09-3420.pdf
101+
https://www.aclweb.org/anthology/W09-3420.pdf
102+
103+
## Volubilis
104+
105+
Corpus of Thai words registered in Volubilis (volubilis_modified.txt), which was processed by konbraphat51 (https://github.com/konbraphat51/Thai_Dictionary_Cleaner/tree/main).
106+
107+
The original data is the VOLUBILIS 23.1 (Mar. 2023) database from [Volubilis](https://belisan-volubilis.blogspot.com/), created by Francis Bastien.
108+
109+
```
110+
VOLUBILIS MULTILINGUAL THAI DICT. & DATABASE by Francis Bastien (Belisan) is licensed under CC BY-SA 4.0
111+
112+
This is a human-readable summary of (and not a substitute for) the license below.
113+
You are free:
114+
to Share—copy and redistribute the material in any medium or format
115+
to Adapt—remix, transform, and build upon the material
116+
for any purpose, even commercially.
117+
The licensor cannot revoke these freedoms as long as you follow the license terms.
118+
Under the following terms:
119+
Attribution—You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
120+
Share Alike—If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.
121+
No additional restrictions—You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
122+
Notices:
123+
You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation. No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
124+
```

pythainlp/corpus/volubilis.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
Provides an optional word list from the Volubilis dictionary.
17+
"""
18+
from typing import FrozenSet
19+
20+
from pythainlp.corpus.common import get_corpus
21+
22+
_VOLUBILIS = None
23+
_VOLUBILIS_FILENAME = "volubilis_modified.txt"
24+
25+
26+
def volubilis() -> FrozenSet[str]:
    """
    Return a frozenset of words from the Volubilis dictionary.

    The data is at pythainlp/corpus/volubilis_modified.txt
    The word list has been prepared by the code at:
    https://github.com/konbraphat51/Thai_Dictionary_Cleaner
    Based on Volubilis dictionary 23.1 (March 2023):
    https://belisan-volubilis.blogspot.com/

    The word list is read through ``get_corpus`` on the first call and
    kept in the module-level ``_VOLUBILIS`` variable, so later calls
    return the same object without reading the file again.

    :return: :class:`frozenset` containing words in the Volubilis dictionary.
    :rtype: :class:`frozenset`
    """
    global _VOLUBILIS
    # Compare against None instead of relying on truthiness: an empty
    # result is falsy, and would otherwise be treated as "not loaded yet"
    # and trigger a reload on every call.
    if _VOLUBILIS is None:
        _VOLUBILIS = get_corpus(_VOLUBILIS_FILENAME)

    return _VOLUBILIS

0 commit comments

Comments
 (0)