Skip to content

Commit 4a86bca

Browse files
authored
Merge pull request #414 from PyThaiNLP/more-test-cases
Properly check if download() is needed in get_corpus_path()
2 parents 4c22970 + 14e7063 commit 4a86bca

File tree

13 files changed

+203
-182
lines changed

13 files changed

+203
-182
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Using PyThaiNLP:
2727
- [PyThaiNLP Get Started](https://www.thainlp.org/pythainlp/tutorials/notebooks/pythainlp_get_started.html)
2828
- More tutorials at [https://www.thainlp.org/pythainlp/tutorials/](https://www.thainlp.org/pythainlp/tutorials/)
2929
- See full documentation at [https://thainlp.org/pythainlp/docs/2.1/](https://thainlp.org/pythainlp/docs/2.1/)
30-
- Some additional data (like word lists and language models) maybe automatically downloaded by the library during runtime and it will be kept under the directory `~/pythainlp-data` by default.
30+
- Some additional data (like word lists and language models) may be automatically downloaded during runtime and will be kept under the directory `~/pythainlp-data` by default. See the corpus catalog at [https://github.com/PyThaiNLP/pythainlp-corpus](https://github.com/PyThaiNLP/pythainlp-corpus).
3131
- The data location can be changed, using `PYTHAINLP_DATA_DIR` environment variable.
3232
- For PyThaiNLP tokenization performance and measurement methods, see [tokenization benchmark](tokenization-benchmark.md)
3333
- 📫 follow our [PyThaiNLP](https://www.facebook.com/pythainlp/) Facebook page

pythainlp/corpus/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,20 @@
3636
_CORPUS_DIRNAME = "corpus"
3737
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)
3838

39+
# remote corpus catalog URL
3940
_CORPUS_DB_URL = (
4041
"https://raw.githubusercontent.com/"
41-
+ "PyThaiNLP/pythainlp-corpus/"
42-
+ "2.2/db.json"
42+
"PyThaiNLP/pythainlp-corpus/"
43+
"2.2/db.json"
4344
)
4445

46+
# local corpus catalog filename
4547
_CORPUS_DB_FILENAME = "db.json"
48+
49+
# local corpus catalog full path
4650
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)
4751

48-
# Create a local corpus database if it does not already exist
52+
# create a local corpus database if it does not already exist
4953
if not os.path.exists(_CORPUS_DB_PATH):
5054
TinyDB(_CORPUS_DB_PATH)
5155

pythainlp/corpus/core.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,16 @@
99
from urllib.request import urlopen
1010

1111
import requests
12-
from requests.exceptions import HTTPError
13-
from tinydb import Query, TinyDB
14-
1512
from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
1613
from pythainlp.tools import get_full_data_path
14+
from requests.exceptions import HTTPError
15+
from tinydb import Query, TinyDB
1716

1817

1918
def get_corpus_db(url: str) -> requests.Response:
19+
"""
20+
Get corpus catalog from server.
21+
"""
2022
corpus_db = None
2123
try:
2224
corpus_db = requests.get(url)
@@ -29,20 +31,23 @@ def get_corpus_db(url: str) -> requests.Response:
2931

3032

3133
def get_corpus_db_detail(name: str) -> dict:
34+
"""
35+
Get details about a corpus, using information from local catalog.
36+
"""
3237
local_db = TinyDB(corpus_db_path())
3338
query = Query()
3439
res = local_db.search(query.name == name)
3540
local_db.close()
3641

3742
if res:
3843
return res[0]
39-
else:
40-
return dict()
44+
45+
return dict()
4146

4247

4348
def get_corpus(filename: str) -> frozenset:
4449
"""
45-
Read corpus from file and return a frozenset.
50+
Read corpus data from file and return a frozenset.
4651
4752
(Please see the filename from
4853
`this file
@@ -82,7 +87,7 @@ def get_corpus_path(name: str) -> Union[str, None]:
8287
Get corpus path.
8388
8489
:param str name: corpus name
85-
:return: path to the corpus or **None** of the corpus doesn't
90+
:return: path to the corpus or **None** if the corpus doesn't \
8691
exist in the device
8792
:rtype: str
8893
@@ -112,18 +117,22 @@ def get_corpus_path(name: str) -> Union[str, None]:
112117
print(get_corpus_path('wiki_lm_lstm'))
113118
# output: /root/pythainlp-data/thwiki_model_lstm.pth
114119
"""
115-
db = TinyDB(corpus_db_path())
116-
query = Query()
117-
path = None
118-
119-
if db.search(query.name == name):
120-
path = get_full_data_path(db.search(query.name == name)[0]["file"])
121-
120+
# check if the corpus is in local catalog, download if not
121+
corpus_db_detail = get_corpus_db_detail(name)
122+
if not corpus_db_detail or not corpus_db_detail.get("file_name"):
123+
download(name)
124+
corpus_db_detail = get_corpus_db_detail(name)
125+
126+
if corpus_db_detail and corpus_db_detail.get("file_name"):
127+
# corpus is in the local catalog, get full path to the file
128+
path = get_full_data_path(corpus_db_detail.get("file_name"))
129+
# check if the corpus file actually exists, download if not
122130
if not os.path.exists(path):
123131
download(name)
132+
if os.path.exists(path):
133+
return path
124134

125-
db.close()
126-
return path
135+
return None
127136

128137

129138
def _download(url: str, dst: str) -> int:
@@ -174,9 +183,7 @@ def _check_hash(dst: str, md5: str) -> None:
174183
raise Exception("Hash does not match expected.")
175184

176185

177-
def download(
178-
name: str, force: bool = False, url: str = None
179-
) -> bool:
186+
def download(name: str, force: bool = False, url: str = None) -> bool:
180187
"""
181188
Download corpus.
182189
@@ -215,7 +222,7 @@ def download(
215222

216223
corpus_db = corpus_db.json()
217224

218-
# Check if corpus is available
225+
# check if corpus is available
219226
if name in list(corpus_db.keys()):
220227
local_db = TinyDB(corpus_db_path())
221228
query = Query()
@@ -239,7 +246,7 @@ def download(
239246
{
240247
"name": name,
241248
"version": corpus["version"],
242-
"file": corpus["file_name"],
249+
"file_name": corpus["file_name"],
243250
}
244251
)
245252
else:

pythainlp/tag/named_entity.py

Lines changed: 39 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77

88
from typing import List, Tuple, Union
99

10-
import pycrfsuite
11-
from pythainlp.corpus import download, get_corpus_path, thai_stopwords
10+
from pycrfsuite import Tagger as CRFTagger
11+
from pythainlp.corpus import get_corpus_path, thai_stopwords
1212
from pythainlp.tag import pos_tag
1313
from pythainlp.tokenize import word_tokenize
1414
from pythainlp.util import isthai
1515

16-
_WORD_TOKENIZER = "newmm" # ตัวตัดคำ
16+
_CORPUS_NAME = "thainer-1-4"
17+
_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data
1718

1819

1920
def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
@@ -74,14 +75,10 @@ def _doc2features(doc, i) -> dict:
7475
class ThaiNameTagger:
7576
def __init__(self):
7677
"""
77-
Thai named-entity recognizer
78+
Thai named-entity recognizer.
7879
"""
79-
self.__data_path = get_corpus_path("thainer-1-4")
80-
if not self.__data_path:
81-
download("thainer-1-4")
82-
self.__data_path = get_corpus_path("thainer-1-4")
83-
self.crf = pycrfsuite.Tagger()
84-
self.crf.open(self.__data_path)
80+
self.crf = CRFTagger()
81+
self.crf.open(get_corpus_path(_CORPUS_NAME))
8582

8683
def get_ner(
8784
self, text: str, pos: bool = True, tag: bool = False
@@ -137,41 +134,41 @@ def get_ner(
137134
tag=True)
138135
'วันที่ <DATE>15 ก.ย. 61</DATE> ทดสอบระบบเวลา <TIME>14:49 น.</TIME>'
139136
"""
140-
self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
141-
self.__pos_tags = pos_tag(
142-
self.__tokens, engine="perceptron", corpus="orchid_ud"
143-
)
144-
self.__x_test = self.__extract_features(self.__pos_tags)
145-
self.__y = self.crf.tag(self.__x_test)
146-
147-
self.sent_ner = [
148-
(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)
149-
]
137+
tokens = word_tokenize(text, engine=_TOKENIZER_ENGINE)
138+
pos_tags = pos_tag(tokens, engine="perceptron", corpus="orchid_ud")
139+
x_test = ThaiNameTagger.__extract_features(pos_tags)
140+
y = self.crf.tag(x_test)
141+
142+
sent_ner = [(pos_tags[i][0], data) for i, data in enumerate(y)]
143+
150144
if tag:
151-
self.temp = ""
152-
self.sent = ""
153-
for idx, (word, ner) in enumerate(self.sent_ner):
154-
if "B-" in ner and self.temp != "":
155-
self.sent += "</" + self.temp + ">"
156-
self.temp = ner.replace("B-", "")
157-
self.sent += "<" + self.temp + ">"
158-
elif "B-" in ner:
159-
self.temp = ner.replace("B-", "")
160-
self.sent += "<" + self.temp + ">"
161-
elif "O" == ner and self.temp != "":
162-
self.sent += "</" + self.temp + ">"
163-
self.temp = ""
164-
self.sent += word
165-
if idx == len(self.sent_ner) - 1 and self.temp != "":
166-
self.sent += "</" + self.temp + ">"
167-
return self.sent
168-
elif pos:
145+
temp = ""
146+
sent = ""
147+
for idx, (word, ner) in enumerate(sent_ner):
148+
if ner.startswith("B-") and temp != "":
149+
sent += "</" + temp + ">"
150+
temp = ner[2:]
151+
sent += "<" + temp + ">"
152+
elif ner.startswith("B-"):
153+
temp = ner[2:]
154+
sent += "<" + temp + ">"
155+
elif ner == "O" and temp != "":
156+
sent += "</" + temp + ">"
157+
temp = ""
158+
sent += word
159+
160+
if idx == len(sent_ner) - 1 and temp != "":
161+
sent += "</" + temp + ">"
162+
163+
return sent
164+
165+
if pos:
169166
return [
170-
(self.__pos_tags[i][0], self.__pos_tags[i][1], data)
171-
for i, data in enumerate(self.__y)
167+
(pos_tags[i][0], pos_tags[i][1], data)
168+
for i, data in enumerate(y)
172169
]
173-
else:
174-
return self.sent_ner
170+
171+
return sent_ner
175172

176173
@staticmethod
177174
def __extract_features(doc):

pythainlp/tag/pos_tag.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,14 +180,13 @@ def pos_tag(
180180
# [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
181181
# ('<space>', None), ('<equal>', None), ('3', 'NUM')]
182182
"""
183+
if not words:
184+
return []
183185

184-
# NOTE:
185186
_corpus = corpus
186187
_tag = []
187188
if corpus == "orchid_ud":
188189
corpus = "orchid"
189-
if not words:
190-
return []
191190

192191
if engine == "perceptron":
193192
from .perceptron import tag as tag_
@@ -243,4 +242,4 @@ def pos_tag_sents(
243242
if not sentences:
244243
return []
245244

246-
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]
245+
return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]

pythainlp/tools/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# -*- coding: utf-8 -*-
22
__all__ = [
3+
"PYTHAINLP_DEFAULT_DATA_DIR",
34
"get_full_data_path",
45
"get_pythainlp_data_path",
56
"get_pythainlp_path",
6-
"PYTHAINLP_DATA_DIR",
77
]
88

99
from pythainlp.tools.path import (
10+
PYTHAINLP_DEFAULT_DATA_DIR,
1011
get_full_data_path,
1112
get_pythainlp_data_path,
1213
get_pythainlp_path,
13-
PYTHAINLP_DATA_DIR,
1414
)

pythainlp/tools/path.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
"""
77
import os
88

9-
import pythainlp
9+
from pythainlp import __file__ as pythainlp_file
1010

11-
PYTHAINLP_DATA_DIR = "pythainlp-data"
11+
PYTHAINLP_DEFAULT_DATA_DIR = "pythainlp-data"
1212

1313

1414
def get_full_data_path(path: str) -> str:
@@ -49,10 +49,10 @@ def get_pythainlp_data_path() -> str:
4949
get_pythainlp_data_path()
5050
# output: '/root/pythainlp-data'
5151
"""
52-
path = os.getenv(
53-
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DATA_DIR)
52+
pythainlp_data_dir = os.getenv(
53+
"PYTHAINLP_DATA_DIR", os.path.join("~", PYTHAINLP_DEFAULT_DATA_DIR)
5454
)
55-
path = os.path.expanduser(path)
55+
path = os.path.expanduser(pythainlp_data_dir)
5656
os.makedirs(path, exist_ok=True)
5757
return path
5858

@@ -72,4 +72,4 @@ def get_pythainlp_path() -> str:
7272
get_pythainlp_path()
7373
# output: '/usr/local/lib/python3.6/dist-packages/pythainlp'
7474
"""
75-
return os.path.dirname(pythainlp.__file__)
75+
return os.path.dirname(pythainlp_file)

0 commit comments

Comments
 (0)