diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
index 6dcc8678e..21abc56f4 100644
--- a/pythainlp/corpus/__init__.py
+++ b/pythainlp/corpus/__init__.py
@@ -1,29 +1,40 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
-from pythainlp.tools import get_path_db,get_path_data
-from tinydb import TinyDB,Query
-from future.moves.urllib.request import urlopen
-from tqdm import tqdm
-import requests
+
+from __future__ import absolute_import, unicode_literals
+
 import os
+
 import requests
-#__all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
-path_db_=get_path_db()
+from future.moves.urllib.request import urlopen
+from pythainlp.tools import get_path_data, get_path_db
+from tinydb import Query, TinyDB
+from tqdm import tqdm
+
+CORPUS_DB_URL = (
+    "https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json"
+)
+
+# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
+path_db_ = get_path_db()
+
+
 def get_file(name):
-    db=TinyDB(path_db_)
+    db = TinyDB(path_db_)
     temp = Query()
-    if len(db.search(temp.name==name))>0:
-        path= get_path_data(db.search(temp.name==name)[0]['file'])
+    if len(db.search(temp.name == name)) > 0:
+        path = get_path_data(db.search(temp.name == name)[0]["file"])
         db.close()
         if not os.path.exists(path):
             download(name)
         return path
+
+
 def download_(url, dst):
     """
     @param: url to download file
     @param: dst place to put the file
     """
-    file_size = int(urlopen(url).info().get('Content-Length', -1))
+    file_size = int(urlopen(url).info().get("Content-Length", -1))
     if os.path.exists(dst):
         first_byte = os.path.getsize(dst)
     else:
@@ -32,55 +43,90 @@ def download_(url, dst):
         return file_size
     header = {"Range": "bytes=%s-%s" % (first_byte, file_size)}
     pbar = tqdm(
-        total=file_size, initial=first_byte,
-        unit='B', unit_scale=True, desc=url.split('/')[-1])
+        total=file_size,
+        initial=first_byte,
+        unit="B",
+        unit_scale=True,
+        desc=url.split("/")[-1],
+    )
     req = requests.get(url, headers=header, stream=True)
-    with(open(get_path_data(dst), 'wb')) as f:
+    with (open(get_path_data(dst), "wb")) as f:
         for chunk in req.iter_content(chunk_size=1024):
             if chunk:
                 f.write(chunk)
                 pbar.update(1024)
     pbar.close()
-    #return file_size
-def download(name,force=False):
-    db=TinyDB(path_db_)
+    # return file_size
+
+
+def download(name, force=False):
+    db = TinyDB(path_db_)
     temp = Query()
-    data=requests.get("https://raw.githubusercontent.com/PyThaiNLP/pythainlp-corpus/master/db.json")
-    data_json=data.json()
+    data = requests.get(CORPUS_DB_URL)
+    data_json = data.json()
     if name in list(data_json.keys()):
-        temp_name=data_json[name]
-        print("Download : "+name)
-        if len(db.search(temp.name==name))==0:
-            print(name+" "+temp_name['version'])
-            download_(temp_name['download'],temp_name['file_name'])
-            db.insert({'name': name, 'version': temp_name['version'],'file':temp_name['file_name']})
+        temp_name = data_json[name]
+        print("Download : " + name)
+
+        if not db.search(temp.name == name):
+            print(name + " " + temp_name["version"])
+            download_(temp_name["download"], temp_name["file_name"])
+            db.insert(
+                {
+                    "name": name,
+                    "version": temp_name["version"],
+                    "file": temp_name["file_name"],
+                }
+            )
         else:
-            if len(db.search(temp.name==name and temp.version==temp_name['version']))==0:
+            if not db.search(
+                temp.name == name and temp.version == temp_name["version"]
+            ):
                 print("have update")
-                print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version'])
-                yes_no="y"
-                if force==False:
-                    yes_no=str(input("y or n : ")).lower()
-                if "y"==yes_no:
-                    download_(temp_name['download'],temp_name['file_name'])
-                    db.update({'version':temp_name['version']},temp.name==name)
+                print(
+                    "from "
+                    + name
+                    + " "
+                    + db.search(temp.name == name)[0]["version"]
+                    + " update to "
+                    + name
+                    + " "
+                    + temp_name["version"]
+                )
+                yes_no = "y"
+                if not force:
+                    yes_no = str(input("y or n : ")).lower()
+                if "y" == yes_no:
+                    download_(temp_name["download"], temp_name["file_name"])
+                    db.update({"version": temp_name["version"]}, temp.name == name)
            else:
                 print("re-download")
-                print("from "+name+" "+db.search(temp.name==name)[0]['version']+" update to "+name+" "+temp_name['version'])
-                yes_no="y"
-                if force==False:
-                    yes_no=str(input("y or n : ")).lower()
-                if "y"==yes_no:
-                    download_(temp_name['download'],temp_name['file_name'])
-                    db.update({'version':temp_name['version']},temp.name==name)
+                print(
+                    "from "
+                    + name
+                    + " "
+                    + db.search(temp.name == name)[0]["version"]
+                    + " update to "
+                    + name
+                    + " "
+                    + temp_name["version"]
+                )
+                yes_no = "y"
+                if not force:
+                    yes_no = str(input("y or n : ")).lower()
+                if "y" == yes_no:
+                    download_(temp_name["download"], temp_name["file_name"])
+                    db.update({"version": temp_name["version"]}, temp.name == name)
     db.close()
+
+
 def remove(name):
-    db=TinyDB(path_db_)
+    db = TinyDB(path_db_)
     temp = Query()
-    data=db.search(temp.name==name)
-    if len(data)>0:
-        path=get_file(name)
+    data = db.search(temp.name == name)
+    if len(data) > 0:
+        path = get_file(name)
         os.remove(path)
-        db.remove(temp.name==name)
+        db.remove(temp.name == name)
         return True
-    return False
\ No newline at end of file
+    return False
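A minimal usage sketch of the corpus manager touched above. It assumes the name passed in (here "thwiki_model2", the model name used by the ULMFit utilities later in this patch) is listed in the remote db.json; any listed corpus name behaves the same way.

    from pythainlp.corpus import download, get_file, remove

    # Fetch (or update) an entry listed in the remote db.json;
    # force=True skips the interactive "y or n" prompt on updates/re-downloads.
    download("thwiki_model2", force=True)

    # Resolve the local path recorded in ~/pythainlp-data/db.json;
    # get_file() triggers a fresh download if the recorded file is missing on disk.
    path = get_file("thwiki_model2")

    # Delete the local file and its TinyDB record.
    remove("thwiki_model2")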
diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
index e927e0903..398a9b331 100644
--- a/pythainlp/tools/__init__.py
+++ b/pythainlp/tools/__init__.py
@@ -1,25 +1,32 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
+
+from __future__ import absolute_import, unicode_literals
+
 import os
-import dill
-from pythainlp.tokenize import tcc
-import marisa_trie
 import subprocess
 import sys
+
 def install_package(package):
     subprocess.call([sys.executable, "-m", "pip", "install", package])
+
+
 def get_path_db():
-    path = os.path.join(get_path_pythainlp_data(), "db.json")
-    if not os.path.exists(path):
-        from tinydb import TinyDB
-        db=TinyDB(path)
-        #db.insert({'name': 'hi', 'version': '0.1','file':''})
-    return path
+    path = os.path.join(get_path_pythainlp_data(), "db.json")
+    if not os.path.exists(path):
+        from tinydb import TinyDB
+
+        db = TinyDB(path)
+        # db.insert({'name': 'hi', 'version': '0.1','file':''})
+    return path
+
+
 def get_path_data(filename):
-    return os.path.join(get_path_pythainlp_data(), filename)
+    return os.path.join(get_path_pythainlp_data(), filename)
+
+
 def get_path_pythainlp_data():
-    path= os.path.join(os.path.expanduser("~"), 'pythainlp-data')
-    if not os.path.exists(path):
-        os.makedirs(path)
-    return path
+    path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return path
diff --git a/pythainlp/ulmfit/__init__.py b/pythainlp/ulmfit/__init__.py
index d61c3dfd8..ee14b01ed 100644
--- a/pythainlp/ulmfit/__init__.py
+++ b/pythainlp/ulmfit/__init__.py
@@ -1,2 +1,3 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import,unicode_literals
\ No newline at end of file
+
+from __future__ import absolute_import, unicode_literals
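For orientation, a small sketch of where the path helpers in pythainlp/tools point. The directory name, the db.json file, and the create-on-first-use behaviour come straight from the code above; the corpus filename passed to get_path_data() is only a hypothetical example.

    from pythainlp.tools import get_path_data, get_path_db, get_path_pythainlp_data

    data_dir = get_path_pythainlp_data()  # ~/pythainlp-data, created if it does not exist
    db_path = get_path_db()               # ~/pythainlp-data/db.json, created as an empty TinyDB
    file_path = get_path_data("example_corpus.txt")  # hypothetical filename joined onto data_dir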
diff --git a/pythainlp/ulmfit/utils.py b/pythainlp/ulmfit/utils.py
index dcb19ba4b..dd5adaad4 100644
--- a/pythainlp/ulmfit/utils.py
+++ b/pythainlp/ulmfit/utils.py
@@ -1,22 +1,28 @@
 # -*- coding: utf-8 -*-
-'''
+
+"""
 Code by https://github.com/cstorm125/thai2vec/tree/master/notebook
-'''
-from __future__ import absolute_import,unicode_literals
-import os
-import sys
+"""
+
+from __future__ import absolute_import, unicode_literals
+
 import re
-import torch
+import sys
-#numpy and fastai
+from pythainlp.corpus import download, get_file
+from pythainlp.tokenize import word_tokenize
+
+
+# numpy and fastai
 try:
     import numpy as np
     from fastai.text import *
     import dill as pickle
 except ImportError:
     from pythainlp.tools import install_package
-    install_package('fastai')
-    install_package('numpy')
+
+    install_package("fastai")
+    install_package("numpy")
     try:
         import numpy as np
         from fastai.text import *
@@ -25,53 +31,54 @@
         print("Error installing using 'pip install fastai numpy dill'")
         sys.exit(0)
 
-#import torch
+# import torch
 try:
     import torch
 except ImportError:
-    print('PyTorch required. See https://pytorch.org/.')
+    print("PyTorch required. See https://pytorch.org/.")
 
-from pythainlp.tokenize import word_tokenize
-from pythainlp.corpus import get_file
-from pythainlp.corpus import download
-MODEL_NAME = 'thwiki_model2'
-ITOS_NAME = 'itos'
-
-#paralellized thai tokenizer with some text cleaning
-class ThaiTokenizer():
-    def __init__(self, engine='newmm'):
+
+MODEL_NAME = "thwiki_model2"
+ITOS_NAME = "itos"
+
+
+# parallelized Thai tokenizer with some text cleaning
+class ThaiTokenizer:
+    def __init__(self, engine="newmm"):
         """
         :parameters for tokenization engine:
             * newmm - Maximum Matching algorithm + TCC
-            * icu - IBM ICU
+            * icu - IBM ICU
             * longest-matching - Longest matching
             * mm - Maximum Matching algorithm
            * pylexto - LexTo
            * deepcut - Deep Neural Network
         """
         self.engine = engine
-        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
-        self.re_rep = re.compile(r'(\S)(\1{3,})')
+        self.__RE_BR = re.compile(r"<\s*br\s*/?>", re.IGNORECASE)
+        self.__RE_REP = re.compile(r"(\S)(\1{3,})")
+        self.__RE_SLASH_HASH = re.compile(r"([/#])")
+        self.__RE_DOUBLE_SPACE = re.compile(" {2,}")
 
-    def sub_br(self,text):
+    def sub_br(self, text):
         """
         :meth:`sub_br` replace `<br>` tags with `\n`
         :param str text: text to process
         :return: processed text
         """
-        return self.re_br.sub("\n", text)
+        return self.__RE_BR.sub("\n", text)
 
-    def tokenize(self,text):
+    def tokenize(self, text):
         """
         :meth: tokenize text with selected engine
         :param str text: text to tokenize
         :return: tokenized text
         """
-        return [t for t in word_tokenize(self.sub_br(text),engine=self.engine)]
-
+        return [t for t in word_tokenize(self.sub_br(text), engine=self.engine)]
+
     @staticmethod
     def replace_rep(text):
-        '''
+        """
         :meth:`replace_rep` replace 3 or above repetitive characters with `tkrep`
         :param str text: text to process
         :return: processed text where repetitions are replaced by `tkrep` followed by number of repetitions
@@ -80,10 +87,10 @@ def replace_rep(text):
         >>> tt = ThaiTokenizer()
         >>> tt.replace_rep('คือดียยยยยย')
         คือดีtkrep6ย
-        '''
-        TK_REP = 'tkrep'
-        c,cc = text.groups()
-        return f'{TK_REP}{len(cc)+1}{c}'
+        """
+        TK_REP = "tkrep"
+        c, cc = text.groups()
+        return f"{TK_REP}{len(cc)+1}{c}"
 
     def proc_text(self, text):
         """
@@ -91,10 +98,9 @@ def proc_text(self, text):
         :param str text: text to process
         :return: processed and tokenized text
         """
-        s = self.re_rep.sub(ThaiTokenizer.replace_rep, text)
-        s = re.sub(r'([/#])', r' \1 ', s)
-        #remvoe double space
-        s = re.sub(' {2,}', ' ', s)
+        s = self.__RE_REP.sub(ThaiTokenizer.replace_rep, text)
+        s = self.__RE_SLASH_HASH.sub(r" \1 ", s)
+        s = self.__RE_DOUBLE_SPACE.sub(" ", s)
         return self.tokenize(s)
 
     @staticmethod
@@ -114,12 +120,15 @@ def proc_all_mp(ss):
         :param str text: text to process
         :return: processed and tokenized text
         """
-        ncpus = num_cpus()//2
+        ncpus = num_cpus() // 2
         with ProcessPoolExecutor(ncpus) as e:
             return sum(e.map(ThaiTokenizer.proc_all, ss), [])
 
-#ulmfit helper functions
-BOS = 'xbos' # beginning-of-sentence tag
+
+# ulmfit helper functions
+BOS = "xbos"  # beginning-of-sentence tag
+
+
 def get_texts(df):
     """
     :meth: `get_texts` get tuple of tokenized texts and labels
     :param pandas.DataFrame df: `pandas.DataFrame` with label as first column and text as second column
     :return:
        * tok - lists of tokenized texts with beginning-of-sentence tag `xbos` as first element of each list
        * labels - list of labels
     """
-    labels = df.iloc[:,0].values.astype(np.int64)
-    texts = BOS+df.iloc[:,1].astype(str).apply(lambda x: x.rstrip())
+    labels = df.iloc[:, 0].values.astype(np.int64)
+    texts = BOS + df.iloc[:, 1].astype(str).apply(lambda x: x.rstrip())
     tok = ThaiTokenizer().proc_all_mp(partition_by_cores(texts))
-    return(tok, list(labels))
+    return (tok, list(labels))
+
 
 def get_all(df):
     """
@@ -142,13 +152,16 @@
        * labels - list of labels
     """
     tok, labels = [], []
-    for i, r in enumerate(df):
+    for _, r in enumerate(df):
         tok_, labels_ = get_texts(r)
-        tok += tok_;
+        tok += tok_
         labels += labels_
-    return(tok, labels)
+    return (tok, labels)
+
 
-def numericalizer(df, itos=None, max_vocab = 60000, min_freq = 2, pad_tok = '_pad_', unk_tok = '_unk_'):
+def numericalizer(
+    df, itos=None, max_vocab=60000, min_freq=2, pad_tok="_pad_", unk_tok="_unk_"
+):
     """
     :meth: `numericalize` numericalize tokenized texts for:
        * tokens with word frequency more than `min_freq`
@@ -172,12 +185,13 @@ def numericalizer(df, itos=None, max_vocab = 60000, min_freq = 2, pad_tok = '_pa
     tok, labels = get_all(df)
     freq = Counter(p for o in tok for p in o)
     if itos is None:
-        itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
+        itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
     itos.insert(0, pad_tok)
     itos.insert(0, unk_tok)
-    stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
+    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
     lm = np.array([[stoi[o] for o in p] for p in tok])
-    return(lm,tok,labels,itos,stoi,freq)
+    return (lm, tok, labels, itos, stoi, freq)
+
 
 def merge_wgts(em_sz, wgts, itos_pre, itos_cls):
     """
@@ -189,23 +203,26 @@
     :return: merged weights of the model for current dataset
     """
     vocab_size = len(itos_cls)
-    enc_wgts = to_np(wgts['0.encoder.weight'])
-    #average weight of encoding
+    enc_wgts = to_np(wgts["0.encoder.weight"])
+    # average weight of encoding
     row_m = enc_wgts.mean(0)
-    stoi_pre = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos_pre)})
-    #new embedding based on classification dataset
+    stoi_pre = collections.defaultdict(
+        lambda: -1, {v: k for k, v in enumerate(itos_pre)}
+    )
+    # new embedding based on classification dataset
     new_w = np.zeros((vocab_size, em_sz), dtype=np.float32)
-    for i,w in enumerate(itos_cls):
+    for i, w in enumerate(itos_cls):
         r = stoi_pre[w]
-        #use pretrianed embedding if present; else use the average
-        new_w[i] = enc_wgts[r] if r>=0 else row_m
-    wgts['0.encoder.weight'] = T(new_w)
-    wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
-    wgts['1.decoder.weight'] = T(np.copy(new_w))
-    return(wgts)
-
-#feature extractor
-def document_vector(ss, m, stoi,tok_engine='newmm'):
+        # use pretrained embedding if present; else use the average
+        new_w[i] = enc_wgts[r] if r >= 0 else row_m
+    wgts["0.encoder.weight"] = T(new_w)
+    wgts["0.encoder_with_dropout.embed.weight"] = T(np.copy(new_w))
+    wgts["1.decoder.weight"] = T(np.copy(new_w))
+    return wgts
+
+
+# feature extractor
+def document_vector(ss, m, stoi, tok_engine="newmm"):
     """
     :meth: `document_vector` get document vector using pretrained ULMFit model
     :param str ss: sentence to extract embeddings
@@ -215,44 +232,55 @@
     :return: `numpy.array` of document vector sized 300
     """
     s = word_tokenize(ss)
-    t = LongTensor([stoi[i] for i in s]).view(-1,1).cuda()
-    t = Variable(t,volatile=False)
+    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
+    t = Variable(t, volatile=False)
     m.reset()
-    pred,*_ = m[0](t)
-    #get average of last lstm layer along bptt
-    res = to_np(torch.mean(pred[-1],0).view(-1))
-    return(res)
-
-class SaveFeatures():
-    features=None
-    def __init__(self, m): self.hook = m.register_forward_hook(self.hook_fn)
-    def hook_fn(self, module, input, output): self.features = output
-    def remove(self): self.hook.remove()
-
-#Download pretrained models
+    pred, *_ = m[0](t)
+    # get average of last lstm layer along bptt
+    res = to_np(torch.mean(pred[-1], 0).view(-1))
+    return res
+
+
+class SaveFeatures:
+    features = None
+
+    def __init__(self, m):
+        self.hook = m.register_forward_hook(self.hook_fn)
+
+    def hook_fn(self, module, input, output):
+        self.features = output
+
+    def remove(self):
+        self.hook.remove()
+
+
+# Download pretrained models
 def get_path(fname):
-    path = get_file(fname)
-    if path==None:
-        download(fname)
-        path = get_file(fname)
-    return(path)
+    path = get_file(fname)
+    if not path:
+        download(fname)
+        path = get_file(fname)
+    return path
+
 
 def load_pretrained_model():
     path = get_path(MODEL_NAME)
     wgts = torch.load(path, map_location=lambda storage, loc: storage)
-    return(wgts)
+    return wgts
+
 
 def load_pretrained_itos():
     path = get_path(ITOS_NAME)
-    itos = pickle.load(open(path,'rb'))
-    return(itos)
+    itos = pickle.load(open(path, "rb"))
+    return itos
+
 
 def about():
-    return '''
-    thai2vec
-    State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
-    Created as part of pyThaiNLP with ULMFit implementation from fast.ai
-
-    Development : Charin Polpanumas
-    GitHub : https://github.com/cstorm125/thai2vec
-    '''
+    return """
+    thai2vec
+    State-of-the-Art Language Modeling, Text Feature Extraction and Text Classification in Thai Language.
+    Created as part of PyThaiNLP with ULMFit implementation from fast.ai
+
+    Development : Charin Polpanumas
+    GitHub : https://github.com/cstorm125/thai2vec
+    """
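Taken together, the utilities in pythainlp/ulmfit/utils.py are meant to be driven roughly as below. This is only a sketch: it assumes fastai, torch, and dill are installed (otherwise the module's import-time fallback kicks in) and that the "thwiki_model2" and "itos" entries can be fetched through the corpus manager from the first file in this patch.

    from pythainlp.ulmfit.utils import (
        ThaiTokenizer,
        load_pretrained_itos,
        load_pretrained_model,
    )

    tt = ThaiTokenizer(engine="newmm")
    print(tt.sub_br("สวัสดี<br/>ครับ"))      # <br> tags are replaced with newlines
    print(tt.proc_text("คือดียยยยยย #ชอบ"))  # repeats collapse to "tkrep6ย", "#" is space-padded, then tokenized

    # Pretrained Thai Wikipedia language-model weights and vocab,
    # downloaded to ~/pythainlp-data on first use via get_path()/download().
    wgts = load_pretrained_model()
    itos = load_pretrained_itos()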