
Commit 266e4d5

Dbhasin1 and rossbar authored
Tutorial: NumPy NLP from scratch with a focus on ethics (#105)
Co-authored-by: Ross Barnowski <[email protected]>
1 parent e3e67bb commit 266e4d5

13 files changed: +1427 -1 lines changed

README.md

+1
@@ -24,6 +24,7 @@ or navigate to any of the documents listed below and download it individually.
 7. [Tutorial: Masked Arrays](content/tutorial-ma.md)
 8. [Tutorial: Static Equilibrium](content/tutorial-static_equilibrium.md)
 9. [Tutorial: Plotting Fractals](content/tutorial-plotting-fractals.ipynb)
+10. [Tutorial: NumPy natural language processing from scratch with a focus on ethics](content/tutorial-nlp-from-scratch.md)
 
 
 ## Contributing

content/_static/dl_architectures.jpg

25.3 KB

content/_static/lstm.gif

3.31 MB

content/_static/mem_block.png

232 KB

content/text_preprocessing.py

+185
@@ -0,0 +1,185 @@
import pandas as pd
import argparse
import numpy as np
import re  # (https://docs.python.org/3/library/re.html) for tokenising textual data
import string  # (https://docs.python.org/3/library/string.html) for string operations


class TextPreprocess:
    """Text Preprocessing for a Natural Language Processing model."""

    def cleantext(self, df, text_column, remove_stopwords=True, remove_punc=True):
        """Function to clean text data by removing stopwords, tags and punctuation.

        Parameters
        ----------
        df : pandas dataframe
            The dataframe housing the input data.
        text_column : str
            Column in dataframe whose text is to be cleaned.
        remove_stopwords : bool
            if True, remove stopwords from text
        remove_punc : bool
            if True, remove punctuation symbols from text

        Returns
        -------
        Numpy array
            Cleaned text.

        """
        data = df
        # converting all characters to lowercase
        data[text_column] = data[text_column].str.lower()

        # List of common stopwords taken from https://gist.github.com/sebleier/554280
        stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because",
                     "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
                     "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
                     "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
                     "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
                     "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should",
                     "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
                     "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
                     "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
                     "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
                     "your", "yours", "yourself", "yourselves"]

        def drop_stopwords(data, column):
            # named so it does not shadow the `remove_stopwords` flag above
            data[f'{column} without stopwords'] = data[column].apply(
                lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
            return data

        def remove_tags(text):
            # strip HTML tags such as <br />
            result = re.sub('<.*?>', '', text)
            return result

        # remove html tags and brackets from text
        if remove_stopwords:
            data_without_stopwords = drop_stopwords(data, text_column)
            data_without_stopwords[f'clean_{text_column}'] = data_without_stopwords[f'{text_column} without stopwords'].apply(lambda cw: remove_tags(cw))
        else:
            data_without_stopwords = data
            data_without_stopwords[f'clean_{text_column}'] = data_without_stopwords[text_column].apply(lambda cw: remove_tags(cw))
        if remove_punc:
            data_without_stopwords[f'clean_{text_column}'] = data_without_stopwords[f'clean_{text_column}'].str.replace('[{}]'.format(string.punctuation), ' ', regex=True)

        X = data_without_stopwords[f'clean_{text_column}'].to_numpy()

        return X

    def split_data(self, X, y, split_percentile):
        """Function to split data into training and testing data.

        Parameters
        ----------
        X : Numpy Array
            Contains textual data.
        y : Numpy Array
            Contains target data.
        split_percentile : int
            Percentile cut-off: roughly this percentage of rows goes into the
            training set, the remainder into the test set.

        Returns
        -------
        Tuple
            Contains numpy arrays of test and training data.

        """
        # map string sentiment labels to binary targets
        y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))
        arr_rand = np.random.rand(X.shape[0])
        split = arr_rand < np.percentile(arr_rand, split_percentile)
        X_train = X[split]
        y_train = y[split]
        X_test = X[~split]
        y_test = y[~split]

        return (X_train, y_train, X_test, y_test)

    def sent_tokeniser(self, x):
        """Function to split text into sentences.

        Parameters
        ----------
        x : str
            piece of text

        Returns
        -------
        list
            sentences with punctuation removed.

        """
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', x)
        sentences.pop()
        sentences_cleaned = [re.sub(r'[^\w\s]', '', x) for x in sentences]
        return sentences_cleaned

    def word_tokeniser(self, text):
        """Function to split text into tokens.

        Parameters
        ----------
        text : str
            piece of text

        Returns
        -------
        list
            words with punctuation removed.

        """
        tokens = re.split(r"([-\s.,;!?])+", text)
        words = [x for x in tokens if (x not in '- \t\n.,;!?\\' and '\\' not in x)]
        return words

    def loadGloveModel(self, emb_path):
        """Function to read from the word embedding file.

        Returns
        -------
        Dict
            mapping from word to corresponding word embedding.

        """
        print("Loading Glove Model")
        gloveModel = {}
        with open(emb_path, 'r') as f:
            for line in f:
                splitLines = line.split()
                word = splitLines[0]
                wordEmbedding = np.array([float(value) for value in splitLines[1:]])
                gloveModel[word] = wordEmbedding
        print(len(gloveModel), " words loaded!")
        return gloveModel

    def text_to_paras(self, text, para_len):
        """Function to split text into paragraphs.

        Parameters
        ----------
        text : str
            piece of text
        para_len : int
            length of each paragraph

        Returns
        -------
        list
            paragraphs of specified length.

        """
        # split the speech into a list of words
        words = text.split()
        # obtain the total number of paragraphs
        no_paras = int(np.ceil(len(words) / para_len))
        # split the speech into a list of sentences
        sentences = self.sent_tokeniser(text)
        # aggregate the sentences into paragraphs
        k, m = divmod(len(sentences), no_paras)
        agg_sentences = [sentences[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(no_paras)]
        paras = np.array([' '.join(sents) for sents in agg_sentences])

        return paras
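A minimal usage sketch, not part of this commit: it assumes a hypothetical IMDB_Dataset.csv with 'review' and 'sentiment' columns and a local GloVe embedding file glove.6B.300d.txt, and that it runs alongside text_preprocessing.py.

import pandas as pd

from text_preprocessing import TextPreprocess

# Hypothetical input data; substitute your own CSV and embedding paths.
imdb_df = pd.read_csv('IMDB_Dataset.csv')

tp = TextPreprocess()

# Lowercase the reviews, drop stopwords, strip tags and punctuation.
X = tp.cleantext(imdb_df, 'review', remove_stopwords=True, remove_punc=True)
y = imdb_df['sentiment'].to_numpy()

# Keep roughly 70% of the rows for training; split_data also maps
# 'positive'/'negative' labels to 1/0.
X_train, y_train, X_test, y_test = tp.split_data(X, y, split_percentile=70)

# Dictionary mapping each word to its pretrained GloVe vector.
word_embeddings = tp.loadGloveModel('glove.6B.300d.txt')

print(X_train.shape, X_test.shape, len(word_embeddings))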
