----------------------------------------------------------------------
-- example-logistic-regression.lua
--
-- Logistic regression and multinomial logistic regression
--
require 'nn'
require 'optim'
----------------------------------------------------------------------
-- 1. Create the training data
print('')
print('============================================================')
print('Constructing dataset')
print('')
-- The data come from a tutorial on using R from UCLA, which can be found at
-- http://www.ats.ucla.edu/stat/r/dae/mlogit.htm
-- The model is one of brand preference, where there are 3 brands and 2
-- explanatory variables. The variables are coded this way:
-- brand: 1, 2 or 3
-- female: 1 if the person is a female, 0 if a male
-- age: a positive integer
-- The data are stored in a csv file 'example-logistic-regression.csv'
-- and read with the csvigo package (torch-pkg install csvigo)
require 'csvigo'
-- The data are in a comma separated values (CSV) file. The first record
-- contains field names and subsequent records contain data. The fields and
-- their formats are:
-- - num: observation number; an integer surrounded by double quote chars
-- - brand: brand number; 1, 2, or 3 surrounded by double quote chars
-- - female: indicator for is-female: 1 if female, 0 otherwise; no quote chars
-- - age: age of the person; no quote characters
-- Reading CSV files can be tricky. This code uses the csvigo package for this:
loaded = csvigo.load('example-logistic-regression.csv')
-- Convert the CSV table into dense tensors. The tensor form has the
-- advantage that it stores its elements contiguously (which leads to
-- better performance) and a tensor allows one to select columns and rows
-- easily, using slicing methods.
-- first convert each variable list to a tensor:
brands = torch.Tensor(loaded.brand)
females = torch.Tensor(loaded.female)
ages = torch.Tensor(loaded.age)
-- copy all the input variables into a single tensor:
dataset_inputs = torch.Tensor( (#brands)[1],2 )
dataset_inputs[{ {},1 }] = females
dataset_inputs[{ {},2 }] = ages
-- the outputs are just the brands
dataset_outputs = brands
-- To implement the model, we need to know how many categories there are.
numberOfBrands = torch.max(dataset_outputs) - torch.min(dataset_outputs) + 1
-- summarize the data
function summarizeData()
   function p(name,value)
      print(string.format('%20s %f', name, value) )
   end
   p('number of brands', numberOfBrands)
   p('min brand', torch.min(brands))
   p('max brand', torch.max(brands))
   p('min female', torch.min(females))
   p('max female', torch.max(females))
   p('min age', torch.min(ages))
   p('max age', torch.max(ages))
end
summarizeData()
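
-- Optional sanity check (an addition to the original demo, assuming the CSV has
-- at least five rows): peek at the first few rows to confirm that the columns
-- landed in the expected order (female, age) and that the targets are brand
-- indices. The slicing below is standard Torch tensor indexing.
print('first 5 input rows (female, age):')
print(dataset_inputs[{ {1,5}, {} }])
print('first 5 targets (brand):')
print(dataset_outputs[{ {1,5} }])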
----------------------------------------------------------------------
-- 2. Define the model (predictor)
-- The model is a multinomial logistic regression.
-- It will consist of two layers that operate sequentially:
-- - 1: a linear model
-- - 2: a soft max layer
-- The linear layer assigns each brand an un-normalized score (a logit) that is
-- a weighted sum of the observed variables plus a bias, with unknown weights:
--   score(brand = b) = bias_b + weight_female_b * female + weight_age_b * age
-- There are two inputs (female and age) and three outputs (one for each
-- value that brand can take on)
linLayer = nn.Linear(2,3)
-- The soft max layer takes the 3 outputs from the linear layer and
-- transforms them to lie in the range (0,1) and to sum to 1. Thus, unlike
-- some text books in which the probabilities are un-normalized, the output
-- of the soft max layer will be normalized probabilities.
-- The log soft max layer takes the log of these 3 outputs. This is done
-- because we want to feed the log values into the ClassNLLCriterion
-- described below.
softMaxLayer = nn.LogSoftMax() -- the input and output are a single tensor
-- We need to put the layers into a sequential container.
model = nn.Sequential()
model:add(linLayer)
model:add(softMaxLayer)
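
-- Optional sanity check (an addition to the original demo): forward one training
-- sample through the still-untrained model. The result is a tensor of 3
-- log-probabilities, one per brand; their exponentials sum to 1.
print('log-probabilities for the first sample (untrained model):')
print(model:forward(dataset_inputs[1]))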
----------------------------------------------------------------------
-- 3. Define a loss function, to be minimized.
-- In this example, we minimize the cross entropy between
-- the predictions of our linear model and the groundtruth available
-- in the dataset.
-- Torch provides many common criterions to train neural networks.
-- The ClassNLLCriterion expects to be fed the log probabilities in a
-- tensor. Hence, the use of the LogSoftMax layer in the model instead
-- of SoftMax.
-- Minimizing this cross-entropy is equivalent to maximizing the likelihood of
-- the training data (the maximum a-posteriori (MAP) estimate under a flat
-- prior), i.e. to minimizing the negative log-likelihood (NLL), hence the use
-- of the NLL loss.
criterion = nn.ClassNLLCriterion()
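
-- Optional illustration (an addition to the original demo): evaluate the
-- criterion on a single sample. ClassNLLCriterion takes the log-probabilities
-- produced by the model and the target class index (1, 2 or 3), and returns a
-- scalar loss.
print('loss on the first sample (untrained model): '
      .. criterion:forward(model:forward(dataset_inputs[1]), dataset_outputs[1]))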
----------------------------------------------------------------------
-- 4.a. Train the model (Using SGD)
-- To minimize the loss defined above, using the linear model defined
-- in 'model', we follow a stochastic gradient descent procedure (SGD).
-- SGD is a good optimization algorithm when the amount of training data
-- is large, and estimating the gradient of the loss function over the
-- entire training set is too costly.
-- Given an arbitrarily complex model, we can retrieve its trainable
-- parameters, and the gradients of our loss function wrt these
-- parameters, as follows:
x, dl_dx = model:getParameters()
-- The above statement does not create a copy of the parameters in the
-- model! Instead it creates, in x and dl_dx, views of the model's weights
-- and of the derivatives of the loss wrt those weights. The views are
-- implemented so that when the weights and their derivatives change, so do
-- x and dl_dx. The implementation is efficient: the underlying storage is shared.
-- A note on terminology: In the machine learning literature, the parameters
-- that one seeks to learn are often called weights and denoted with a W.
-- However, in the optimization literature, the parameter one seeks to
-- optimize is often called x. Hence the use of x and dl_dx above.
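
-- Optional check (an addition to the original demo): the flattened parameter
-- vector holds the 6 weights plus 3 biases of the 2-input, 3-output linear
-- layer, i.e. 9 numbers in total.
print('number of trainable parameters: ' .. x:size(1))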
-- In the following code, we define a closure, feval, which computes
-- the value of the loss function at a given point x, and the gradient of
-- that function with respect to x. x is the vector of trainable weights,
-- which, in this example, are all the weights of the linear layer of
-- our model, plus its bias terms.
feval = function(x_new)
   -- set x to x_new, if different
   -- (in this simple example, x_new will typically always point to x,
   -- so the copy is really useless)
   if x ~= x_new then
      x:copy(x_new)
   end

   -- select a new training sample
   _nidx_ = (_nidx_ or 0) + 1
   if _nidx_ > (#dataset_inputs)[1] then _nidx_ = 1 end

   local inputs = dataset_inputs[_nidx_]
   local target = dataset_outputs[_nidx_]

   -- reset gradients (gradients are always accumulated, to accommodate
   -- batch methods)
   dl_dx:zero()

   -- evaluate the loss function and its derivative wrt x, for that sample
   local loss_x = criterion:forward(model:forward(inputs), target)
   model:backward(inputs, criterion:backward(model.output, target))

   -- return loss(x) and dloss/dx
   return loss_x, dl_dx
end
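
-- Optional illustration (an addition to the original demo): call the closure
-- once by hand. It returns the loss on one sample and (a view of) the gradient
-- vector; optim.sgd below does exactly this at every step. Note that this call
-- advances the internal sample index _nidx_ by one, which is harmless here.
local loss0, grad0 = feval(x)
print('loss on one sample before training: ' .. loss0)
print('gradient vector has ' .. grad0:size(1) .. ' entries (same as x)')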
-- Given the function above, we can now easily train the model using SGD.
-- For that, we need to define four key parameters:
--   + a learning rate: the size of the step taken at each stochastic
--     estimate of the gradient
--   + a weight decay, to regularize the solution (L2 regularization)
--   + a momentum term, to average steps over time
--   + a learning rate decay, to let the algorithm converge more precisely
sgd_params = {
   learningRate = 1e-3,
   learningRateDecay = 1e-4,
   weightDecay = 0,
   momentum = 0
}
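-- Note (an addition to the original demo, based on how optim.sgd applies its
-- decay): the effective step size shrinks over time, roughly
--    learningRate / (1 + t * learningRateDecay)
-- at evaluation t, so with the values above the step is about halved after
-- 10,000 updates (1e-3 / (1 + 10000 * 1e-4) = 5e-4).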
-- We're now good to go... all we have left to do is run over the dataset
-- for a certain number of iterations, and perform a stochastic update
-- at each iteration. The number of iterations is found empirically here,
-- but should typically be determined using cross-validation (i.e.
-- using multiple folds of training/test subsets).
epochs = 1e2 -- number of times to cycle over our training data
print('')
print('============================================================')
print('Training with SGD')
print('')
for i = 1,epochs do

   -- this variable is used to estimate the average loss
   current_loss = 0

   -- an epoch is a full loop over our training data
   for j = 1,(#dataset_inputs)[1] do

      -- optim contains several optimization algorithms.
      -- All of these algorithms assume the same parameters:
      --   + a closure that computes the loss, and its gradient wrt x,
      --     given a point x
      --   + a point x
      --   + some parameters, which are algorithm-specific
      _,fs = optim.sgd(feval,x,sgd_params)

      -- Functions in optim all return two things:
      --   + the new x, found by the optimization method (here SGD)
      --   + the value of the loss function at all points that were used by
      --     the algorithm. SGD only evaluates the function once, so
      --     that list just contains one value.
      current_loss = current_loss + fs[1]
   end

   -- report average error on epoch
   current_loss = current_loss / (#dataset_inputs)[1]
   print('epoch = ' .. i ..
         ' of ' .. epochs ..
         ' current loss = ' .. current_loss)
end
----------------------------------------------------------------------
-- 4.b. Train the model (Using L-BFGS)
-- now that we know how to train the model using simple SGD, we can
-- use more complex optimization heuristics. In the following, we
-- use a second-order method: L-BFGS, which typically yields
-- more accurate results (for linear models), but can be significantly
-- slower. For very large datasets, SGD is typically much faster
-- to converge, and L-BFGS can be used to refine the results.
-- we start again, and reset the trained parameter vector:
model:reset()
-- next we re-define the closure that evaluates f and df/dx, so that
-- it estimates the true f, and true (exact) df/dx, over the entire
-- dataset. This is a full batch approach.
feval = function(x_new)
   -- set x to x_new, if different
   -- (in this simple example, x_new will typically always point to x,
   -- so the copy is really useless)
   if x ~= x_new then
      x:copy(x_new)
   end

   -- reset gradients (gradients are always accumulated, to accommodate
   -- batch methods)
   dl_dx:zero()

   -- and batch over the whole training dataset:
   local loss_x = 0
   for i = 1,(#dataset_inputs)[1] do
      -- select a new training sample
      _nidx_ = (_nidx_ or 0) + 1
      if _nidx_ > (#dataset_inputs)[1] then _nidx_ = 1 end

      local inputs = dataset_inputs[_nidx_]
      local target = dataset_outputs[_nidx_]

      -- evaluate the loss function and its derivative wrt x, for that sample
      loss_x = loss_x + criterion:forward(model:forward(inputs), target)
      model:backward(inputs, criterion:backward(model.output, target))
   end

   -- normalize with batch size
   loss_x = loss_x / (#dataset_inputs)[1]
   dl_dx = dl_dx:div( (#dataset_inputs)[1] )

   -- return loss(x) and dloss/dx
   return loss_x, dl_dx
end
-- L-BFGS parameters are different from SGD's:
--   + a line search: we provide a line search, which aims at
--     finding the point that minimizes the loss locally
--   + max nb of iterations: the maximum number of iterations for the batch,
--     which is equivalent to the number of epochs on the given batch.
--     In this example, it's simple because the batch is the full dataset,
--     but in some cases the batch can be a small subset of the full dataset,
--     in which case maxIter becomes a more subtle parameter.
lbfgs_params = {
   lineSearch = optim.lswolfe,
   maxIter = epochs,
   verbose = true
}
print('')
print('============================================================')
print('Training with L-BFGS')
print('')
_,fs = optim.lbfgs(feval,x,lbfgs_params)
-- fs contains all the evaluations of f, during optimization
print('history of L-BFGS evaluations:')
print(fs)
----------------------------------------------------------------------
-- 5. Test the trained model.
print('')
print('============================================================')
print('Testing the model')
print('')
-- Now that the model is trained, one can test it by evaluating it
-- on new samples.
-- The model constructed and trained above computes the probabilities
-- of each class given the input values.
-- We want to compare our model's results with those from the text.
-- The input variables have narrow ranges, so we simply evaluate every
-- (female, age) combination in the range of the training data.
-- First, determine the observed frequency of each brand for every
-- (female, age) pair in the training data.
-- return index of largest value
function maxIndex(a,b,c)
   if a >= b and a >= c then return 1
   elseif b >= a and b >= c then return 2
   else return 3 end
end
-- return predicted brand and probabilities of each brand
-- for the model in the text
-- The R code in the text computes the probabilities of choosing
-- brands 2 and 3 relative to the probability of choosing brand 1:
-- Prob(brand=2)/prob(brand=1) = exp(-11.77 + 0.52*female + 0.37*age)
-- Prob(brand=3)/prob(brand=1) = exp(-22.72 + 0.47*female + 0.69*age)
function predictText(age, female)
   -- 1: calculate the logits
   -- The coefficients come from the text.
   -- If you download the R script and run it, you may see slightly
   -- different results.
   local logit1 = 0
   local logit2 = -11.774655 + 0.523814 * female + 0.368206 * age
   local logit3 = -22.721396 + 0.465941 * female + 0.685908 * age

   -- 2: calculate the unnormalized probabilities
   local uprob1 = math.exp(logit1)
   local uprob2 = math.exp(logit2)
   local uprob3 = math.exp(logit3)

   -- 3: normalize the probabilities
   local z = uprob1 + uprob2 + uprob3
   local prob1 = (1/z) * uprob1
   local prob2 = (1/z) * uprob2
   local prob3 = (1/z) * uprob3

   return maxIndex(prob1, prob2, prob3), prob1, prob2, prob3
end
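
-- Optional illustration (an addition to the original demo): query the text's
-- model directly. Age 32 and female = 0 are just sample values, assumed to lie
-- within the data's range.
local tBrand, tP1, tP2, tP3 = predictText(32, 0)
print(string.format('text model, age 32, male: brand %d (%.3f %.3f %.3f)',
                    tBrand, tP1, tP2, tP3))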
-- return predicted brand and the probabilities of each brand
-- for our model
function predictOur(age, female)
   local input = torch.Tensor(2)
   input[1] = female  -- must be in same order as when the model was trained!
   input[2] = age
   local logProbs = model:forward(input)
   --print('predictOur', age, female, input)
   local probs = torch.exp(logProbs)
   --print('logProbs', logProbs)
   --print('probs', probs[1], probs[2], probs[3] )
   local prob1, prob2, prob3 = probs[1], probs[2], probs[3]
   return maxIndex(prob1, prob2, prob3), prob1, prob2, prob3
end
counts = {}
function makeKey(age, brand, female)
   -- return a string containing the values
   -- Note that returning a table will not work, because each
   -- table is unique.
   -- Because Lua interns the strings, a string with a given sequence
   -- of characters is stored only once.
   return string.format('%2d%1d%1f', age, brand, female)
end
for i = 1,(#brands)[1] do
   local brand = brands[i]
   local female = females[i]
   local age = ages[i]
   local key = makeKey(age, brand, female)
   counts[key] = (counts[key] or 0) + 1
end
-- return probability of each brand conditioned on age and female
function actualProbabilities(age, female)
   function countOf(age, brand, female)
      return counts[makeKey(age, brand, female)] or 0
   end
   local count1 = countOf(age, 1, female)
   local count2 = countOf(age, 2, female)
   local count3 = countOf(age, 3, female)
   local sumCounts = count1 + count2 + count3
   if sumCounts == 0 then
      return 0, 0, 0
   else
      return count1/sumCounts, count2/sumCounts, count3/sumCounts
   end
end
print(' ')
print('summary of data')
summarizeData()
print(' ')
print('training variables')
for k,v in pairs(sgd_params) do
   print(string.format('%20s %f', k, v))
end
print(string.format('%20s %f', 'epochs', epochs))
print(' ')
print('current loss', current_loss)
-- print the headers
print(' ')
lineFormat = '%-6s %-3s| %-17s | %-17s | %-17s | %-1s %-1s %-1s'
print(string.format(lineFormat,
                    '', '',
                    'actual probs', 'text probs', 'our probs',
                    'best', '', ''))
choices = 'brnd1 brnd2 brnd3'
print(string.format(lineFormat,
                    'female', 'age',
                    choices, choices, choices,
                    'a', 't', 'o'))
-- print each row in the table
function formatFemale(female)
   return string.format('%1d', female)
end

function formatAge(age)
   return string.format('%2d', age)
end

function formatProbs(p1, p2, p3)
   return string.format('%5.3f %5.3f %5.3f', p1, p2, p3)
end

function indexString(p1, p2, p3)
   -- return index of highest probability or '-' if nearly all zeroes
   if p1 < 0.001 and p2 < 0.001 and p3 < 0.001 then
      return '-'
   else
      return string.format('%1d', maxIndex(p1, p2, p3))
   end
end
-- print one row for each (female, age) combination
for female = 0,1 do
   for age = torch.min(ages),torch.max(ages) do
      -- calculate the actual probabilities in the training data
      local actual1, actual2, actual3 = actualProbabilities(age, female)
      -- calculate the prediction and probabilities using the model in the text
      local textBrand, textProb1, textProb2, textProb3 =
         predictText(age, female)
      -- calculate the probabilities using the model we just trained
      --print("main", age, female)
      local ourBrand, ourProb1, ourProb2, ourProb3 =
         predictOur(age, female)
      print(
         string.format(lineFormat,
                       formatFemale(female),
                       formatAge(age),
                       formatProbs(actual1, actual2, actual3),
                       formatProbs(textProb1, textProb2, textProb3),
                       formatProbs(ourProb1, ourProb2, ourProb3),
                       indexString(actual1, actual2, actual3),
                       indexString(textProb1, textProb2, textProb3),
                       indexString(ourProb1, ourProb2, ourProb3))
      )
   end
end