Commit 62696d9
NCE:multicuda()
1 parent f46acf0 commit 62696d9

3 files changed (+164, -13 lines)

Module.lua (+2, -1)

@@ -484,7 +484,8 @@ function Module:updateGradParameters(momFactor, momDamp, momNesterov)
    end
    local momGradParams = self:momentumGradParameters()
    for i,gradParam in pairs(gradParams) do
-      momGradParams[i]:mul(momFactor):add(1-momDamp, gradParam)
+      momGradParams[i]:mul(momFactor)
+      momGradParams[i]:add(1-momDamp, gradParam)
    end
 
    if momNesterov then
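The Module.lua change splits the chained `mul():add()` momentum update into two separate statements, presumably so the same code path also works when a momentum buffer is a `torch.MultiCudaTensor`, whose in-place methods may not support method chaining. A minimal sketch of the two equivalent forms on a plain tensor (the values are illustrative only):

-- both forms compute momGradParam = momFactor*momGradParam + (1-momDamp)*gradParam
local momFactor, momDamp = 0.9, 0.9
local momGradParam = torch.Tensor(3):fill(1)   -- stands in for momGradParams[i]
local gradParam = torch.Tensor(3):fill(0.5)

-- old form: relies on :mul() returning the tensor so :add() can be chained
-- momGradParam:mul(momFactor):add(1-momDamp, gradParam)

-- new form: the same update as two separate in-place calls
momGradParam:mul(momFactor)
momGradParam:add(1-momDamp, gradParam)
print(momGradParam)   -- each element: 1*0.9 + (1-0.9)*0.5 = 0.95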

NCEModule.lua (+44, -6)

@@ -3,7 +3,7 @@
 -- Ref.: A. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
 ------------------------------------------------------------------------
 local NCEModule, parent = torch.class("nn.NCEModule", "nn.Linear")
-NCEModule.version = 3
+NCEModule.version = 4 -- added multicuda()
 
 -- for efficient serialization
 local empty = _.clone(parent.dpnn_mediumEmpty)
@@ -62,7 +62,7 @@ function NCEModule:updateOutput(inputTable)
       if self.addBuffer:nElement() ~= batchsize then
         self.addBuffer:resize(batchsize):fill(1)
       end
-      self.linout:addmm(0, self.linout, 1, input, self.weight:t())
+      self.weight.addmm(self.linout, 0, self.linout, 1, input, self.weight:t())
       if self.bias then self.linout:addr(1, self.addBuffer, self.bias) end
       self.output = torch.type(self.output) == 'table' and input.new() or self.output
       if self.logsoftmax then
@@ -102,8 +102,8 @@ function NCEModule:updateOutput(inputTable)
       end
 
       -- build (batchsize x k+1 x inputsize) weight tensor
-      self._weight = self._weight or self.weight.new()
-      self._weight:index(self.weight, 1, self.sampleidx:view(-1))
+      self._weight = self._weight or self.bias.new()
+      self.weight.index(self._weight, self.weight, 1, self.sampleidx:view(-1))
       assert(self._weight:nElement() == batchsize*(self.k+1)*inputsize)
       self._weight:resize(batchsize, self.k+1, inputsize)
 
@@ -190,7 +190,7 @@ function NCEModule:accGradParameters(inputTable, gradOutput, scale)
    local batchsize = input:size(1)
    local inputsize = self.weight:size(2)
 
-   self._gradWeight = self._gradWeight or self.gradWeight.new()
+   self._gradWeight = self._gradWeight or self.bias.new()
    self._gradWeight:resizeAs(self._weight):zero() -- batchsize x k+1 x inputsize
    self._gradOutput:resize(batchsize, self.k+1, 1)
    self._gradOutput:mul(scale)
@@ -216,7 +216,23 @@ function NCEModule:type(type, cache)
    local unigrams = self.unigrams
    self.unigrams = nil
    local am = self.aliasmultinomial
-   local rtn = parent.type(self, type, cache)
+
+   local rtn
+   if type and torch.type(self.weight) == 'torch.MultiCudaTensor' then
+      assert(type == 'torch.CudaTensor', "Cannot convert a multicuda NCEModule to anything other than cuda")
+      local weight = self.weight
+      local gradWeight = self.gradWeight
+      self.weight = nil
+      self.gradWeight = nil
+
+      rtn = parent.type(self, type, cache)
+
+      self.weight = weight
+      self.gradWeight = gradWeight
+   else
+      rtn = parent.type(self, type, cache)
+   end
+
    self.unigrams = unigrams
    self.aliasmultinomial = am
    return rtn
@@ -264,3 +280,25 @@ function NCEModule:clearState()
       gradInput:set()
    end
 end
+
+function NCEModule:multicuda(device1, device2)
+   assert(device1 and device2, "specify two devices as arguments")
+   require 'torchx'
+   assert(torchx.version and torchx.version >= 1, "update torchx: luarocks install torchx")
+
+   self:float()
+
+   local isize = self.weight:size(2)
+   local weights = {
+      cutorch.withDevice(device1, function() return self.weight[{{}, {1, torch.round(isize/2)}}]:cuda() end),
+      cutorch.withDevice(device2, function() return self.weight[{{}, {torch.round(isize/2)+1, isize}}]:cuda() end)
+   }
+   self.weight = torch.MultiCudaTensor(2, weights)
+   local gradWeights = {
+      cutorch.withDevice(device1, function() return self.gradWeight[{{}, {1, torch.round(isize/2)}}]:cuda() end),
+      cutorch.withDevice(device2, function() return self.gradWeight[{{}, {torch.round(isize/2)+1, isize}}]:cuda() end)
+   }
+   self.gradWeight = torch.MultiCudaTensor(2, gradWeights)
+
+   self:cuda()
+end
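The new NCEModule:multicuda(device1, device2) splits the output-layer weight and gradWeight column-wise (along the inputsize dimension) across two GPUs and wraps the two halves in a torch.MultiCudaTensor; the rest of the module is converted with self:cuda(). Based on this method and the tests below, usage would look roughly like the following sketch (the sizes and the device ids 1 and 2 are assumptions for illustration):

require 'dpnn'
require 'cunn'
require 'torchx'  -- provides torch.MultiCudaTensor

local hiddensize, nclass, k = 200, 1000000, 25   -- illustrative sizes
local unigrams = torch.Tensor(nclass):uniform(0, 1)
local nce = nn.NCEModule(hiddensize, nclass, k, unigrams)

-- distribute the (nclass x hiddensize) weight and gradWeight over GPUs 1 and 2
nce:multicuda(1, 2)

local batchsize = 50
local input = torch.randn(batchsize, hiddensize):cuda()
local target = torch.LongTensor(batchsize):random(1, nclass):cuda()
local output = nce:forward{input, target}   -- a table of four tensors, as asserted in dpnntest.NCE_multicuda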

test/test.lua (+118, -6)

@@ -2199,7 +2199,7 @@ function dpnntest.OneHot()
    end
 end
 
-function dpnntest.NCE()
+function dpnntest.NCE_main()
    local batchsize = 4
    local k = 10
    local inputsize = 3
@@ -2353,7 +2353,7 @@ function dpnntest.NCE()
    local linear = nn.Linear(inputsize, outputsize)
    linear.weight:copy(ncem.weight)
    linear.bias:copy(ncem.bias)
-   local mlp = nn.Sequential():add(linear):add(nn.Exp())
+   local mlp = nn.Sequential():add(linear):add(nn.Exp()):add(nn.MulConstant(1/ncem.Z[1]))
    mlp:cuda()
 
    local output2_ = mlp:forward(input)
@@ -2455,6 +2455,8 @@ function dpnntest.NCE_multinomial()
 end
 
 function dpnnbigtest.NCE_benchmark()
+   pcall(function() require 'cunn' end) -- make sure to import cunn before initializing large tensors, else weird segfault...
+
    local nclass = 1000000
    local hiddensize = 200
    local batchsize = 50
@@ -2483,8 +2485,6 @@ function dpnnbigtest.NCE_benchmark()
       sync = function() cutorch.synchronize() end
    end
 
-   print(torch.type(nce.unigrams))
-
    local output = nce:forward{input, target}
   local loss = crit:forward(output, target)
   local gradOutput = crit:backward(output, target)
@@ -2494,8 +2494,8 @@ function dpnnbigtest.NCE_benchmark()
    local loss = nll:forward(output, target)
    local gradOutput = nll:backward(output, target)
    local gradInput = mlp:backward(input, gradOutput)
-   sync()
 
+   sync()
    local a = torch.Timer()
    for i=1,nloop do
       output = nce:forward{input, target}
@@ -2525,7 +2525,6 @@ function dpnnbigtest.NCE_benchmark()
    local ncebwd = a:time().real
 
    -- mlp nll
-
    local a = torch.Timer()
    for i=1,nloop do
       output = mlp:forward(input)
@@ -2561,6 +2560,38 @@ function dpnnbigtest.NCE_benchmark()
    print("criterion:backward (nce vs nll)", critbwd, nllbwd)
    print("module:backward (nce vs linear)", ncebwd, mlpbwd)
    print("total (nce vs linear)", ncetotal, lintotal, lintotal/ncetotal)
+
+   if not (cunn and cutorch.getDeviceCount() > 1) then
+      return
+   end
+
+   nce:multicuda(1,2)
+
+   local output = nce:forward{input, target}
+   local loss = crit:forward(output, target)
+   local gradOutput = crit:backward(output, target)
+   local gradInput = nce:backward({input, target}, gradOutput)
+   sync()
+
+   local a = torch.Timer()
+   for i=1,nloop do
+      output = nce:forward{input, target}
+   end
+   sync()
+   local ncefwd2 = a:time().real
+
+   a:reset()
+   for i=1,nloop do
+      gradInput = nce:backward({input, target}, gradOutput)
+   end
+   sync()
+   local ncebwd2 = a:time().real
+
+   local total1 = ncefwd+ncebwd
+   local total2 = ncefwd2+ncebwd2
+   print("module:forward (1 vs 2 gpu)", ncefwd, ncefwd2)
+   print("module:backward (1 vs 2 gpu)", ncebwd, ncebwd2)
+   print("total (1 vs 2 gpu)", total1, total2, total2/total1)
 end
 
 function dpnntest.NaN()
@@ -2599,6 +2630,87 @@ function dpnntest.NaN()
    mytester:assert(not pcall(function() nan:backward(input, gradOutput) end))
 end
 
+function dpnntest.NCE_multicuda()
+   if not pcall(function() require 'torchx' end) then
+      return
+   end
+   if not pcall(function() require 'cunn' end) then
+      return
+   end
+   if cutorch.getDeviceCount() < 2 then
+      return
+   end
+   assert(torchx.version and torchx.version >= 1, "Update torchx")
+
+   local nclass = 1000
+   local hiddensize = 20
+   local batchsize = 5
+   local k = 25
+   local unigrams = torch.Tensor(nclass):uniform(0,1)
+   local noise = torch.LongTensor(batchsize, k):random(1,nclass)
+
+   local crit = nn.NCECriterion():cuda()
+   local crit2 = nn.NCECriterion():cuda()
+
+   local nce = nn.NCEModule(hiddensize, nclass, k, unigrams)
+
+   -- make it deterministic
+   nce.noiseSample = function(self, sampleidx, batchsize, k)
+      sampleidx:resize(batchsize, k)
+      sampleidx:copy(noise)
+      return sampleidx
+   end
+
+   local nce2 = nce:clone()
+   nce2:cuda()
+
+   local input = torch.randn(batchsize, hiddensize):cuda()
+   local target = torch.LongTensor(batchsize):random(1,nclass):cuda()
+
+   nce:multicuda(1, 2)
+
+   local output = nce:forward{input, target}
+   local loss = crit:forward(output, target)
+   local gradOutput = crit:backward(output, target)
+   nce:zeroGradParameters()
+   local gradInput = nce:backward({input, target}, gradOutput)
+
+   local output2 = nce2:forward{input, target}
+   local loss2 = crit2:forward(output2, target)
+   local gradOutput2 = crit2:backward(output2, target)
+   nce2:zeroGradParameters()
+   local gradInput2 = nce2:backward({input, target}, gradOutput2)
+
+   mytester:assertTensorEq(output[1], output2[1], 0.00001)
+   mytester:assertTensorEq(output[2], output2[2], 0.00001)
+   mytester:assertTensorEq(output[3], output2[3], 0.00001)
+   mytester:assertTensorEq(output[4], output2[4], 0.00001)
+
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.00001)
+   mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.00001)
+
+   nce2:updateParameters(0.1)
+   nce:updateParameters(0.1)
+
+   mytester:assertTensorEq(nce2.bias, nce.bias, 0.000001)
+   mytester:assertTensorEq(nce2.gradBias, nce.gradBias, 0.000001)
+   mytester:assertTensorEq(nce2.weight[{{},{1,hiddensize/2}}]:float(), nce.weight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.weight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.weight.tensors[2]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1,hiddensize/2}}]:float(), nce.gradWeight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.gradWeight.tensors[2]:float(), 0.000001)
+
+   -- test momentum
+   nce2:updateGradParameters(0.9)
+   nce:updateGradParameters(0.9)
+
+   mytester:assertTensorEq(nce2.gradBias, nce.gradBias, 0.000001)
+   mytester:assertTensorEq(nce2.momGradParams[1][{{},{1,hiddensize/2}}]:float(), nce.momGradParams[1].tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.momGradParams[1][{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.momGradParams[1].tensors[2]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1,hiddensize/2}}]:float(), nce.gradWeight.tensors[1]:float(), 0.000001)
+   mytester:assertTensorEq(nce2.gradWeight[{{},{1+(hiddensize/2), hiddensize}}]:float(), nce.gradWeight.tensors[2]:float(), 0.000001)
+end
+
 function dpnn.test(tests)
    mytester = torch.Tester()
    mytester:add(dpnntest)
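Assuming the usual dpnn test entry point (dpnn.test accepts an optional list of test names that it passes on to torch.Tester:run), the new unit test could be run on its own with something like:

-- a sketch, assuming torchx >= 1, cunn and at least two CUDA devices are installed;
-- the test returns early (silently passes) otherwise
local dpnn = require 'dpnn'
dpnn.test{'NCE_multicuda'}

If the corresponding dpnn.bigtest entry point is present, the two-GPU timings are printed by dpnn.bigtest{'NCE_benchmark'}, which skips the multicuda portion when fewer than two devices are visible.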
