diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs index 3ce5102fa1..d6de490d52 100644 --- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs +++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs @@ -92,5 +92,20 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, C OutputData = view }; } + + [TlcModule.EntryPoint(Name = "Transforms.LightLda", Desc = LdaTransform.Summary, UserName = LdaTransform.UserName, ShortName = LdaTransform.ShortName)] + public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTransform.Arguments input) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(input, nameof(input)); + + var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LightLda", input); + var view = new LdaTransform(h, input, input.Data); + return new CommonOutputs.TransformOutput() + { + Model = new TransformModel(h, view, input.Data), + OutputData = view + }; + } } } diff --git a/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs new file mode 100644 index 0000000000..36e55099d9 --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs @@ -0,0 +1,357 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; +using System.Security; + +namespace Microsoft.ML.Runtime.TextAnalytics +{ + + internal static class LdaInterface + { + public struct LdaEngine + { + public IntPtr Ptr; + } + + private const string NativeDll = "LdaNative"; + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern LdaEngine CreateEngine(int numTopic, int numVocab, float alphaSum, float beta, int numIter, + int likelihoodInterval, int numThread, int mhstep, int maxDocToken); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void AllocateModelMemory(LdaEngine engine, int numTopic, int numVocab, long tableSize, long aliasTableSize); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void AllocateDataMemory(LdaEngine engine, int docNum, long corpusSize); + + [DllImport(NativeDll, CharSet = CharSet.Ansi), SuppressUnmanagedCodeSecurity] + internal static extern void Train(LdaEngine engine, string trainOutput); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetModelStat(LdaEngine engine, out long memBlockSize, out long aliasMemBlockSize); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void Test(LdaEngine engine, int numBurninIter, float[] pLogLikelihood); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void CleanData(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void CleanModel(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void DestroyEngine(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetWordTopic(LdaEngine engine, int wordId, int[] pTopic, int[] pProb, ref int length); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void SetWordTopic(LdaEngine engine, int wordId, 
int[] pTopic, int[] pProb, int length); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void SetAlphaSum(LdaEngine engine, float avgDocLength); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern int FeedInData(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int numVocab); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern int FeedInDataDense(LdaEngine engine, int[] termFreq, int termNum, int numVocab); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetDocTopic(LdaEngine engine, int docId, int[] pTopic, int[] pProb, ref int numTopicReturn); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetTopicSummary(LdaEngine engine, int topicId, int[] pWords, float[] pProb, ref int numTopicReturn); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void TestOneDoc(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurnIter, bool reset); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void TestOneDocDense(LdaEngine engine, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurninIter, bool reset); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void InitializeBeforeTrain(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void InitializeBeforeTest(LdaEngine engine); + } + + internal sealed class LdaSingleBox : IDisposable + { + private LdaInterface.LdaEngine _engine; + private bool _isDisposed; + private int[] _topics; + private int[] _probabilities; + private int[] _summaryTerm; + private float[] _summaryTermProb; + private readonly int _likelihoodInterval; + private readonly float _alpha; + private readonly float _beta; + private readonly int _mhStep; + private readonly int _numThread; + private readonly int _numSummaryTerms; + private readonly bool _denseOutput; + + public readonly int NumTopic; + public readonly int NumVocab; + public LdaSingleBox(int numTopic, int numVocab, float alpha, + float beta, int numIter, int likelihoodInterval, int numThread, + int mhstep, int numSummaryTerms, bool denseOutput, int maxDocToken) + { + NumTopic = numTopic; + NumVocab = numVocab; + _alpha = alpha; + _beta = beta; + _mhStep = mhstep; + _numSummaryTerms = numSummaryTerms; + _denseOutput = denseOutput; + _likelihoodInterval = likelihoodInterval; + _numThread = numThread; + + _topics = new int[numTopic]; + _probabilities = new int[numTopic]; + + _summaryTerm = new int[_numSummaryTerms]; + _summaryTermProb = new float[_numSummaryTerms]; + + _engine = LdaInterface.CreateEngine(numTopic, numVocab, alpha, beta, numIter, likelihoodInterval, numThread, mhstep, maxDocToken); + } + + public void AllocateModelMemory(int numTopic, int numVocab, long tableSize, long aliasTableSize) + { + Contracts.Check(numTopic >= 0); + Contracts.Check(numVocab >= 0); + Contracts.Check(tableSize >= 0); + Contracts.Check(aliasTableSize >= 0); + LdaInterface.AllocateModelMemory(_engine, numVocab, numTopic, tableSize, aliasTableSize); + } + + public void AllocateDataMemory(int docNum, long corpusSize) + { + Contracts.Check(docNum >= 0); + Contracts.Check(corpusSize >= 0); + LdaInterface.AllocateDataMemory(_engine, docNum, corpusSize); + } + + public void Train(string trainOutput) + { + if 
(string.IsNullOrWhiteSpace(trainOutput)) + LdaInterface.Train(_engine, null); + else + LdaInterface.Train(_engine, trainOutput); + } + + public void GetModelStat(out long memBlockSize, out long aliasMemBlockSize) + { + LdaInterface.GetModelStat(_engine, out memBlockSize, out aliasMemBlockSize); + } + + public void Test(int numBurninIter, float[] logLikelihood) + { + Contracts.Check(numBurninIter >= 0); + var pLogLikelihood = new float[numBurninIter]; + LdaInterface.Test(_engine, numBurninIter, pLogLikelihood); + logLikelihood = pLogLikelihood.Select(item => (float)item).ToArray(); + } + + public void CleanData() + { + LdaInterface.CleanData(_engine); + } + + public void CleanModel() + { + LdaInterface.CleanModel(_engine); + } + + public void CopyModel(LdaSingleBox trainer, int wordId) + { + int length = NumTopic; + LdaInterface.GetWordTopic(trainer._engine, wordId, _topics, _probabilities, ref length); + LdaInterface.SetWordTopic(_engine, wordId, _topics, _probabilities, length); + } + + public void SetAlphaSum(float averageDocLength) + { + LdaInterface.SetAlphaSum(_engine, averageDocLength); + } + + public int LoadDoc(int[] termID, double[] termVal, int termNum, int numVocab) + { + Contracts.Check(numVocab == NumVocab); + Contracts.Check(termNum > 0); + Contracts.Check(termID.Length >= termNum); + Contracts.Check(termVal.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + Array.Copy(termID, pID, termNum); + return LdaInterface.FeedInData(_engine, pID, pVal, termNum, NumVocab); + } + + public int LoadDocDense(double[] termVal, int termNum, int numVocab) + { + Contracts.Check(numVocab == NumVocab); + Contracts.Check(termNum > 0); + + Contracts.Check(termVal.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + return LdaInterface.FeedInDataDense(_engine, pVal, termNum, NumVocab); + + } + + public List> GetDocTopicVector(int docID) + { + int numTopicReturn = NumTopic; + LdaInterface.GetDocTopic(_engine, docID, _topics, _probabilities, ref numTopicReturn); + var topicRet = new List>(); + int currentTopic = 0; + for (int i = 0; i < numTopicReturn; i++) + { + if (_denseOutput) + { + while (currentTopic < _topics[i]) + { + //use a value to smooth the count so that we get dense output on each topic + //the smooth value is usually set to 0.1 + topicRet.Add(new KeyValuePair(currentTopic, (float)_alpha)); + currentTopic++; + } + topicRet.Add(new KeyValuePair(_topics[i], _probabilities[i] + (float)_alpha)); + currentTopic++; + } + else + { + topicRet.Add(new KeyValuePair(_topics[i], (float)_probabilities[i])); + } + } + + if (_denseOutput) + { + while (currentTopic < NumTopic) + { + topicRet.Add(new KeyValuePair(currentTopic, (float)_alpha)); + currentTopic++; + } + } + return topicRet; + } + + public List> TestDoc(int[] termID, double[] termVal, int termNum, int numBurninIter, bool reset) + { + Contracts.Check(termNum > 0); + Contracts.Check(termVal.Length >= termNum); + Contracts.Check(termID.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pTopic = new int[NumTopic]; + int[] pProb = new int[NumTopic]; + Array.Copy(termID, pID, termNum); + + int numTopicReturn = NumTopic; + + LdaInterface.TestOneDoc(_engine, pID, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset); + + // PREfast suspects that the value of numTopicReturn could be changed in _engine->TestOneDoc, which might result in 
read overrun in the following loop. + if (numTopicReturn > NumTopic) + { + Contracts.Check(false); + numTopicReturn = NumTopic; + } + + var topicRet = new List>(); + for (int i = 0; i < numTopicReturn; i++) + topicRet.Add(new KeyValuePair(pTopic[i], (float)pProb[i])); + return topicRet; + } + + public List> TestDocDense(double[] termVal, int termNum, int numBurninIter, bool reset) + { + Contracts.Check(termNum > 0); + Contracts.Check(numBurninIter > 0); + Contracts.Check(termVal.Length >= termNum); + int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pTopic = new int[NumTopic]; + int[] pProb = new int[NumTopic]; + + int numTopicReturn = NumTopic; + + // There are two versions of TestOneDoc interfaces + // (1) TestOneDoc + // (2) TestOneDocRestart + // The second one is the same as the first one except that it will reset + // the states of the internal random number generator, so that it yields reproducable results for the same input + LdaInterface.TestOneDocDense(_engine, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset); + + // PREfast suspects that the value of numTopicReturn could be changed in _engine->TestOneDoc, which might result in read overrun in the following loop. + if (numTopicReturn > NumTopic) + { + Contracts.Check(false); + numTopicReturn = NumTopic; + } + + var topicRet = new List>(); + for (int i = 0; i < numTopicReturn; i++) + topicRet.Add(new KeyValuePair(pTopic[i], (float)pProb[i])); + return topicRet; + } + + public void InitializeBeforeTrain() + { + LdaInterface.InitializeBeforeTrain(_engine); + } + + public void InitializeBeforeTest() + { + LdaInterface.InitializeBeforeTest(_engine); + } + + public KeyValuePair[] GetModel(int wordId) + { + int length = NumTopic; + LdaInterface.GetWordTopic(_engine, wordId, _topics, _probabilities, ref length); + var wordTopicVector = new KeyValuePair[length]; + + for (int i = 0; i < length; i++) + wordTopicVector[i] = new KeyValuePair(_topics[i], _probabilities[i]); + return wordTopicVector; + } + + public KeyValuePair[] GetTopicSummary(int topicId) + { + int length = _numSummaryTerms; + LdaInterface.GetTopicSummary(_engine, topicId, _summaryTerm, _summaryTermProb, ref length); + var topicSummary = new KeyValuePair[length]; + + for (int i = 0; i < length; i++) + topicSummary[i] = new KeyValuePair(_summaryTerm[i], _summaryTermProb[i]); + return topicSummary; + } + + public void SetModel(int termID, int[] topicID, int[] topicProb, int topicNum) + { + Contracts.Check(termID >= 0); + Contracts.Check(topicNum <= NumTopic); + Array.Copy(topicID, _topics, topicNum); + Array.Copy(topicProb, _probabilities, topicNum); + LdaInterface.SetWordTopic(_engine, termID, _topics, _probabilities, topicNum); + } + + public void Dispose() + { + if (_isDisposed) + return; + _isDisposed = true; + LdaInterface.DestroyEngine(_engine); + _engine.Ptr = IntPtr.Zero; + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs new file mode 100644 index 0000000000..1267f634cd --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs @@ -0,0 +1,962 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Float = System.Single; + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.TextAnalytics; + +[assembly: LoadableClass(typeof(LdaTransform), typeof(LdaTransform.Arguments), typeof(SignatureDataTransform), + LdaTransform.UserName, LdaTransform.LoaderSignature, LdaTransform.ShortName, DocName = "transform/LdaTransform.md")] + +[assembly: LoadableClass(typeof(LdaTransform), null, typeof(SignatureLoadDataTransform), + LdaTransform.UserName, LdaTransform.LoaderSignature)] + +namespace Microsoft.ML.Runtime.TextAnalytics +{ + /// + /// The latent Dirichlet allocation (LDA) transform. + /// http://arxiv.org/abs/1412.1576 + /// + public sealed class LdaTransform : OneToOneTransformBase + { + public sealed class Arguments : TransformInputBase + { + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:srcs)", ShortName = "col", SortOrder = 49)] + public Column[] Column; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of topics in the LDA", SortOrder = 50)] + [TGUI(SuggestedSweeps = "20,40,100,200")] + [TlcModule.SweepableDiscreteParam("NumTopic", new object[] { 20, 40, 100, 200 })] + public int NumTopic = 100; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on document-topic vectors")] + [TGUI(SuggestedSweeps = "1,10,100,200")] + [TlcModule.SweepableDiscreteParam("AlphaSum", new object[] { 1, 10, 100, 200 })] + public Single AlphaSum = 100; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on vocab-topic vectors")] + [TGUI(SuggestedSweeps = "0.01,0.015,0.07,0.02")] + [TlcModule.SweepableDiscreteParam("Beta", new object[] { 0.01f, 0.015f, 0.07f, 0.02f })] + public Single Beta = 0.01f; + + [Argument(ArgumentType.Multiple, HelpText = "Number of Metropolis Hasting step")] + [TGUI(SuggestedSweeps = "2,4,8,16")] + [TlcModule.SweepableDiscreteParam("Mhstep", new object[] { 2, 4, 8, 16 })] + public int Mhstep = 4; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter")] + [TGUI(SuggestedSweeps = "100,200,300,400")] + [TlcModule.SweepableDiscreteParam("NumIterations", new object[] { 100, 200, 300, 400 })] + public int NumIterations = 200; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Compute log likelihood over local dataset on this iteration interval", ShortName = "llInterval")] + public int LikelihoodInterval = 5; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The threshold of maximum count of tokens per doc", ShortName = "maxNumToken", SortOrder = 50)] + public int NumMaxDocToken = 512; + + // REVIEW: Should change the default when multi-threading support is optimized. + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of training threads. Default value depends on number of logical processors.", ShortName = "t", SortOrder = 50)] + public int? 
NumThreads; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of words to summarize the topic", ShortName = "ns")] + public int NumSummaryTermPerTopic = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of burn-in iterations", ShortName = "burninIter")] + [TGUI(SuggestedSweeps = "10,20,30,40")] + [TlcModule.SweepableDiscreteParam("NumBurninIterations", new object[] { 10, 20, 30, 40 })] + public int NumBurninIterations = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")] + public bool ResetRandomGenerator; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format", ShortName = "summary")] + public bool OutputTopicWordSummary; + } + + public sealed class Column : OneToOneColumn + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of topics in the LDA")] + public int? NumTopic; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on document-topic vectors")] + public Single? AlphaSum; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on vocab-topic vectors")] + public Single? Beta; + + [Argument(ArgumentType.Multiple, HelpText = "Number of Metropolis Hasting step")] + public int? Mhstep; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter")] + public int? NumIterations; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Compute log likelihood over local dataset on this iteration interval", ShortName = "llInterval")] + public int? LikelihoodInterval; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of training threads", ShortName = "t")] + public int? NumThreads; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The threshold of maximum count of tokens per doc", ShortName = "maxNumToken")] + public int? NumMaxDocToken; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of words to summarize the topic", ShortName = "ns")] + public int? NumSummaryTermPerTopic; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of burn-in iterations", ShortName = "burninIter")] + public int? NumBurninIterations = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")] + public bool? ResetRandomGenerator; + + public static Column Parse(string str) + { + Contracts.AssertNonEmpty(str); + + var res = new Column(); + if (res.TryParse(str)) + return res; + return null; + } + + public bool TryUnparse(StringBuilder sb) + { + Contracts.AssertValue(sb); + if (NumTopic != null || AlphaSum != null || Beta != null || Mhstep != null || NumIterations != null || LikelihoodInterval != null || + NumThreads != null || NumMaxDocToken != null || NumSummaryTermPerTopic != null || ResetRandomGenerator != null) + return false; + return TryUnparseCore(sb); + } + } + + private sealed class ColInfoEx + { + public readonly int NumTopic; + public readonly Single AlphaSum; + public readonly Single Beta; + public readonly int MHStep; + public readonly int NumIter; + public readonly int LikelihoodInterval; + public readonly int NumThread; + public readonly int NumMaxDocToken; + public readonly int NumSummaryTermPerTopic; + public readonly int NumBurninIter; + public readonly bool ResetRandomGenerator; + + public ColInfoEx(IExceptionContext ectx, Column item, Arguments args) + { + Contracts.AssertValue(ectx); + + NumTopic = item.NumTopic ?? 
args.NumTopic; + Contracts.CheckUserArg(NumTopic > 0, nameof(item.NumTopic), "Must be positive."); + + AlphaSum = item.AlphaSum ?? args.AlphaSum; + + Beta = item.Beta ?? args.Beta; + + MHStep = item.Mhstep ?? args.Mhstep; + ectx.CheckUserArg(MHStep > 0, nameof(item.Mhstep), "Must be positive."); + + NumIter = item.NumIterations ?? args.NumIterations; + ectx.CheckUserArg(NumIter > 0, nameof(item.NumIterations), "Must be positive."); + + LikelihoodInterval = item.LikelihoodInterval ?? args.LikelihoodInterval; + ectx.CheckUserArg(LikelihoodInterval > 0, nameof(item.LikelihoodInterval), "Must be positive."); + + NumThread = item.NumThreads ?? args.NumThreads ?? 0; + ectx.CheckUserArg(NumThread >= 0, nameof(item.NumThreads), "Must be positive or zero."); + + NumMaxDocToken = item.NumMaxDocToken ?? args.NumMaxDocToken; + ectx.CheckUserArg(NumMaxDocToken > 0, nameof(item.NumMaxDocToken), "Must be positive."); + + NumSummaryTermPerTopic = item.NumSummaryTermPerTopic ?? args.NumSummaryTermPerTopic; + ectx.CheckUserArg(NumSummaryTermPerTopic > 0, nameof(item.NumSummaryTermPerTopic), "Must be positive"); + + NumBurninIter = item.NumBurninIterations ?? args.NumBurninIterations; + ectx.CheckUserArg(NumBurninIter >= 0, nameof(item.NumBurninIterations), "Must be non-negative."); + + ResetRandomGenerator = item.ResetRandomGenerator ?? args.ResetRandomGenerator; + } + + public ColInfoEx(IExceptionContext ectx, ModelLoadContext ctx) + { + Contracts.AssertValue(ectx); + ectx.AssertValue(ctx); + + // *** Binary format *** + // int NumTopic; + // Single AlphaSum; + // Single Beta; + // int MHStep; + // int NumIter; + // int LikelihoodInterval; + // int NumThread; + // int NumMaxDocToken; + // int NumSummaryTermPerTopic; + // int NumBurninIter; + // byte ResetRandomGenerator; + + NumTopic = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumTopic > 0); + + AlphaSum = ctx.Reader.ReadSingle(); + + Beta = ctx.Reader.ReadSingle(); + + MHStep = ctx.Reader.ReadInt32(); + ectx.CheckDecode(MHStep > 0); + + NumIter = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumIter > 0); + + LikelihoodInterval = ctx.Reader.ReadInt32(); + ectx.CheckDecode(LikelihoodInterval > 0); + + NumThread = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumThread >= 0); + + NumMaxDocToken = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumMaxDocToken > 0); + + NumSummaryTermPerTopic = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumSummaryTermPerTopic > 0); + + NumBurninIter = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumBurninIter >= 0); + + ResetRandomGenerator = ctx.Reader.ReadBoolByte(); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int NumTopic; + // Single AlphaSum; + // Single Beta; + // int MHStep; + // int NumIter; + // int LikelihoodInterval; + // int NumThread; + // int NumMaxDocToken; + // int NumSummaryTermPerTopic; + // int NumBurninIter; + // byte ResetRandomGenerator; + + ctx.Writer.Write(NumTopic); + ctx.Writer.Write(AlphaSum); + ctx.Writer.Write(Beta); + ctx.Writer.Write(MHStep); + ctx.Writer.Write(NumIter); + ctx.Writer.Write(LikelihoodInterval); + ctx.Writer.Write(NumThread); + ctx.Writer.Write(NumMaxDocToken); + ctx.Writer.Write(NumSummaryTermPerTopic); + ctx.Writer.Write(NumBurninIter); + ctx.Writer.WriteBoolByte(ResetRandomGenerator); + } + } + + public const string LoaderSignature = "LdaTransform"; + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "LIGHTLDA", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 
0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + private readonly ColInfoEx[] _exes; + private readonly LdaState[] _ldas; + private readonly ColumnType[] _types; + private readonly bool _saveText; + + private const string RegistrationName = "LightLda"; + private const string WordTopicModelFilename = "word_topic_summary.txt"; + internal const string Summary = "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation."; + internal const string UserName = "Latent Dirichlet Allocation Transform"; + internal const string ShortName = "LightLda"; + + public LdaTransform(IHostEnvironment env, Arguments args, IDataView input) + : base(env, RegistrationName, args.Column, input, TestType) + { + Host.CheckValue(args, nameof(args)); + Host.CheckUserArg(args.NumTopic > 0, nameof(args.NumTopic), "Must be positive."); + Host.CheckValue(input, nameof(input)); + Host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column)); + _exes = new ColInfoEx[Infos.Length]; + _types = new ColumnType[Infos.Length]; + _ldas = new LdaState[Infos.Length]; + _saveText = args.OutputTopicWordSummary; + for (int i = 0; i < Infos.Length; i++) + { + var ex = new ColInfoEx(Host, args.Column[i], args); + _exes[i] = ex; + _types[i] = new VectorType(NumberType.Float, ex.NumTopic); + } + using (var ch = Host.Start("Train")) + { + Train(ch, input, _ldas); + ch.Done(); + } + Metadata.Seal(); + } + + private void Dispose(bool disposing) + { + if (_ldas != null) + { + foreach (var state in _ldas) + state?.Dispose(); + } + if (disposing) + GC.SuppressFinalize(this); + } + + public void Dispose() + { + Dispose(true); + } + + ~LdaTransform() + { + Dispose(false); + } + + private LdaTransform(IHost host, ModelLoadContext ctx, IDataView input) + : base(host, ctx, input, TestType) + { + Host.AssertValue(ctx); + + // *** Binary format *** + // + // + // ldaState[num infos]: The LDA parameters + + // Note: infos.length would be just one in most cases. 
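+            // Each LdaState is deserialized in column order; its ColInfoEx and the output vector type (one float slot per topic) are recovered from the saved state below.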
+ _exes = new ColInfoEx[Infos.Length]; + _ldas = new LdaState[Infos.Length]; + _types = new ColumnType[Infos.Length]; + for (int i = 0; i < _ldas.Length; i++) + { + _ldas[i] = new LdaState(Host, ctx); + _exes[i] = _ldas[i].InfoEx; + _types[i] = new VectorType(NumberType.Float, _ldas[i].InfoEx.NumTopic); + } + using (var ent = ctx.Repository.OpenEntryOrNull("model", WordTopicModelFilename)) + { + _saveText = ent != null; + } + Metadata.Seal(); + } + + public static LdaTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + var h = env.Register(RegistrationName); + + h.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + h.CheckValue(input, nameof(input)); + + return h.Apply( + "Loading Model", + ch => + { + // *** Binary Format *** + // int: sizeof(Float) + // + int cbFloat = ctx.Reader.ReadInt32(); + h.CheckDecode(cbFloat == sizeof(Float)); + return new LdaTransform(h, ctx, input); + }); + } + + public string GetTopicSummary() + { + StringWriter writer = new StringWriter(); + VBuffer slotNames = default(VBuffer); + for (int i = 0; i < _ldas.Length; i++) + { + GetSlotNames(i, ref slotNames); + _ldas[i].GetTopicSummaryWriter(slotNames)(writer); + writer.WriteLine(); + } + return writer.ToString(); + } + + public override void Save(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // int: sizeof(Float) + // + // ldaState[num infos]: The LDA parameters + + ctx.Writer.Write(sizeof(Float)); + SaveBase(ctx); + Host.Assert(_ldas.Length == Infos.Length); + VBuffer slotNames = default(VBuffer); + for (int i = 0; i < _ldas.Length; i++) + { + GetSlotNames(i, ref slotNames); + _ldas[i].Save(ctx, _saveText, slotNames); + } + } + + private void GetSlotNames(int iinfo, ref VBuffer dst) + { + Host.Assert(0 <= iinfo && iinfo < Infos.Length); + if (Source.Schema.HasSlotNames(Infos[iinfo].Source, Infos[iinfo].TypeSrc.ValueCount)) + Source.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source, ref dst); + else + dst = default(VBuffer); + } + + private static string TestType(ColumnType t) + { + // LDA consumes term frequency vectors, so I am assuming VBuffer is an appropriate input type. + // It must also be of known size for the sake of the LDA trainer initialization. + if (t.IsKnownSizeVector && t.ItemType is NumberType) + return null; + return "Expected vector of number type of known size."; + } + + private static int GetFrequency(double value) + { + int result = (int)value; + if (!(result == value && result >= 0)) + return -1; + return result; + } + + private void Train(IChannel ch, IDataView trainingData, LdaState[] states) + { + Host.AssertValue(ch); + ch.AssertValue(trainingData); + ch.AssertValue(states); + ch.Assert(states.Length == Infos.Length); + + bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; + int[] numVocabs = new int[Infos.Length]; + + for (int i = 0; i < Infos.Length; i++) + { + activeColumns[Infos[i].Source] = true; + numVocabs[i] = 0; + } + + //the current lda needs the memory allocation before feedin data, so needs two sweeping of the data, + //one for the pre-calc memory, one for feedin data really + //another solution can be prepare these two value externally and put them in the beginning of the input file. 
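+            // First sweep: per column, count the documents and the corpus size (2 * tokens + 1 per document, the extra slot being the per-doc cursor) so the native engine can allocate its buffers up front.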
+ long[] corpusSize = new long[Infos.Length]; + int[] numDocArray = new int[Infos.Length]; + + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) + { + var getters = new ValueGetter>[Utils.Size(Infos)]; + for (int i = 0; i < Infos.Length; i++) + { + corpusSize[i] = 0; + numDocArray[i] = 0; + getters[i] = RowCursorUtils.GetVecGetterAs(NumberType.R8, cursor, Infos[i].Source); + } + VBuffer src = default(VBuffer); + long rowCount = 0; + + while (cursor.MoveNext()) + { + ++rowCount; + for (int i = 0; i < Infos.Length; i++) + { + int docSize = 0; + getters[i](ref src); + + // compute term, doc instance#. + for (int termID = 0; termID < src.Count; termID++) + { + int termFreq = GetFrequency(src.Values[termID]); + if (termFreq < 0) + { + // Ignore this row. + docSize = 0; + break; + } + + if (docSize >= _exes[i].NumMaxDocToken - termFreq) + break; //control the document length + + //if legal then add the term + docSize += termFreq; + } + + // Ignore empty doc + if (docSize == 0) + continue; + + numDocArray[i]++; + corpusSize[i] += docSize * 2 + 1; // in the beggining of each doc, there is a cursor variable + + // increase numVocab if needed. + if (numVocabs[i] < src.Length) + numVocabs[i] = src.Length; + } + } + + for (int i = 0; i < Infos.Length; ++i) + { + if (numDocArray[i] != rowCount) + { + ch.Assert(numDocArray[i] < rowCount); + ch.Warning($"Column '{Infos[i].Name}' has skipped {rowCount - numDocArray[i]} of {rowCount} rows either empty or with negative, non-finite, or fractional values."); + } + } + } + + // Initialize all LDA states + for (int i = 0; i < Infos.Length; i++) + { + var state = new LdaState(Host, _exes[i], numVocabs[i]); + if (numDocArray[i] == 0 || corpusSize[i] == 0) + throw ch.Except("The specified documents are all empty in column '{0}'.", Infos[i].Name); + + state.AllocateDataMemory(numDocArray[i], corpusSize[i]); + states[i] = state; + } + + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) + { + int[] docSizeCheck = new int[Infos.Length]; + // This could be optimized so that if multiple trainers consume the same column, it is + // fed into the train method once. 
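+                // Second sweep: feed every document into its column's LdaState; docSizeCheck verifies that the totals match the sizes computed during the first sweep.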
+ var getters = new ValueGetter>[Utils.Size(Infos)]; + for (int i = 0; i < Infos.Length; i++) + { + docSizeCheck[i] = 0; + getters[i] = RowCursorUtils.GetVecGetterAs(NumberType.R8, cursor, Infos[i].Source); + } + + VBuffer src = default(VBuffer); + + while (cursor.MoveNext()) + { + for (int i = 0; i < Infos.Length; i++) + { + getters[i](ref src); + docSizeCheck[i] += states[i].FeedTrain(Host, ref src); + } + } + for (int i = 0; i < Infos.Length; i++) + { + Host.Assert(corpusSize[i] == docSizeCheck[i]); + states[i].CompleteTrain(); + } + } + } + + private sealed class LdaState : IDisposable + { + public readonly ColInfoEx InfoEx; + private readonly int _numVocab; + private readonly object _preparationSyncRoot; + private readonly object _testSyncRoot; + private bool _predictionPreparationDone; + private LdaSingleBox _ldaTrainer; + + private LdaState() + { + _preparationSyncRoot = new object(); + _testSyncRoot = new object(); + } + + public LdaState(IExceptionContext ectx, ColInfoEx ex, int numVocab) + : this() + { + Contracts.AssertValue(ectx); + ectx.AssertValue(ex, "ex"); + + ectx.Assert(numVocab >= 0); + InfoEx = ex; + _numVocab = numVocab; + + _ldaTrainer = new LdaSingleBox( + InfoEx.NumTopic, + numVocab, /* Need to set number of vocabulary here */ + InfoEx.AlphaSum, + InfoEx.Beta, + InfoEx.NumIter, + InfoEx.LikelihoodInterval, + InfoEx.NumThread, + InfoEx.MHStep, + InfoEx.NumSummaryTermPerTopic, + false, + InfoEx.NumMaxDocToken); + } + + public LdaState(IExceptionContext ectx, ModelLoadContext ctx) + : this() + { + ectx.AssertValue(ctx); + + // *** Binary format *** + // + // int: vocabnum + // long: memblocksize + // long: aliasMemBlockSize + // (serializing term by term, for one term) + // int: term_id, int: topic_num, KeyValuePair[]: termTopicVector + + InfoEx = new ColInfoEx(ectx, ctx); + + _numVocab = ctx.Reader.ReadInt32(); + ectx.CheckDecode(_numVocab > 0); + + long memBlockSize = ctx.Reader.ReadInt64(); + ectx.CheckDecode(memBlockSize > 0); + + long aliasMemBlockSize = ctx.Reader.ReadInt64(); + ectx.CheckDecode(aliasMemBlockSize > 0); + + _ldaTrainer = new LdaSingleBox( + InfoEx.NumTopic, + _numVocab, /* Need to set number of vocabulary here */ + InfoEx.AlphaSum, + InfoEx.Beta, + InfoEx.NumIter, + InfoEx.LikelihoodInterval, + InfoEx.NumThread, + InfoEx.MHStep, + InfoEx.NumSummaryTermPerTopic, + false, + InfoEx.NumMaxDocToken); + + _ldaTrainer.AllocateModelMemory(_numVocab, InfoEx.NumTopic, memBlockSize, aliasMemBlockSize); + + for (int i = 0; i < _numVocab; i++) + { + int termID = ctx.Reader.ReadInt32(); + ectx.CheckDecode(termID >= 0); + int termTopicNum = ctx.Reader.ReadInt32(); + ectx.CheckDecode(termTopicNum >= 0); + + int[] topicId = new int[termTopicNum]; + int[] topicProb = new int[termTopicNum]; + + for (int j = 0; j < termTopicNum; j++) + { + topicId[j] = ctx.Reader.ReadInt32(); + topicProb[j] = ctx.Reader.ReadInt32(); + } + + //set the topic into _ldaTrainer inner topic table + _ldaTrainer.SetModel(termID, topicId, topicProb, termTopicNum); + } + + //do the preparation + if (!_predictionPreparationDone) + { + _ldaTrainer.InitializeBeforeTest(); + _predictionPreparationDone = true; + } + } + + public Action GetTopicSummaryWriter(VBuffer mapping) + { + Action writeAction; + + if (mapping.Length == 0) + { + writeAction = + writer => + { + for (int i = 0; i < _ldaTrainer.NumTopic; i++) + { + KeyValuePair[] topicSummaryVector = _ldaTrainer.GetTopicSummary(i); + writer.Write("{0}\t{1}\t", i, topicSummaryVector.Length); + foreach (KeyValuePair p in topicSummaryVector) + 
writer.Write("{0}:{1}\t", p.Key, p.Value); + writer.WriteLine(); + } + }; + } + else + { + writeAction = + writer => + { + DvText slotName = default(DvText); + for (int i = 0; i < _ldaTrainer.NumTopic; i++) + { + KeyValuePair[] topicSummaryVector = _ldaTrainer.GetTopicSummary(i); + writer.Write("{0}\t{1}\t", i, topicSummaryVector.Length); + foreach (KeyValuePair p in topicSummaryVector) + { + mapping.GetItemOrDefault(p.Key, ref slotName); + writer.Write("{0}[{1}]:{2}\t", p.Key, slotName, p.Value); + } + writer.WriteLine(); + } + }; + } + + return writeAction; + } + + public void Save(ModelSaveContext ctx, bool saveText, VBuffer mapping) + { + Contracts.AssertValue(ctx); + long memBlockSize = 0; + long aliasMemBlockSize = 0; + _ldaTrainer.GetModelStat(out memBlockSize, out aliasMemBlockSize); + + // *** Binary format *** + // + // int: vocabnum + // long: memblocksize + // long: aliasMemBlockSize + // (serializing term by term, for one term) + // int: term_id, int: topic_num, KeyValuePair[]: termTopicVector + + InfoEx.Save(ctx); + ctx.Writer.Write(_ldaTrainer.NumVocab); + ctx.Writer.Write(memBlockSize); + ctx.Writer.Write(aliasMemBlockSize); + + //save model from this interface + for (int i = 0; i < _ldaTrainer.NumVocab; i++) + { + KeyValuePair[] termTopicVector = _ldaTrainer.GetModel(i); + + //write the topic to disk through ctx + ctx.Writer.Write(i); //term_id + ctx.Writer.Write(termTopicVector.Length); + + foreach (KeyValuePair p in termTopicVector) + { + ctx.Writer.Write(p.Key); + ctx.Writer.Write(p.Value); + } + } + + var writeAction = GetTopicSummaryWriter(mapping); + + // save word-topic summary in text + if (saveText) + ctx.SaveTextStream(WordTopicModelFilename, writeAction); + } + + public void AllocateDataMemory(int docNum, long corpusSize) + { + _ldaTrainer.AllocateDataMemory(docNum, corpusSize); + } + + public int FeedTrain(IExceptionContext ectx, ref VBuffer input) + { + Contracts.AssertValue(ectx); + + // REVIEW: Input the counts to your trainer here. This + // is called multiple times. + + int docSize = 0; + int termNum = 0; + + for (int i = 0; i < input.Count; i++) + { + int termFreq = GetFrequency(input.Values[i]); + if (termFreq < 0) + { + // Ignore this row. + return 0; + } + if (docSize >= InfoEx.NumMaxDocToken - termFreq) + break; + + // If legal then add the term. + docSize += termFreq; + termNum++; + } + + // Ignore empty doc. + if (docSize == 0) + return 0; + + int actualSize = 0; + if (input.IsDense) + actualSize = _ldaTrainer.LoadDocDense(input.Values, termNum, input.Length); + else + actualSize = _ldaTrainer.LoadDoc(input.Indices, input.Values, termNum, input.Length); + + ectx.Assert(actualSize == 2 * docSize + 1, string.Format("The doc size are distinct. Actual: {0}, Expected: {1}", actualSize, 2 * docSize + 1)); + return actualSize; + } + + public void CompleteTrain() + { + //allocate all kinds of in memory sample tables + _ldaTrainer.InitializeBeforeTrain(); + + //call native lda trainer to perform the multi-thread training + _ldaTrainer.Train(""); /* Need to pass in an empty string */ + } + + public void Output(ref VBuffer src, ref VBuffer dst, int numBurninIter, bool reset) + { + // Prediction for a single document. + // LdaSingleBox.InitializeBeforeTest() is NOT thread-safe. 
+ if (!_predictionPreparationDone) + { + lock (_preparationSyncRoot) + { + if (!_predictionPreparationDone) + { + //do some preparation for building tables in native c++ + _ldaTrainer.InitializeBeforeTest(); + _predictionPreparationDone = true; + } + } + } + + int len = InfoEx.NumTopic; + var values = dst.Values; + var indices = dst.Indices; + if (src.Count == 0) + { + dst = new VBuffer(len, 0, values, indices); + return; + } + + // Make sure all the frequencies are valid and truncate if the sum gets too large. + int docSize = 0; + int termNum = 0; + for (int i = 0; i < src.Count; i++) + { + int termFreq = GetFrequency(src.Values[i]); + if (termFreq < 0) + { + // REVIEW: Should this log a warning message? And what should it produce? + // It currently produces a vbuffer of all NA values. + // REVIEW: Need a utility method to do this... + if (Utils.Size(values) < len) + values = new Float[len]; + for (int k = 0; k < len; k++) + values[k] = Float.NaN; + dst = new VBuffer(len, values, indices); + return; + } + + if (docSize >= InfoEx.NumMaxDocToken - termFreq) + break; + + docSize += termFreq; + termNum++; + } + + // REVIEW: Too much memory allocation here on each prediction. + List> retTopics; + if (src.IsDense) + retTopics = _ldaTrainer.TestDocDense(src.Values, termNum, numBurninIter, reset); + else + retTopics = _ldaTrainer.TestDoc(src.Indices.Take(src.Count).ToArray(), src.Values.Take(src.Count).ToArray(), termNum, numBurninIter, reset); + + int count = retTopics.Count; + Contracts.Assert(count <= len); + if (Utils.Size(values) < count) + values = new Float[count]; + if (count < len && Utils.Size(indices) < count) + indices = new int[count]; + + double normalizer = 0; + for (int i = 0; i < count; i++) + { + int index = retTopics[i].Key; + Float value = retTopics[i].Value; + Contracts.Assert(value >= 0); + Contracts.Assert(0 <= index && index < len); + if (count < len) + { + Contracts.Assert(i == 0 || indices[i - 1] < index); + indices[i] = index; + } + else + Contracts.Assert(index == i); + + values[i] = value; + normalizer += value; + } + + if (normalizer > 0) + { + for (int i = 0; i < count; i++) + values[i] = (Float)(values[i] / normalizer); + } + dst = new VBuffer(len, count, values, indices); + } + + public void Dispose() + { + _ldaTrainer.Dispose(); + } + } + + private ColumnType[] InitColumnTypes(int numTopics) + { + Host.Assert(Utils.Size(Infos) > 0); + var types = new ColumnType[Infos.Length]; + for (int c = 0; c < Infos.Length; c++) + types[c] = new VectorType(NumberType.Float, numTopics); + return types; + } + + protected override ColumnType GetColumnTypeCore(int iinfo) + { + Host.Assert(0 <= iinfo & iinfo < Utils.Size(_types)); + return _types[iinfo]; + } + + protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) + { + Host.AssertValueOrNull(ch); + Host.AssertValue(input); + Host.Assert(0 <= iinfo && iinfo < Infos.Length); + disposer = null; + + return GetTopic(input, iinfo); + } + + private ValueGetter> GetTopic(IRow input, int iinfo) + { + var getSrc = RowCursorUtils.GetVecGetterAs(NumberType.R8, input, Infos[iinfo].Source); + var src = default(VBuffer); + var lda = _ldas[iinfo]; + int numBurninIter = lda.InfoEx.NumBurninIter; + bool reset = lda.InfoEx.ResetRandomGenerator; + return + (ref VBuffer dst) => + { + // REVIEW: This will work, but there are opportunities for caching + // based on input.Counter that are probably worthwhile given how long inference takes. 
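+                    // Pull the source term-frequency vector and run the configured burn-in iterations in the native sampler to produce this document's topic distribution.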
+ getSrc(ref src); + lda.Output(ref src, ref dst, numBurninIter, reset); + }; + } + } +} diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index fd365acfbe..0f8fefb267 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -958,6 +958,18 @@ public void Add(Microsoft.ML.Transforms.LabelToFloatConverter input, Microsoft.M _jsonNodes.Add(Serialize("Transforms.LabelToFloatConverter", input, output)); } + public Microsoft.ML.Transforms.LightLda.Output Add(Microsoft.ML.Transforms.LightLda input) + { + var output = new Microsoft.ML.Transforms.LightLda.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Transforms.LightLda input, Microsoft.ML.Transforms.LightLda.Output output) + { + _jsonNodes.Add(Serialize("Transforms.LightLda", input, output)); + } + public Microsoft.ML.Transforms.LogMeanVarianceNormalizer.Output Add(Microsoft.ML.Transforms.LogMeanVarianceNormalizer input) { var output = new Microsoft.ML.Transforms.LogMeanVarianceNormalizer.Output(); @@ -10506,6 +10518,246 @@ public LabelToFloatConverterPipelineStep(Output output) } } + namespace Transforms + { + + public sealed partial class LdaTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// The number of topics in the LDA + /// + public int? NumTopic { get; set; } + + /// + /// Dirichlet prior on document-topic vectors + /// + public float? AlphaSum { get; set; } + + /// + /// Dirichlet prior on vocab-topic vectors + /// + public float? Beta { get; set; } + + /// + /// Number of Metropolis Hasting step + /// + public int? Mhstep { get; set; } + + /// + /// Number of iterations + /// + public int? NumIterations { get; set; } + + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int? LikelihoodInterval { get; set; } + + /// + /// The number of training threads + /// + public int? NumThreads { get; set; } + + /// + /// The threshold of maximum count of tokens per doc + /// + public int? NumMaxDocToken { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int? NumSummaryTermPerTopic { get; set; } + + /// + /// The number of burn-in iterations + /// + public int? NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool? ResetRandomGenerator { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. + /// + public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public LightLda() + { + } + + public LightLda(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public LightLda(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:srcs) + /// + public LdaTransformColumn[] Column { get; set; } + + /// + /// The number of topics in the LDA + /// + [TlcModule.SweepableDiscreteParamAttribute("NumTopic", new object[]{20, 40, 100, 200})] + public int NumTopic { get; set; } = 100; + + /// + /// Dirichlet prior on document-topic vectors + /// + [TlcModule.SweepableDiscreteParamAttribute("AlphaSum", new object[]{1, 10, 100, 200})] + public float AlphaSum { get; set; } = 100f; + + /// + /// Dirichlet prior on vocab-topic vectors + /// + [TlcModule.SweepableDiscreteParamAttribute("Beta", new object[]{0.01f, 0.015f, 0.07f, 0.02f})] + public float Beta { get; set; } = 0.01f; + + /// + /// Number of Metropolis Hasting step + /// + [TlcModule.SweepableDiscreteParamAttribute("Mhstep", new object[]{2, 4, 8, 16})] + public int Mhstep { get; set; } = 4; + + /// + /// Number of iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{100, 200, 300, 400})] + public int NumIterations { get; set; } = 200; + + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int LikelihoodInterval { get; set; } = 5; + + /// + /// The threshold of maximum count of tokens per doc + /// + public int NumMaxDocToken { get; set; } = 512; + + /// + /// The number of training threads. Default value depends on number of logical processors. + /// + public int? 
NumThreads { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int NumSummaryTermPerTopic { get; set; } = 10; + + /// + /// The number of burn-in iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumBurninIterations", new object[]{10, 20, 30, 40})] + public int NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool ResetRandomGenerator { get; set; } = false; + + /// + /// Whether to output the topic-word summary in text format + /// + public bool OutputTopicWordSummary { get; set; } = false; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(LightLda)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new LightLdaPipelineStep(output); + } + + private class LightLdaPipelineStep : ILearningPipelineDataStep + { + public LightLdaPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + namespace Transforms { diff --git a/src/Native/CMakeLists.txt b/src/Native/CMakeLists.txt index 947a664ab6..d8f963e44e 100644 --- a/src/Native/CMakeLists.txt +++ b/src/Native/CMakeLists.txt @@ -179,4 +179,5 @@ function(install_library_and_symbols targetName) endfunction() add_subdirectory(CpuMathNative) -add_subdirectory(FastTreeNative) \ No newline at end of file +add_subdirectory(FastTreeNative) +add_subdirectory(LdaNative) \ No newline at end of file diff --git a/src/Native/LdaNative/CMakeLists.txt b/src/Native/LdaNative/CMakeLists.txt new file mode 100644 index 0000000000..f2e1d340de --- /dev/null +++ b/src/Native/LdaNative/CMakeLists.txt @@ -0,0 +1,19 @@ +project (LdaNative) + +set(SOURCES + alias_multinomial_rng_int.cpp + data_block.cpp + hybrid_alias_map.cpp + hybrid_map.cpp + lda_document.cpp + lda_engine.cpp + lda_engine_export.cpp + light_doc_sampler.cpp + light_hash_map.cpp + model_block.cpp + utils.cpp +) + +add_library(LdaNative SHARED ${SOURCES} ${RESOURCES}) + +install_library_and_symbols (LdaNative) diff --git a/src/Native/LdaNative/alias_multinomial_rng_int.cpp b/src/Native/LdaNative/alias_multinomial_rng_int.cpp new file mode 100644 index 0000000000..a945feb6be --- /dev/null +++ b/src/Native/LdaNative/alias_multinomial_rng_int.cpp @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
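+// Implements Next() for the integer alias table (Marsaglia's fast generation of discrete random variables, see the header):
+// a single uniform draw is mapped to a topic in O(1) via the precomputed k/v entries.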
+ +#include "alias_multinomial_rng_int.hpp" +#include "rand_int_rng.h" +#include +#include +#include +#include + +namespace wood +{ + AliasMultinomialRNGInt::AliasMultinomialRNGInt() + : n_(-1), internal_memory_(nullptr) + { + + } + AliasMultinomialRNGInt::~AliasMultinomialRNGInt() + { + if (internal_memory_ != nullptr) + { + delete[]internal_memory_; + } + } + + int32_t AliasMultinomialRNGInt::Next(xorshift_rng& rng, std::vector& alias_kv) + { + // NOTE: stl uniform_real_distribution generates the highest quality random numbers + // yet, the other two are much faster + auto sample = rng.rand(); + + // NOTE: use std::floor is too slow + // here we guarantee sample * n_ is nonnegative, this makes cast work + int idx = sample / a_int_; + + if (n_ <= idx) + { + idx = n_ - 1; + } + + // the following code is equivalent to + // return sample < V_[idx] ? idx : K_[idx]; + // but faster, see + // http://stackoverflow.com/questions/6754454/speed-difference-between-if-else-and-ternary-operator-in-c + int m = -(sample < alias_kv[idx].v_); + return (idx & m) | (alias_kv[idx].k_ & ~m); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/alias_multinomial_rng_int.hpp b/src/Native/LdaNative/alias_multinomial_rng_int.hpp new file mode 100644 index 0000000000..e25bc5bc17 --- /dev/null +++ b/src/Native/LdaNative/alias_multinomial_rng_int.hpp @@ -0,0 +1,454 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "rand_int_rng.h" +#include +#include +/* +Algorithm described in +http://www.jstatsoft.org/v11/i03/paper +George Marsaglia +Fast generation of discrete random variables +*/ +namespace wood +{ + struct alias_k_v + { + int32_t k_; + int32_t v_; + }; + + class AliasMultinomialRNGInt + { + public: + AliasMultinomialRNGInt(); + ~AliasMultinomialRNGInt(); + + void Init(int K) + { + L_.resize(K); + H_.resize(K); + proportion_int_.resize(K); + internal_memory_ = new int32_t[2 * K]; + } + + void SetProportionMass(std::vector &proportion, + float mass, + std::vector &alias_kv, + int32_t *height, + xorshift_rng &rng) + { + n_ = (int32_t)proportion.size(); //proportion number should be kept within 2Billion + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; //use int64_t to avoid overflowing + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + for (int i = 0; i < n_; ++i) + { + alias_kv[i].k_ = i; + alias_kv[i].v_ = (i + 1) * a_int_; + } + + int32_t L_head = 0; + int32_t L_tail = 0; + + int32_t H_head = 0; + int32_t H_tail = 0; + + for (auto i = 0; i < proportion_int_.size(); ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + alias_kv[i_pi.first].k_ = h_ph.first; + alias_kv[i_pi.first].v_ = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + alias_kv[first].k_ = first; + alias_kv[first].v_ = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + alias_kv[first].k_ = first; + alias_kv[first].v_ = first * a_int_ + second; + ++H_head; + } + + } + + inline void SetProportionMass(std::vector &proportion, + float mass, + int32_t* memory, + int32_t *height, + xorshift_rng &rng) + { + n_ = (int32_t)proportion.size(); + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + for (int i = 0; i < n_; ++i) + { + int32_t *p = internal_memory_ + 2 * i; + *p = i; p++; + *p = (i + 1) * a_int_; + } + + int32_t L_head = 0; + int32_t L_tail = 0; + + int32_t H_head = 0; + int32_t H_tail = 0; + + for (auto i = 0; i < n_; ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + int32_t *p = internal_memory_ + 2 * i_pi.first; + *p = h_ph.first; p++; + *p = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + + int32_t *p = internal_memory_ + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + + int32_t *p = internal_memory_ + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++H_head; + } + memcpy(memory, internal_memory_, sizeof(int32_t)* 2 * n_); + } + + inline void SetProportionMass(std::vector &proportion, + int32_t size, + float mass, + int32_t* memory, + int32_t *height, + xorshift_rng &rng, + int32_t word_id) + { + n_ = size; + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + int32_t L_head = 0; + int32_t L_tail = 0; + int32_t H_head = 0; + int32_t H_tail = 0; + + for (int i = 0; i < n_; ++i) + { + int32_t *p = memory + 2 * i; + *p = i; p++; + *p = (i + 1) * a_int_; + } + + for (auto i = 0; i < n_; ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + int32_t *p = memory + 2 * i_pi.first; + *p = h_ph.first; p++; + *p = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + int32_t *p = memory + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + int32_t *p = memory + 2 * first; + + *p = first; p++; + *p = first * a_int_ + second; + ++H_head; + } + } + + // Make sure to call SetProportion or SetProportionMass before calling Next + int32_t Next(xorshift_rng& rng, std::vector& alias_kv); + + private: + void GenerateAliasTable(std::vector& alias_kv); + + public: + AliasMultinomialRNGInt(const AliasMultinomialRNGInt &other) = delete; + AliasMultinomialRNGInt& operator=(const AliasMultinomialRNGInt &other) = delete; + + std::vector proportion_int_; + int32_t *internal_memory_; + + int32_t n_; + int32_t a_int_; + int32_t mass_int_; + + std::vector> L_; + std::vector> H_; + }; +} \ No newline at end of file diff --git a/src/Native/LdaNative/data_block.cpp b/src/Native/LdaNative/data_block.cpp new file mode 100644 index 0000000000..11b56b9ad7 --- /dev/null +++ b/src/Native/LdaNative/data_block.cpp @@ -0,0 +1,117 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
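The SetProportionMass overloads in alias_multinomial_rng_int.hpp above all follow the same recipe: quantize the proportions to integer masses that sum exactly to mass_int_, correct any rounding drift, then pair "low" bins (below the average a_int_) with "high" bins so that every slot ends up with an alias k_ and a keep-threshold v_, which is what makes Next an O(1) draw. For orientation only, here is a minimal floating-point sketch of that classic Walker/Vose construction; the names are hypothetical and it deliberately skips the integer bookkeeping used in the diff.

// Illustrative only: a minimal floating-point Walker/Vose alias table.
// The native code above does the same construction with int32 masses so
// that sampling needs only integer arithmetic; names here are hypothetical.
#include <cstdio>
#include <random>
#include <vector>

struct AliasTable {
    std::vector<double> prob;   // probability of keeping bucket i
    std::vector<int>    alias;  // bucket to fall through to otherwise

    explicit AliasTable(std::vector<double> w) {
        const int n = (int)w.size();
        prob.assign(n, 0.0);
        alias.assign(n, 0);
        double mass = 0.0;
        for (double x : w) mass += x;
        for (double& x : w) x = x * n / mass;      // scale so the average bin is 1
        std::vector<int> small, large;
        for (int i = 0; i < n; ++i) (w[i] < 1.0 ? small : large).push_back(i);
        while (!small.empty() && !large.empty()) {
            int s = small.back(); small.pop_back();
            int l = large.back(); large.pop_back();
            prob[s] = w[s];                        // keep s with probability w[s]
            alias[s] = l;                          // otherwise fall through to l
            w[l] = (w[l] + w[s]) - 1.0;            // move the excess onto l
            (w[l] < 1.0 ? small : large).push_back(l);
        }
        for (int i : small) prob[i] = 1.0;         // leftovers are exact
        for (int i : large) prob[i] = 1.0;
    }

    int Next(std::mt19937& rng) const {            // O(1) per draw
        std::uniform_int_distribution<int> pick(0, (int)prob.size() - 1);
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        int i = pick(rng);
        return coin(rng) < prob[i] ? i : alias[i];
    }
};

int main() {
    std::mt19937 rng(7);
    AliasTable t({0.1, 0.2, 0.3, 0.4});
    std::vector<int> hits(4, 0);
    for (int i = 0; i < 100000; ++i) ++hits[t.Next(rng)];
    for (int k = 0; k < 4; ++k) printf("topic %d: %d\n", k, hits[k]);
    return 0;
}

The integer variant in the diff folds both decisions into one draw: it stores v_ = slot * a_int_ + kept_mass per bin, so a single 31-bit random number selects the slot via sample / height and the keep-or-alias choice via sample < v_, with no floating-point work on the sampling path.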
+ +#include +#include "data_block.h" +#include "lda_document.h" + +namespace lda +{ + LDADataBlock::LDADataBlock(int32_t num_threads) : + num_threads_(num_threads), has_read_(false), index_document_(0), documents_buffer_(nullptr), offset_buffer_(nullptr) + { + } + + LDADataBlock::~LDADataBlock() + { + if (has_read_) + { + delete[] offset_buffer_; + delete[] documents_buffer_; + } + } + + void LDADataBlock::Clear() + { + has_read_ = false; + index_document_ = 0; + used_size_ = 0; + + num_documents_ = 0; + corpus_size_ = 0; + + if (offset_buffer_) + { + delete[]offset_buffer_; + offset_buffer_ = nullptr; + } + if (documents_buffer_) + { + delete[]documents_buffer_; + documents_buffer_ = nullptr; + } + } + + void LDADataBlock::Allocate(const int32_t num_document, const int64_t corpus_size) + { + num_documents_ = num_document; + corpus_size_ = corpus_size; + + offset_buffer_ = new int64_t[num_documents_ + 1]; // +1: one for the end of last document, + documents_buffer_ = new int32_t[corpus_size_]; + + index_document_ = 0; + used_size_ = 0; + + offset_buffer_[0] = 0; + } + + + //term_id, term_freq, term_num + int LDADataBlock::Add(int32_t* term_id, int32_t* term_freq, int32_t term_num) + { + int64_t data_length = 1; + + int64_t idx = offset_buffer_[index_document_] + 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + documents_buffer_[idx++] = term_id[i]; + documents_buffer_[idx++] = 0; + data_length += 2; + } + } + + index_document_++; + used_size_ += data_length; + + offset_buffer_[index_document_] = used_size_; + has_read_ = true; + + return (int)data_length; + } + + int LDADataBlock::AddDense(int32_t* term_freq, int32_t term_num) + { + int64_t data_length = 1; + + int64_t idx = offset_buffer_[index_document_] + 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + documents_buffer_[idx++] = i; + documents_buffer_[idx++] = 0; + data_length += 2; + } + } + + index_document_++; + used_size_ += data_length; + + offset_buffer_[index_document_] = used_size_; + has_read_ = true; + + return (int)data_length; + } + + std::shared_ptr LDADataBlock::GetOneDoc(int32_t index) const + { + std::shared_ptr returned_ptr( + new LDADocument(documents_buffer_ + offset_buffer_[index], + documents_buffer_ + offset_buffer_[index + 1])); + return returned_ptr; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/data_block.h b/src/Native/LdaNative/data_block.h new file mode 100644 index 0000000000..9f0894a858 --- /dev/null +++ b/src/Native/LdaNative/data_block.h @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include "light_hash_map.h" + +namespace lda +{ + class LDADocument; + class LDADataBlock + { + public: + explicit LDADataBlock(int32_t num_threads); + ~LDADataBlock(); + + void Clear(); + //in data feedin scenario + void Allocate(const int32_t num_document, const int64_t corpus_size); + //port the data from external process, e.g. 
c# + int AddDense(int32_t* term_freq, int32_t term_num); + int Add(int32_t* term_id, int32_t* term_freq, int32_t term_num); + std::shared_ptr GetOneDoc(int32_t index) const; + + inline int32_t num_documents() const; + // Return the first document for thread thread_id + inline int32_t Begin(int32_t thread_id) const; + // Return one past the last document for thread thread_id + inline int32_t End(int32_t thread_id) const; + + + private: + LDADataBlock(const LDADataBlock& other) = delete; + LDADataBlock& operator=(const LDADataBlock& other) = delete; + + int32_t num_threads_; + bool has_read_; // true if LDADataBlock holds allocated memory + + int32_t index_document_; + int64_t used_size_; + + int32_t num_documents_; + int64_t corpus_size_; + + int64_t* offset_buffer_; // offset_buffer_ size = num_document_ + 1 + int32_t* documents_buffer_; // documents_buffer_ size = corpus_size_; + }; + + inline int32_t LDADataBlock::num_documents() const + { + return num_documents_; + } + inline int32_t LDADataBlock::Begin(int32_t thread_id) const + { + int32_t num_of_one_doc = num_documents_ / num_threads_; + return thread_id * num_of_one_doc; + } + + inline int32_t LDADataBlock::End(int32_t thread_id) const + { + if (thread_id == num_threads_ - 1) // last thread + return num_documents_; + int32_t num_of_one_doc = num_documents_ / num_threads_; + return (thread_id + 1) * num_of_one_doc; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/hybrid_alias_map.cpp b/src/Native/LdaNative/hybrid_alias_map.cpp new file mode 100644 index 0000000000..fcbeee3806 --- /dev/null +++ b/src/Native/LdaNative/hybrid_alias_map.cpp @@ -0,0 +1,198 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information.
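LDADataBlock (data_block.h/.cpp above) keeps the whole corpus in one flat int32_t buffer: each document is written as a cursor slot followed by interleaved (word, topic) pairs, offset_buffer_ records where each document starts, and Begin/End hand out contiguous document ranges per thread, with the last thread absorbing the remainder. A small self-contained sketch of that layout and split, with hypothetical names and a toy corpus:

// Illustrative only: the flat document layout and thread split used by LDADataBlock.
// Each document is stored as [cursor, w0, t0, w1, t1, ...] inside one int32 buffer,
// and offsets[d] marks where document d starts.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // Two toy documents given as (term_id, term_freq) pairs, as FeedInData receives them.
    std::vector<std::vector<std::pair<int, int>>> docs = {
        {{3, 2}, {7, 1}},   // doc 0: word 3 twice, word 7 once
        {{1, 1}, {4, 3}}    // doc 1: word 1 once, word 4 three times
    };

    std::vector<int64_t> offsets{0};
    std::vector<int32_t> buffer;
    for (auto& d : docs) {
        buffer.push_back(0);                        // cursor slot for this document
        for (auto& tf : d)
            for (int j = 0; j < tf.second; ++j) {
                buffer.push_back(tf.first);         // word id
                buffer.push_back(0);                // topic, assigned during initialization
            }
        offsets.push_back((int64_t)buffer.size());  // end of this document
    }

    // Even document split across threads, mirroring Begin()/End().
    int num_threads = 2, num_docs = (int)docs.size();
    for (int t = 0; t < num_threads; ++t) {
        int per = num_docs / num_threads;
        int begin = t * per;
        int end = (t == num_threads - 1) ? num_docs : (t + 1) * per;
        printf("thread %d handles docs [%d, %d)\n", t, begin, end);
    }

    for (size_t d = 0; d + 1 < offsets.size(); ++d)
        printf("doc %zu spans buffer[%lld, %lld)\n", d,
               (long long)offsets[d], (long long)offsets[d + 1]);
    return 0;
}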
+ +#include +#include +#include +#include "utils.hpp" +#include +#include "hybrid_alias_map.h" + +namespace lda +{ + hybrid_alias_map::hybrid_alias_map() + :memory_(nullptr), + is_dense_(1), + kv_(nullptr), + idx_(nullptr), + capacity_(0), + size_(0), + mass_(0), + n_kw_mass_(0.0), + beta_mass_(0.0) + { + } + hybrid_alias_map::hybrid_alias_map(int32_t *memory, int32_t is_dense, int32_t capacity) + :memory_(memory), + is_dense_(is_dense), + capacity_(capacity), + kv_(nullptr), + idx_(nullptr), + size_(0), + mass_(0), + n_kw_mass_(0.0), + beta_mass_(0.0) + { + if (is_dense_) + { + kv_ = memory_; + idx_ = nullptr; + } + else + { + kv_ = memory_; + idx_ = memory_ + capacity_ * 2; + } + } + + hybrid_alias_map::hybrid_alias_map(const hybrid_alias_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + + this->kv_ = other.kv_; + this->idx_ = other.idx_; + this->height_ = other.height_; + this->size_ = other.size_; + + this->mass_ = other.mass_; + this->n_kw_mass_ = other.n_kw_mass_; + this->beta_mass_ = other.beta_mass_; + } + hybrid_alias_map& hybrid_alias_map::operator=(const hybrid_alias_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + + this->kv_ = other.kv_; + this->idx_ = other.idx_; + this->height_ = other.height_; + this->size_ = other.size_; + + this->mass_ = other.mass_; + this->n_kw_mass_ = other.n_kw_mass_; + this->beta_mass_ = other.beta_mass_; + + return *this; + } + + void hybrid_alias_map::clear() + { + size_ = 0; + } + + std::string hybrid_alias_map::DebugString() + { + std::string str = ""; + + if (size_ == 0) + { + return str; + } + + str += "is_dense:" + std::to_string(is_dense_) + " height:" + std::to_string(height_) + " mass:" + std::to_string(n_kw_mass_); + if (is_dense_) + { + for (int i = 0; i < capacity_; ++i) + { + str += " " + std::to_string(i) + ":" + std::to_string(*(memory_ + 2 * i)) + ":" + std::to_string(*(memory_ + 2 * i + 1)); + } + } + else + { + for (int i = 0; i < size_; ++i) + { + str += " " + std::to_string(idx_[i]) + ":" + std::to_string(*(memory_ + 2 * i)) + ":" + std::to_string(*(memory_ + 2 * i + 1)); + } + } + + return str; + } + + void hybrid_alias_map::build_table( + wood::AliasMultinomialRNGInt &alias_rng, + const hybrid_map &word_topic_row, + const std::vector &summary_row, + std::vector &q_w_proportion, + float beta, + float beta_sum, + int word_id, + wood::xorshift_rng &rng) + { + if (is_dense_) + { + size_ = capacity_; + mass_ = 0.0; + for (int k = 0; k < capacity_; ++k) + { + int32_t n_kw = word_topic_row[k]; + float prop = (n_kw + beta) / (summary_row[k] + beta_sum); + q_w_proportion[k] = prop; + mass_ += prop; + } + if (size_ == 0) + { + return; + } + alias_rng.SetProportionMass(q_w_proportion, mass_, memory_, &height_, rng); + + } + else + { + if (word_topic_row.is_dense()) + { + size_ = 0; + n_kw_mass_ = 0.0; + for (int k = 0; k < word_topic_row.capacity_; ++k) + { + if (word_topic_row.memory_[k] == 0) continue; + int32_t n_tw = word_topic_row.memory_[k]; + int64_t n_t = summary_row[k]; + q_w_proportion[size_] = n_tw / (n_t + beta_sum); + idx_[size_] = k; + n_kw_mass_ += q_w_proportion[size_]; + ++size_; + } + + if (size_ == 0) + { + // it is possible that, the local tf of a word is zero + return; + } + alias_rng.SetProportionMass(q_w_proportion, size_, n_kw_mass_, memory_, &height_, rng, word_id); + } + else + { + size_ = 0; + n_kw_mass_ = 0.0; + int32_t row_capacity = word_topic_row.capacity_; + 
for (int k = 0; k < row_capacity; ++k) + { + int32_t key = word_topic_row.key_[k]; + if (key > 0) + { + int32_t n_kw = word_topic_row.value_[k]; + float prop = n_kw / (summary_row[key - 1] + beta_sum); + + + + q_w_proportion[size_] = prop; + idx_[size_] = word_topic_row.key_[k] - 1; // minus one from the the internal key + n_kw_mass_ += prop; + + ++size_; + } + } + if (size_ == 0) + { + // it is possible that, the local tf of a word is zero + return; + } + alias_rng.SetProportionMass(q_w_proportion, size_, n_kw_mass_, memory_, &height_, rng, word_id); + } + } + } +} diff --git a/src/Native/LdaNative/hybrid_alias_map.h b/src/Native/LdaNative/hybrid_alias_map.h new file mode 100644 index 0000000000..f62b1e33af --- /dev/null +++ b/src/Native/LdaNative/hybrid_alias_map.h @@ -0,0 +1,128 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include "alias_multinomial_rng_int.hpp" +#include "hybrid_map.h" + +namespace lda +{ + class hybrid_alias_map + { + public: + + hybrid_alias_map(); + hybrid_alias_map(int32_t *memory, int32_t is_dense, int32_t capacity); + hybrid_alias_map(const hybrid_alias_map &other); + hybrid_alias_map& operator=(const hybrid_alias_map &other); + + void clear(); + inline int32_t size() const; + + std::string DebugString(); + void build_table( + wood::AliasMultinomialRNGInt &alias_rng, + const hybrid_map &word_topic_row, + const std::vector &summary_row, + std::vector &q_w_proportion, + float beta, + float beta_sum, + int word_id, + wood::xorshift_rng &rng); + + inline int32_t next(wood::xorshift_rng &rng, int32_t beta_height, float beta_mass, std::vector &beta_k_v, bool debug); + + private: + int32_t *memory_; + int32_t is_dense_; + int32_t *kv_; + int32_t *idx_; + int32_t height_; + int32_t capacity_; + int32_t size_; + + float mass_; + float n_kw_mass_; + float beta_mass_; + }; + + inline int32_t hybrid_alias_map::size() const + { + return size_; + } + + inline int32_t hybrid_alias_map::next(wood::xorshift_rng &rng, int32_t beta_height, float beta_mass, std::vector &beta_k_v, bool debug) + { + //NOTE: here we will set those unseen words' topic to 0. logicall we could set it to random as well. 
+ if (capacity_ == 0) + { + return 0; + } + + if (is_dense_) + { + auto sample = rng.rand(); + int idx = sample / height_; + if (idx >= size_) + { + idx = size_ - 1; + } + + int32_t *p = memory_ + 2 * idx; + int32_t k = *p; + p++; + int32_t v = *p; + int32_t m = -(sample < v); + return (idx & m) | (k & ~m); + } + else + { + float sample = rng.rand_real() * (n_kw_mass_ + beta_mass); + if (sample < n_kw_mass_) + { + auto n_kw_sample = rng.rand(); + int32_t idx = n_kw_sample / height_; + + if (idx >= size_) + { + idx = size_ - 1; + } + + + int32_t *p = memory_ + 2 * idx; + int32_t k = *p; p++; + int32_t v = *p; + int32_t id = idx_[idx]; + int32_t k_id = idx_[k]; + + int32_t m = -(n_kw_sample < v); + return (id & m) | (k_id & ~m); + + } + else + { + auto sampleLocal = rng.rand(); + int idx = sampleLocal / beta_height; + int beta_size = (int)beta_k_v.size(); + + if (idx >= beta_size) + { + idx = beta_size - 1; + } + + int32_t k = beta_k_v[idx].k_; + int32_t v = beta_k_v[idx].v_; + int32_t m = -(sampleLocal < v); + return (idx & m) | (k & ~m); + } + } + } + +} \ No newline at end of file diff --git a/src/Native/LdaNative/hybrid_map.cpp b/src/Native/LdaNative/hybrid_map.cpp new file mode 100644 index 0000000000..e5c8252702 --- /dev/null +++ b/src/Native/LdaNative/hybrid_map.cpp @@ -0,0 +1,142 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include +#include +#include "utils.hpp" +#include +#include "hybrid_map.h" + +namespace lda +{ + hybrid_map::hybrid_map() + :memory_(nullptr), + is_dense_(1), + capacity_(0), + empty_key_(0), + deleted_key_(-1), + key_(nullptr), + value_(nullptr), + num_deleted_key_(0), + external_rehash_buf_(nullptr) + { + } + hybrid_map::hybrid_map(int32_t *memory, int32_t is_dense, int32_t capacity, int32_t num_deleted_key + , int32_t *external_rehash_buf_) + : memory_(memory), + is_dense_(is_dense), + capacity_(capacity), + empty_key_(0), + deleted_key_(-1), + key_(nullptr), + value_(nullptr), + num_deleted_key_(num_deleted_key), + external_rehash_buf_(external_rehash_buf_) + { + if (is_dense_ == 0) { + key_ = memory_; + value_ = memory_ + capacity_; + } + } + + hybrid_map::hybrid_map(const hybrid_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + empty_key_ = other.empty_key_; + deleted_key_ = other.deleted_key_; + num_deleted_key_ = other.num_deleted_key_; + external_rehash_buf_ = other.external_rehash_buf_; + if (this->is_dense_) + { + this->key_ = nullptr; + this->value_ = nullptr; + } + else + { + this->key_ = this->memory_; + this->value_ = this->memory_ + capacity_; + } + + } + hybrid_map& hybrid_map::operator=(const hybrid_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + empty_key_ = other.empty_key_; + deleted_key_ = other.deleted_key_; + num_deleted_key_ = other.num_deleted_key_; + external_rehash_buf_ = other.external_rehash_buf_; + if (this->is_dense_) + { + this->key_ = nullptr; + this->value_ = nullptr; + } + else + { + this->key_ = this->memory_; + this->value_ = this->memory_ + capacity_; + } + return *this; + } + + void hybrid_map::clear() + { + int32_t memory_size = is_dense_ ? 
capacity_ : 2 * capacity_; + memset(memory_, 0, memory_size * sizeof(int32_t)); + } + + std::string hybrid_map::DumpString() const + { + if (is_dense_) + { + std::string result; + for (int i = 0; i < capacity_; ++i) + { + if (memory_[i] != 0) + { + result += std::to_string(i) + ":" + std::to_string(memory_[i]) + " "; + } + } + return result; + } + else + { + std::string result; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + result += std::to_string(key_[i] - 1) + ":" + std::to_string(value_[i]) + " "; + } + } + return result; + } + } + + void hybrid_map::sorted_rehashing() + { + if (!is_dense_) + { + std::map rehash_buffer; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + rehash_buffer[key_[i] - 1] = value_[i]; + } + } + memset(memory_, 0, 2 * capacity_ * sizeof(int32_t)); + for (auto it = rehash_buffer.begin(); + it != rehash_buffer.end(); ++it) + { + inc(it->first, it->second); + } + } + } + +} diff --git a/src/Native/LdaNative/hybrid_map.h b/src/Native/LdaNative/hybrid_map.h new file mode 100644 index 0000000000..88bbc82d5b --- /dev/null +++ b/src/Native/LdaNative/hybrid_map.h @@ -0,0 +1,238 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// The probing method: +// Linear probing +// #define JUMP_(key, num_probes) ( 1 ) + +// Quadratic probing +#define JUMP_(key, num_probes) ( num_probes ) +#define ILLEGAL_BUCKET -1 + +namespace lda +{ + class hybrid_alias_map; + + class hybrid_map + { + friend class hybrid_alias_map; + public: + hybrid_map(); + hybrid_map(int32_t *memory, int32_t is_dense, int32_t capacity, int32_t num_deleted_key + , int32_t *external_rehash_buf_); + hybrid_map(const hybrid_map &other); + hybrid_map& operator=(const hybrid_map &other); + + + void clear(); + std::string DumpString() const; + void sorted_rehashing(); + + inline int32_t nonzero_num() const; + inline bool is_dense() const; + inline int32_t capacity() const; + inline int32_t *memory() const; + inline int32_t* key() const; + inline int32_t* value() const; + inline void rehashing(); + inline void inc(int32_t key, int32_t delta); + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t operator[](int32_t key) const; + + private: + inline std::pair find_position(const int32_t key) const; + + int32_t *memory_; + int32_t is_dense_; + int32_t *key_; + int32_t *value_; + + // if |is_dense_| == true, capactiy_ is the length of an array + // if |is dense_| == false, capacity_ is the size of a light hash table + int32_t capacity_; + int32_t empty_key_; + int32_t deleted_key_; + + int32_t num_deleted_key_; + int32_t* external_rehash_buf_; + }; + + inline int32_t hybrid_map::nonzero_num() const + { + if (is_dense_) + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (memory_[i] > 0) + { + ++size; + } + } + return size; + } + else + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + ++size; + } + } + return size; + } + } + + inline bool hybrid_map::is_dense() const + { + return is_dense_ != 0; + } + + inline int32_t hybrid_map::capacity() const + { + return capacity_; + } + + inline int32_t* hybrid_map::memory() const + { + return memory_; + } + inline int32_t* hybrid_map::key() const + { + 
return key_; + } + inline int32_t* hybrid_map::value() const + { + return value_; + } + inline void hybrid_map::rehashing() + { + if (!is_dense_) + { + memcpy(external_rehash_buf_, memory_, 2 * capacity_ * sizeof(int32_t)); + int32_t *key = external_rehash_buf_; + int32_t *value = external_rehash_buf_ + capacity_; + memset(memory_, 0, 2 * capacity_ * sizeof(int32_t)); + for (int i = 0; i < capacity_; ++i) + { + if (key[i] > 0) + { + inc(key[i] - 1, value[i]); + } + } + num_deleted_key_ = 0; + } + } + inline void hybrid_map::inc(int32_t key, int32_t delta) + { + if (is_dense_) + { + memory_[key] += delta; + } + else + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + value_[pos.first] += delta; + if (value_[pos.first] == 0) // the value becomes zero, delete the key + { + key_[pos.first] = deleted_key_; + + ++num_deleted_key_; // num_deleted_key ++ + if (num_deleted_key_ * 20 > capacity_) + { + rehashing(); + } + } + } + else // not found the key, insert it with delta as value + { + key_[pos.second] = internal_key; + value_[pos.second] = delta; + } + } + } + + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t hybrid_map::operator[](int32_t key) const + { + if (is_dense_) + { + //return memory_[key]; + if (capacity_ > 0) + { + return memory_[key]; + } + else + { + return 0; + } + } + else + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + return value_[pos.first]; + } + else + { + return 0; + } + } + } + inline std::pair hybrid_map::find_position(const int32_t key) const + { + int num_probes = 0; + int32_t capacity_minus_one = capacity_ - 1; + int32_t idx = key % capacity_; + int32_t insert_pos = ILLEGAL_BUCKET; + while (1) // probe until something happens + { + if (key_[idx] == empty_key_) // bucket is empty + { + if (insert_pos == ILLEGAL_BUCKET) // found no prior place to insert + { + return std::pair(ILLEGAL_BUCKET, idx); + } + else // previously, there is a position to insert + { + return std::pair(ILLEGAL_BUCKET, insert_pos); + } + } + else if (key_[idx] == deleted_key_) // keep searching, but makr to insert + { + if (insert_pos == ILLEGAL_BUCKET) + { + insert_pos = idx; + } + } + else if (key_[idx] == key) + { + return std::pair(idx, ILLEGAL_BUCKET); + } + ++num_probes; // we are doing another probe + idx = (idx + JUMP_(key, num_probes) & capacity_minus_one); + assert(num_probes < capacity_); // && "Hashtable is full: an error in key_equal<> or hash<>"); + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_document.cpp b/src/Native/LdaNative/lda_document.cpp new file mode 100644 index 0000000000..c2a3371020 --- /dev/null +++ b/src/Native/LdaNative/lda_document.cpp @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
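The sparse side of hybrid_map (hybrid_map.h above) is a small open-addressing table: keys are stored shifted by one so 0 can serve as the empty marker and -1 as the deleted marker, probing is quadratic (the i-th probe advances by i, as JUMP_ defines), and the probe index wraps with & (capacity_ - 1), which appears to assume a power-of-two capacity. Below is a stripped-down sketch of find_position/inc under those assumptions; deleted-key bookkeeping and rehashing are omitted and the names are hypothetical.

// Illustrative only: the quadratic-probing scheme hybrid_map uses for sparse rows.
// Keys are shifted by +1 so 0 can mean "empty" and -1 "deleted"; capacity is
// assumed to be a power of two so `& (cap - 1)` wraps the probe index.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct TinyHash {
    static const int32_t kEmpty = 0, kDeleted = -1;
    std::vector<int32_t> key, value;
    int32_t cap;

    explicit TinyHash(int32_t capacity) : key(capacity, 0), value(capacity, 0), cap(capacity) {
        assert((cap & (cap - 1)) == 0);             // power-of-two capacity
    }

    // Returns (slot holding the key, slot to insert at); exactly one is valid (-1 means "none").
    std::pair<int32_t, int32_t> Find(int32_t k) const {
        int32_t idx = k % cap, probes = 0, insert_pos = -1;
        while (true) {
            if (key[idx] == kEmpty)
                return {-1, insert_pos == -1 ? idx : insert_pos};
            if (key[idx] == kDeleted && insert_pos == -1)
                insert_pos = idx;                   // remember the hole, keep probing
            else if (key[idx] == k)
                return {idx, -1};
            ++probes;
            idx = (idx + probes) & (cap - 1);       // quadratic probing
            assert(probes < cap);                   // table must not be full
        }
    }

    void Inc(int32_t external_key, int32_t delta) {
        int32_t k = external_key + 1;               // shift so 0 stays "empty"
        auto pos = Find(k);
        if (pos.first != -1) value[pos.first] += delta;
        else { key[pos.second] = k; value[pos.second] = delta; }
    }

    int32_t Get(int32_t external_key) const {
        auto pos = Find(external_key + 1);
        return pos.first != -1 ? value[pos.first] : 0;
    }
};

int main() {
    TinyHash h(8);
    h.Inc(5, 2); h.Inc(13, 1); h.Inc(5, 3);         // 5 and 13 collide modulo 8
    printf("count[5]=%d count[13]=%d count[2]=%d\n", h.Get(5), h.Get(13), h.Get(2));
    return 0;
}

The real inc additionally marks a slot deleted when its count reaches zero and triggers rehashing once deleted keys exceed roughly 5% of capacity, which keeps probe chains short during training.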
+ +#include "lda_document.h" + +namespace lda +{ + LDADocument::LDADocument(int32_t* memory_begin, int32_t* memory_end) : + memory_begin_(memory_begin), memory_end_(memory_end), cursor_(*memory_begin) {} + + // should be called when sweeped over all the tokens in a document + void LDADocument::ResetCursor() + { + cursor_ = 0; + } + void LDADocument::GetDocTopicCounter(lda::light_hash_map& doc_topic_counter) + { + int32_t* p = memory_begin_ + 2; + int32_t num = 0; + while (p < memory_end_) + { + doc_topic_counter.inc(*p, 1); + ++p; ++p; + if (++num == 512) + return; + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_document.h b/src/Native/LdaNative/lda_document.h new file mode 100644 index 0000000000..45df42f06c --- /dev/null +++ b/src/Native/LdaNative/lda_document.h @@ -0,0 +1,60 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include "light_hash_map.h" + +namespace lda +{ + class LDADocument + { + public: + const int32_t kMaxSizeLightHash = 512; // This is for the easy use of LightHashMap + + LDADocument(int32_t* memory_begin, int32_t* memory_end); + + inline int32_t size() const; + inline int32_t& get_cursor(); + inline int32_t Word(int32_t index) const; + inline int32_t Topic(int32_t index) const; + inline void SetTopic(int32_t index, int32_t topic); + + // should be called when sweeped over all the tokens in a document + void ResetCursor(); + void GetDocTopicCounter(lda::light_hash_map& doc_topic_counter); + + private: + LDADocument(const LDADocument &other) = delete; + LDADocument& operator=(const LDADocument &other) = delete; + + int32_t* memory_begin_; + int32_t* memory_end_; + int32_t& cursor_; // cursor_ is reference of *memory_begin_ + }; + + inline int32_t LDADocument::size() const + { + return (std::min)(static_cast((memory_end_ - memory_begin_) / 2), kMaxSizeLightHash); + } + inline int32_t& LDADocument::get_cursor() + { + return cursor_; + } + inline int32_t LDADocument::Word(int32_t index) const + { + return *(memory_begin_ + 1 + index * 2); + } + inline int32_t LDADocument::Topic(int32_t index) const + { + return *(memory_begin_ + 2 + index * 2); + } + inline void LDADocument::SetTopic(int32_t index, int32_t topic) + { + *(memory_begin_ + 2 + index * 2) = topic; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_engine.cpp b/src/Native/LdaNative/lda_engine.cpp new file mode 100644 index 0000000000..5650ce73b5 --- /dev/null +++ b/src/Native/LdaNative/lda_engine.cpp @@ -0,0 +1,1066 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "timer.h" +#include "rand_int_rng.h" +#include "lda_document.h" +#include "data_block.h" +#include "model_block.h" +#include "lda_engine.hpp" +#include "utils.hpp" +#include "simple_barrier.h" +#include "light_doc_sampler.hpp" + +#ifdef _MSC_VER +#include "windows.h" +#elif defined(__APPLE__) +#include +#include +#include +#else +#include "sched.h" +#endif + +namespace lda { + LdaEngine::LdaEngine(int numTopic, + int numVocab, + float alphaSum, + float beta, + int numIter, + int likelihoodInterval, + int numThread, + int mhstep, + int maxDocToken) + : K_(numTopic), + V_(numVocab), + compute_ll_interval_(likelihoodInterval), + beta_(beta), + num_iterations_(numIter), + mh_step_(mhstep), + alpha_sum_(alphaSum), + maxDocToken_(maxDocToken), + samplers_(nullptr), + document_buffer_(nullptr) + { + if (numThread > 0) + { + num_threads_ = numThread; + } + else + { + unsigned int uNumCPU = std::thread::hardware_concurrency(); + num_threads_ = std::max(1, (int)(uNumCPU - 2)); + } + printf("using %d thread(s) to do train/test\n", num_threads_); + + bAlphaSumMultiplied = false; + atomic_stats_ = new LDAEngineAtomics(); + model_block_ = new LDAModelBlock(); + data_block_ = new LDADataBlock(num_threads_); + process_barrier_ = new SimpleBarrier(num_threads_); + samplerQueue_ = new CBlockedIntQueue(); + + document_buffer_ = new int32_t*[num_threads_]; + for (int i = 0; i < num_threads_; i++) + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + likelihood_in_iter_ = nullptr; + + beta_sum_ = beta_ * V_; + } + + LdaEngine::LdaEngine(int32_t K, int32_t V, int32_t num_threads, int32_t compute_ll_interval, float beta, int32_t num_iterations, int32_t mh_step, float alpha_sum, int maxDocToken) + : K_(K), + V_(V), + compute_ll_interval_(compute_ll_interval), + beta_(beta), + num_iterations_(num_iterations), + mh_step_(mh_step), + alpha_sum_(alpha_sum), + maxDocToken_(maxDocToken), + samplers_(nullptr), + document_buffer_(nullptr) + { + if (num_threads > 0) + { + num_threads_ = num_threads; + } + else + { + unsigned int uNumCPU = std::thread::hardware_concurrency(); + num_threads_ = std::max(1, (int)(uNumCPU - 2)); + } + bAlphaSumMultiplied = false; + process_barrier_ = new SimpleBarrier(num_threads_); + atomic_stats_ = new LDAEngineAtomics(); + data_block_ = new LDADataBlock(num_threads_); + model_block_ = new LDAModelBlock(); + samplerQueue_ = new CBlockedIntQueue(); + + document_buffer_ = new int32_t*[num_threads_]; + for (int i = 0; i < num_threads_; i++) + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + likelihood_in_iter_ = nullptr; + beta_sum_ = beta_ * V_; + } + + + LdaEngine::~LdaEngine() + { + //delete memory space + delete process_barrier_; + process_barrier_ = nullptr; + + delete data_block_; + data_block_ = nullptr; + + delete atomic_stats_; + atomic_stats_ = nullptr; + + delete model_block_; + model_block_ = nullptr; + + delete samplerQueue_; + samplerQueue_ = nullptr; + + for (int i = 0; i < num_threads_; ++i) + { + delete samplers_[i]; + } + delete[] samplers_; + + if (document_buffer_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete[]document_buffer_[i]; + document_buffer_[i] = nullptr; + } + delete[]document_buffer_; + document_buffer_ = nullptr; + } + + if (likelihood_in_iter_) + { + delete[] likelihood_in_iter_; + likelihood_in_iter_ = nullptr; + } + } + + bool LdaEngine::InitializeBeforeTrain() + { + CTimer tmDebug(true); + 
CheckFunction(0, tmDebug, "enter initializeBeforeTrain", false); + //allocate model memory from the data preloaded + AllocateModelMemory(data_block_); + CheckFunction(0, tmDebug, "allocate model memory", false); + + double alloc_start = lda::get_time(); + global_word_topic_table_.resize(V_); + alias_rng_int_.Init(K_); + beta_k_v_.resize(K_); + global_alias_k_v_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_alias_k_v_[i] = model_block_->get_alias_row(i); + } + global_summary_row_.resize(K_); + CheckFunction(0, tmDebug, "initlaizing global tables used in sampling", false); + + word_range_for_each_thread_.resize(num_threads_ + 1); + int32_t word_num_each_thread = V_ / num_threads_; + word_range_for_each_thread_[0] = 0; + for (int32_t i = 0; i < num_threads_ - 1; ++i) + { + word_range_for_each_thread_[i + 1] = word_range_for_each_thread_[i] + word_num_each_thread; + } + word_range_for_each_thread_[num_threads_] = V_; + + //setup sampler + samplers_ = new LightDocSampler*[num_threads_]; + samplerQueue_->clear(); + + for (int i = 0; i < num_threads_; ++i) + { + samplers_[i] = new LightDocSampler( + K_, + V_, + num_threads_, + mh_step_, + beta_, + alpha_sum_, + global_word_topic_table_, + global_summary_row_, + global_alias_k_v_, + beta_height_, + beta_mass_, + beta_k_v_); + + samplerQueue_->push(i); + } + CheckFunction(0, tmDebug, "create samplers", false); + return true; + } + + void LdaEngine::InitializeBeforeTest() + { + // TODO: + // Allocating space for word-topic-table and alias table according to the input data of SetModel interface (done) + // Create multiple thread-specific sampler + // set word_range_for_each_thread_ + // Adjust the alpha_sum_ parameter for each thread-specific sampler + CTimer tmDebug(true); + CheckFunction(0, tmDebug, "enter initializeBeforeTest", false); + + global_word_topic_table_.resize(V_); + alias_rng_int_.Init(K_); + beta_k_v_.resize(K_); + global_alias_k_v_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_alias_k_v_[i] = model_block_->get_alias_row(i); + } + CheckFunction(0, tmDebug, "initlaizing global tables used in sampling", false); + + // Set the word range for each thread + word_range_for_each_thread_.resize(num_threads_ + 1); + int32_t word_num_each_thread = V_ / num_threads_; + word_range_for_each_thread_[0] = 0; + for (int32_t i = 0; i < num_threads_ - 1; ++i) + { + word_range_for_each_thread_[i + 1] = word_range_for_each_thread_[i] + word_num_each_thread; + } + word_range_for_each_thread_[num_threads_] = V_; + + //setup sampler + if (samplers_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete samplers_[i]; + } + delete[] samplers_; + } + if (document_buffer_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete[]document_buffer_[i]; + document_buffer_[i] = nullptr; + } + delete[]document_buffer_; + document_buffer_ = nullptr; + } + + samplers_ = new LightDocSampler*[num_threads_]; + document_buffer_ = new int32_t*[num_threads_]; + samplerQueue_->clear(); + + for (int i = 0; i < num_threads_; ++i) + { + samplers_[i] = new LightDocSampler( + K_, + V_, + num_threads_, + mh_step_, + beta_, + alpha_sum_, + global_word_topic_table_, + global_summary_row_, + global_alias_k_v_, + beta_height_, + beta_mass_, + beta_k_v_); + + samplers_[i]->AdaptAlphaSum(false); + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + samplerQueue_->push(i); + } + CheckFunction(0, tmDebug, "create samplers", false); + + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + 
beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, samplers_[0]->rng()); + + // build alias table for the sparse term + for (int thread_id = 0; thread_id < num_threads_; ++thread_id) + { + LightDocSampler &sampler = *(samplers_[thread_id]); + sampler.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + } + CheckFunction(0, tmDebug, "build alisa table", false); + } + + void LdaEngine::Train(const char* pTrainOutput) + { + std::vector threads(num_threads_); + atomic_stats_->thread_counter_ = 0; + + for (auto& thr : threads) { + thr = std::thread(&LdaEngine::Training_Thread, this); + } + + printf("started training with %d threads\n", num_threads_); + for (auto& thr : threads) { + thr.join(); + } + + if (pTrainOutput) + { + DumpDocTopicTable(pTrainOutput); + } + } + + void LdaEngine::Test(int32_t burnin_iter, float* pLoglikelihood) + { + std::vector threads(num_threads_); + atomic_stats_->thread_counter_ = 0; + burnin_iterations_ = burnin_iter; + + likelihood_in_iter_ = new float[burnin_iterations_]; + for (int i = 0; i < burnin_iterations_; i++) + { + likelihood_in_iter_[i] = 0.0; + } + + for (auto& thr : threads) { + thr = std::thread(&LdaEngine::Testing_Thread, this); + } + + printf("started testing with %d threads\n", num_threads_); + + for (auto& thr : threads) { + thr.join(); + } + + //get the loglikelihood of each burn in iteration + for (int i = 0; i < burnin_iterations_; i++) + { + pLoglikelihood[i] = likelihood_in_iter_[i]; //just set an arbitary value here for later update + } + } + + void LdaEngine::CheckFunction(int thread_id, CTimer &tmDebug, const char* msg, bool waitBarrier) + { + } + + void LdaEngine::Training_Thread() + { + CTimer tmDebug(true); + + int thread_id = atomic_stats_->thread_counter_++; + std::vector> llcontainer; + // Set core affinity which helps performance improvement +#ifdef _MSC_VER + long long maskLL = 0; + maskLL |= (1LL << (thread_id)); + DWORD_PTR mask = maskLL; + SetThreadAffinityMask(GetCurrentThread(), mask); +#elif defined(__APPLE__) + thread_port_t thread = pthread_mach_thread_np(pthread_self()); + thread_affinity_policy_data_t policy = { thread_id }; + thread_policy_set(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +#else + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(thread_id, &set); + sched_setaffinity(0, sizeof(cpu_set_t), &set); +#endif + + // Each thread builds a portion of word-topic table. 
We do this way because each word-topic row + // has a thread-specific buffer for rehashing + process_barrier_->wait(); + LightDocSampler &sampler_ = *(samplers_[thread_id]); + sampler_.AdaptAlphaSum(true); + + sampler_.build_word_topic_table(thread_id, num_threads_, *model_block_); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize word_topic_table for sampler - in function train_thread"); + + int32_t token_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + int doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int topic = sampler_.rand_k(); + doc->SetTopic(i, topic); + } + int cursor = doc->get_cursor(); + token_num += sampler_.GlobalInit(doc.get()); + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize token topic before iterations - in function train_thread"); + + for (int i = 0; i < num_threads_; ++i) + { + std::vector& wtd_vec = samplers_[i]->get_word_topic_delta(thread_id); + for (auto& wtd : wtd_vec) + { + global_word_topic_table_[wtd.word].inc(wtd.topic, wtd.delta); + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize word topic model before iterations - in function train_thread"); + + // use thread-private delta table to get global table + { + std::lock_guard lock(atomic_stats_->global_mutex_); + + std::vector &summary = sampler_.get_delta_summary_row(); + for (int i = 0; i < K_; ++i) + { + global_summary_row_[i] += summary[i]; + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "global summary & Complete setup train before iterations - in function train_thread"); + + for (int iter = 0; iter < num_iterations_; ++iter) + { + CheckFunction(thread_id, tmDebug, "----------------------iteration start - in function train_thread---------------------"); + int32_t token_sweeped = 0; + atomic_stats_->num_tokens_clock_ = 0; + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + if (thread_id == 0) + { + beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, sampler_.rng()); + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "built alias table dense - in function train_thread"); + + // build alias table for the sparse term + sampler_.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "built alias table sparse - in function train_thread"); + + sampler_.EpocInit(); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "EpochInit - in function train_thread"); + + // main part of the training - sampling over documents in this iteration + double iter_start = lda::get_time(); + int32_t doc_start_local = data_block_->Begin(thread_id); + int32_t doc_end_local = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start_local; doc_index != doc_end_local; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + token_sweeped += sampler_.SampleOneDoc(doc.get()); + } + atomic_stats_->num_tokens_clock_ += token_sweeped; + + process_barrier_->wait(); + 
double iter_end = lda::get_time(); + + if (thread_id == 0) + { + double seconds_this_iter = iter_end - iter_start; + + printf("Iter: %04d", iter); + std::cout + << "\tThread = " << thread_id + << "\tTokens: " << atomic_stats_->num_tokens_clock_ + << "\tTook: " << seconds_this_iter << " sec" + << "\tThroughput: " + << static_cast(atomic_stats_->num_tokens_clock_) / (seconds_this_iter) << " token/(thread*sec)" + << std::endl; + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "train(gibbs sampling) - in function train_thread"); + + // syncup global table + double sync_start = lda::get_time(); + for (int i = 0; i < num_threads_; ++i) + { + std::vector & wtd_vec = samplers_[i]->get_word_topic_delta(thread_id); + for (auto& wtd : wtd_vec) + { + global_word_topic_table_[wtd.word].inc(wtd.topic, wtd.delta); + } + } + + // use thread-private delta table to update global table + { + std::lock_guard lock(atomic_stats_->global_mutex_); + std::vector &summary = sampler_.get_delta_summary_row(); + for (int i = 0; i < K_; ++i) + { + global_summary_row_[i] += summary[i]; + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "syncup global word_topic table - in function train_thread"); + + if (compute_ll_interval_ != -1 && (iter % compute_ll_interval_ == 0 || iter == num_iterations_ - 1)) + { + double ll = EvalLogLikelihood(true, thread_id, iter, sampler_); + llcontainer.push_back(std::pair(iter, ll)); + } + + CheckFunction(thread_id, tmDebug, "----------------------iteration end - in function train_thread---------------------"); + } + + if (thread_id == 0) + { + //output the ll once + for (int i = 0; i < llcontainer.size(); i++) + { + printf("loglikelihood @iter%04d = %f\n", llcontainer[i].first, llcontainer[i].second); + } + } + + process_barrier_->wait(); + + snprintf(tmDebug.m_szMessage, 200, "thread_id = %d, training iterations", thread_id); + tmDebug.InnerTag(); + } + + void LdaEngine::Testing_Thread() + { + int thread_id = atomic_stats_->thread_counter_++; + + // Set core affinity which helps performance improvement +#ifdef _MSC_VER + long long maskLL = 0; + maskLL |= (1LL << (thread_id)); + DWORD_PTR mask = maskLL; + SetThreadAffinityMask(GetCurrentThread(), mask); +#elif defined(__APPLE__) + thread_port_t thread = pthread_mach_thread_np(pthread_self()); + thread_affinity_policy_data_t policy = { thread_id }; + thread_policy_set(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +#else + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(thread_id, &set); + sched_setaffinity(0, sizeof(cpu_set_t), &set); +#endif + process_barrier_->wait(); + + //// Each thread builds a portion of word-topic table. 
We do this way because each word-topic row + //// has a thread-specific buffer for rehashing + LightDocSampler &sampler_ = *(samplers_[thread_id]); + sampler_.AdaptAlphaSum(false); + + double init_start = lda::get_time(); + int32_t token_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + int doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int topic = sampler_.rand_k(); + doc->SetTopic(i, topic); + } + int cursor = doc->get_cursor(); + token_num += sampler_.GlobalInit(doc.get()); + } + + process_barrier_->wait(); + + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + if (thread_id == 0) + { + beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, sampler_.rng()); + } + + // build alias table for the sparse term + double alias_start = lda::get_time(); + process_barrier_->wait(); + sampler_.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + process_barrier_->wait(); + + // print the log-likelihood before inference + EvalLogLikelihood(true, thread_id, 0, sampler_); + + double total_start = lda::get_time(); + for (int iter = 0; iter < burnin_iterations_; ++iter) + { + double iter_start = lda::get_time(); + int32_t token_sweeped = 0; + atomic_stats_->num_tokens_clock_ = 0; + int32_t doc_start_local = data_block_->Begin(thread_id); + int32_t doc_end_local = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start_local; doc_index != doc_end_local; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + token_sweeped += sampler_.InferOneDoc(doc.get()); + } + atomic_stats_->num_tokens_clock_ += token_sweeped; + + process_barrier_->wait(); + double iter_end = lda::get_time(); + + if (thread_id == 0) + { + double seconds_this_iter = iter_end - iter_start; + + printf("Iter: %04d", iter); + std::cout + << "\tThread = " << thread_id + << "\tTokens: " << atomic_stats_->num_tokens_clock_ + << "\tTook: " << seconds_this_iter << " sec" + << "\tThroughput: " + << static_cast(atomic_stats_->num_tokens_clock_) / (seconds_this_iter) << " token/(thread*sec)" + << std::endl; + + } + + process_barrier_->wait(); + + if (compute_ll_interval_ != -1 && (iter % compute_ll_interval_ == 0 || iter == burnin_iterations_ - 1)) + { + EvalLogLikelihood(false, thread_id, iter, sampler_); + } + } + + double total_end = lda::get_time(); + printf("thread_id = %d, Total time for burnin iterations : %f sec.\n", thread_id, total_end - total_start); + } + + void LdaEngine::AllocateDataMemory(int num_document, int64_t corpus_size) + { + data_block_->Allocate(num_document, corpus_size); + } + + void LdaEngine::AllocateModelMemory(const LDADataBlock* data_block) + { + model_block_->InitFromDataBlock(data_block, V_, K_); + + global_word_topic_table_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_word_topic_table_[i] = model_block_->get_row(i, nullptr); + } + } + + void LdaEngine::AllocateModelMemory(int num_vocabs, int num_topics, int64_t nonzero_num) + { + model_block_->Init(num_vocabs, num_topics, nonzero_num); + + 
global_word_topic_table_.resize(num_vocabs); + + for (int i = 0; i < num_vocabs; ++i) + { + global_word_topic_table_[i] = model_block_->get_row(i, nullptr); + } + } + + void LdaEngine::AllocateModelMemory(int num_vocabs, int num_topics, int64_t mem_block_size, int64_t alias_mem_block_size) + { + model_block_->Init(num_vocabs, num_topics, mem_block_size, alias_mem_block_size); //memory allocated here + + global_word_topic_table_.resize(num_vocabs); + global_summary_row_.resize(K_, 0); + + //each value inside the global_word_topic_table_ will be set while call SetWordTopic() + } + + int LdaEngine::FeedInData(int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size) + { + if (V_ == 0) //number vocab could be set in allocating model memory function + V_ = vocab_size; + + //data_block represent for one doc + return data_block_->Add(term_id, term_freq, term_num); + } + + int LdaEngine::FeedInDataDense(int* term_freq, int32_t term_num, int32_t vocab_size) + { + if (V_ == 0) //number vocab could be set in allocating model memory function + V_ = vocab_size; + + //data_block represent for one doc + return data_block_->AddDense(term_freq, term_num); + } + + void LdaEngine::TestOneDoc(int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + //numTopicsMax initialy holds the max returned topic number in order to hold the pTopic/pProbs memory in outside function + //when data return, numTopicsMax should contains the real topic number returned. + int sampler_id = 0; + sampler_id = samplerQueue_->pop(); + + LightDocSampler &sampler = *(samplers_[sampler_id]); + int64_t data_length = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + data_length += 2; + } + } + + assert(data_length <= maxDocToken_ * 2 + 1); + + if (reset) + { + // restart the rng seeds, so that we always get consistent result for the same input + rng_.restart(); + sampler.rng_restart(); + } + + // NOTE: in multi-threaded implementation, the dynamic memory allocation + // may cause contention at OS heap lock + int64_t idx = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + document_buffer_[sampler_id][idx++] = term_id[i]; + document_buffer_[sampler_id][idx++] = rng_.rand_k(K_); + } + } + + std::shared_ptr doc(new LDADocument(document_buffer_[sampler_id], document_buffer_[sampler_id] + data_length)); + + for (int iter = 0; iter < numBurnIter; ++iter) + { + sampler.InferOneDoc(doc.get()); + } + sampler.GetDocTopic(doc.get(), pTopics, pProbs, numTopicsMax); + + samplerQueue_->push(sampler_id); + } + + void LdaEngine::TestOneDocDense(int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + //numTopicsMax initialy holds the max returned topic number in order to hold the pTopic/pProbs memory in outside function + //when data return, numTopicsMax should contains the real topic number returned. 
+ int sampler_id = 0; + sampler_id = samplerQueue_->pop(); + + LightDocSampler &sampler = *(samplers_[sampler_id]); + int64_t data_length = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + data_length += 2; + } + } + + assert(data_length <= maxDocToken_ * 2 + 1); + + if (reset) + { + // restart the rng seeds, so that we always get consistent result for the same input + rng_.restart(); + sampler.rng_restart(); + } + + // NOTE: in multi-threaded implementation, the dynamic memory allocation + // may cause contention at OS heap lock + int64_t idx = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + document_buffer_[sampler_id][idx++] = i; + document_buffer_[sampler_id][idx++] = rng_.rand_k(K_); + } + } + + std::shared_ptr doc(new LDADocument(document_buffer_[sampler_id], document_buffer_[sampler_id] + data_length)); + + for (int iter = 0; iter < numBurnIter; ++iter) + { + sampler.InferOneDoc(doc.get()); + } + sampler.GetDocTopic(doc.get(), pTopics, pProbs, numTopicsMax); + + samplerQueue_->push(sampler_id); + } + + void LdaEngine::GetDocTopic(int docID, int* pTopic, int* pProb, int32_t& numTopicReturn) + { + //get the current topic vector of the document + int thread_id = 0; + LightDocSampler &sampler = *(samplers_[thread_id]); + + sampler.GetDocTopic(data_block_->GetOneDoc(docID).get(), pTopic, pProb, numTopicReturn); + } + + void LdaEngine::SetAlphaSum(float avgDocLength) + { + if (!bAlphaSumMultiplied) + { + alpha_sum_ = alpha_sum_ * avgDocLength; + bAlphaSumMultiplied = true; + } + printf("alpha_sum was set to %f", alpha_sum_); + } + + bool LdaEngine::ClearData() + { + data_block_->Clear(); + return true; + } + + bool LdaEngine::ClearModel() + { + model_block_->Clear(); + return true; + } + + //function to support dumping the topic_model model file + void LdaEngine::GetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length) + { + //cap the topic number here according to inpassed value of length + int lengthCap = length; + + // NOTE: we MUST check whether the word-topic row is empty before get its value + if (global_word_topic_table_[wordId].capacity() == 0) + { + length = 0; + return; + } + + length = 0; + for (int i = 0; i < K_; ++i) + { + if (global_word_topic_table_[wordId][i] > 0) + { + pTopic[length] = i; + pProb[length] = global_word_topic_table_[wordId][i]; + length++; + + if (length >= lengthCap) + break; + } + } + } + + // Compare by frequencies in descending order. + bool CompareTerms(const std::pair &term1, const std::pair &term2) + { + // REVIEW: consider changing this to impose a total order, since quicksort is not stable. 
+ return term2.second < term1.second; + } + + void LdaEngine::GetTopicSummary(int32_t topicId, int32_t* pWords, float* pProb, int32_t& length) + { + std::vector> allTermsVec; + int sumFreq = 0; + for (int i = 0; i < V_; i++) //for all the terms check the topic distribution + { + if (global_word_topic_table_[i][topicId] > 0) + { + std::pair p; + p.first = i; + p.second = global_word_topic_table_[i][topicId]; + allTermsVec.push_back(p); + sumFreq += global_word_topic_table_[i][topicId]; + } + } + + std::sort(allTermsVec.begin(), allTermsVec.end(), CompareTerms); + + int usedTerm = (int)allTermsVec.size(); + length = std::min(usedTerm, length); + for (int i = 0; i < length; i++) + { + pWords[i] = allTermsVec[i].first; + pProb[i] = (((float)(allTermsVec[i].second)) + beta_) / (sumFreq + beta_ * V_); + } + } + + //function to support loading the topic_model model file + void LdaEngine::SetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length) + { + //NOTE: whether we should really use the "true" here + model_block_->SetWordInfo(wordId, length, true); + global_word_topic_table_[wordId] = model_block_->get_row(wordId, nullptr); + + for (int i = 0; i < length; ++i) + { + global_word_topic_table_[wordId].inc(pTopic[i], pProb[i]); + global_summary_row_[pTopic[i]] += pProb[i]; + } + } + + void LdaEngine::GetModelStat(int64_t &memBlockSize, int64_t &aliasMemBlockSize) + { + //NOTE: get the model's value at the end of training stage. try to save these two numbers to disk file + model_block_->GetModelStat(memBlockSize, aliasMemBlockSize); + } + + double LdaEngine::EvalLogLikelihood(bool is_train, int thread_id, int iter, LightDocSampler &sampler) + { + double doc_ll = 0; + double word_ll = 0; + + if (thread_id == 0) + { + atomic_stats_->doc_ll_ = 0; + atomic_stats_->word_ll_ = 0; + } + process_barrier_->wait(); + + int doc_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + doc_ll += sampler.ComputeOneDocLLH(doc.get()); + doc_num++; + } + atomic_stats_->doc_ll_ = atomic_stats_->doc_ll_ + doc_ll; + process_barrier_->wait(); + + word_ll = sampler.ComputeWordLLH(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1]); + atomic_stats_->word_ll_ = atomic_stats_->word_ll_ + word_ll; + process_barrier_->wait(); + + double total_ll = 0; + if (thread_id == 0) + { + double normalized_ll = sampler.NormalizeWordLLH(); + + total_ll = 0; + total_ll += atomic_stats_->doc_ll_; + total_ll += atomic_stats_->word_ll_; + total_ll += normalized_ll; + + if (!is_train) + { + likelihood_in_iter_[iter] = (float)total_ll; + } + + std::cout << "Total likelihood: " << total_ll << "\t"; + std::cout << "..........[Nomralized word ll: " << normalized_ll << "\t" + << "Word likelihood: " << atomic_stats_->word_ll_ << "\t" + << "Doc likelihood: " << atomic_stats_->doc_ll_ << "]" << std::endl; + } + process_barrier_->wait(); + + return total_ll; + } + + void LdaEngine::DumpDocTopicTable(const std::string& doc_topic_file) + { + std::ofstream dt_stream; + dt_stream.open(doc_topic_file, std::ios::out); + assert(dt_stream.good()); + + int32_t num_documents = data_block_->num_documents(); + int32_t doc_start = 0; + int32_t doc_end = num_documents; + + lda::light_hash_map doc_topic_counter_(1024); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = 
data_block_->GetOneDoc(doc_index); + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + dt_stream << doc_index; + if (doc->size()) + { + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + int32_t nonzero_num = 0; + + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + dt_stream << " " << key[i] - 1 << ":" << value[i]; + } + } + } + dt_stream << std::endl; + } + dt_stream.close(); + } + + void LdaEngine::DumpFullModel(const std::string& word_topic_dump) + { + std::ofstream wt_stream; + wt_stream.open(word_topic_dump, std::ios::out); + assert(wt_stream.good()); + + for (int w = 0; w < V_; ++w) + { + int nonzero_num = global_word_topic_table_[w].nonzero_num(); + if (nonzero_num) + { + wt_stream << w; + for (int t = 0; t < K_; ++t) + { + if (global_word_topic_table_[w][t] > 0) + { + wt_stream << " " << t << ":" << global_word_topic_table_[w][t]; + } + } + wt_stream << std::endl; + } + } + wt_stream.close(); + + std::ofstream summary_stream; + summary_stream.open("summary_row.txt", std::ios::out); + for (int i = 0; i < K_; ++i) + { + summary_stream << global_summary_row_[i] << std::endl; + } + summary_stream.close(); + } +} // namespace lda diff --git a/src/Native/LdaNative/lda_engine.hpp b/src/Native/LdaNative/lda_engine.hpp new file mode 100644 index 0000000000..95a107f355 --- /dev/null +++ b/src/Native/LdaNative/lda_engine.hpp @@ -0,0 +1,144 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "lda_document.h" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" + +#include "alias_multinomial_rng_int.hpp" + +#ifdef _MSC_VER +#define EXPORT_API(ret) extern "C" __declspec(dllexport) ret __stdcall +#else +#define EXPORT_API(ret) extern "C" __attribute__((visibility("default"))) ret +#endif + +//ignore all such warnings since our stl class will not used internally in the class as private member +#pragma warning(disable : 4251) +class CTimer; +namespace lda { + + class LDADataBlock; + class LDAModelBlock; + class SimpleBarrier; + struct LDAEngineAtomics; + class LightDocSampler; + class CBlockedIntQueue; + + // Engine takes care of the entire pipeline of LDA, from reading data to + // spawning threads, to recording execution time and loglikelihood. 
+ class LdaEngine { + public: + LdaEngine(); + LdaEngine(int numTopic, + int numVocab, + float alphaSum, + float beta, + int numIter, + int likelihoodInterval, + int numThread, + int mhstep, + int maxDocToken); + + LdaEngine(int32_t K, int32_t V, int32_t num_threads, int32_t compute_ll_interval, float beta, int32_t num_iterations, int32_t mh_step, float alpha_sum, int maxDocToken); + + ~LdaEngine(); + + + void InitializeBeforeTest(); + bool InitializeBeforeTrain(); + void AllocateDataMemory(int num_document, int64_t corpus_size); + void AllocateModelMemory(const LDADataBlock* data_block); //in this case, model memory is allocated according to the datablock; + void AllocateModelMemory(int num_vocabs, int num_topics, int64_t nonzero_num); + void AllocateModelMemory(int num_vocabs, int num_topics, int64_t mem_block_size, int64_t alias_mem_block_size); + void SetAlphaSum(float avgDocLength); //alphasum parameter is set by avgdoclength * alpha + + //IO, data + bool ClearData(); //for clean up training data + bool ClearModel(); //for testing purpose, before calling SetWordTopic, please clear the old model + + int FeedInData(int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size); + int FeedInDataDense(int* term_freq, int32_t term_num, int32_t vocab_size); + + //IO, model + // NOTE: assume pTopic and pProb are allocated outside the function + // the length returned will be capped by the pass-in initial value of length(usually it's the size of preallocated memory for pTopic&pProb + void GetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length); + void SetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length); + void GetModelStat(int64_t &memBlockSize, int64_t &aliasMemBlockSize); + void GetTopicSummary(int32_t topicId, int32_t* pWords, float* pProb, int32_t& length); + + //mutlithread train/test with the data inside the engine + void Train(const char* pTrainOutput = nullptr); + void Test(int32_t burnin_iter, float* pLoglikelihood); + + //testing on single doc + void TestOneDoc(int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset); + void TestOneDocDense(int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset); + void GetDocTopic(int docID, int* pTopic, int* pProb, int32_t& numTopicReturn); // use this function to get the doc's topic output in batch testing scenario + + //output model(word topic) and doc topic + void DumpFullModel(const std::string& word_topic_dump); + void DumpDocTopicTable(const std::string& doc_topic_file); + + private: + double EvalLogLikelihood(bool is_train, int thread_id, int iter, LightDocSampler &sampler); + + private: // private data + void Training_Thread(); + void Testing_Thread(); + void CheckFunction(int thread_id, CTimer& tmDebug, const char* msg, bool waitBarrier = true); + + // Number of topics + int32_t K_; + // Number of vocabs. + int32_t V_; + + int32_t compute_ll_interval_; + int32_t num_threads_; + int32_t num_iterations_; + int32_t burnin_iterations_; + int32_t mh_step_; + float beta_; + float alpha_sum_; + float beta_sum_; + int maxDocToken_; + bool bAlphaSumMultiplied; //used to check whether alpha_sum_ is real alpha sum but not alpha + std::vector word_range_for_each_thread_; + + LDAEngineAtomics* atomic_stats_; + SimpleBarrier* process_barrier_; // Local barrier across threads. 
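// Coordination pattern (see EvalLogLikelihood in lda_engine.cpp): thread 0 zeroes the
// shared accumulators in atomic_stats_, every worker calls process_barrier_->wait(),
// each worker then adds its own document-range and word-range contributions to
// atomic_stats_, and a final wait() lets thread 0 read the combined totals safely.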
+ + LDADataBlock* data_block_; + LDAModelBlock* model_block_; + + std::vector global_word_topic_table_; + std::vector global_alias_k_v_; + std::vector global_summary_row_; + + // for generating alias table of beta term + wood::AliasMultinomialRNGInt alias_rng_int_; + int32_t beta_height_; + float beta_mass_; + std::vector beta_k_v_; + + LightDocSampler **samplers_; + float* likelihood_in_iter_; + + // For TestDocSafe purpose + int32_t **document_buffer_; + + wood::xorshift_rng rng_; + CBlockedIntQueue *samplerQueue_; + }; +} // namespace lda diff --git a/src/Native/LdaNative/lda_engine_export.cpp b/src/Native/LdaNative/lda_engine_export.cpp new file mode 100644 index 0000000000..7f6bc62b70 --- /dev/null +++ b/src/Native/LdaNative/lda_engine_export.cpp @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include "lda_engine.hpp" + +/// This file use to expose public API to be consumed by ML.NET. +namespace lda { + + EXPORT_API(LdaEngine*) CreateEngine(int numTopic, int numVocab, float alphaSum, float beta, int numIter, int likelihoodInterval, int numThread, int mhstep, int maxDocToken) + { + return new LdaEngine(numTopic, numVocab, alphaSum, beta, numIter, likelihoodInterval, numThread, mhstep, maxDocToken); + } + + EXPORT_API(void) DestroyEngine(LdaEngine* engine) + { + delete engine; + } + + EXPORT_API(void) AllocateModelMemory(LdaEngine* engine, int numTopic, int numVocab, int64_t tableSize, int64_t aliasTableSize) + { + engine->AllocateModelMemory(numVocab, numTopic, tableSize, aliasTableSize); + } + + EXPORT_API(void) AllocateDataMemory(LdaEngine* engine, int num_document, int64_t corpus_size) + { + engine->AllocateDataMemory(num_document, corpus_size); + } + + EXPORT_API(void) Train(LdaEngine* engine, const char* trainOutput) + { + engine->Train(trainOutput); + } + + EXPORT_API(void) Test(LdaEngine* engine, int32_t burnin_iter, float* pLoglikelihood) + { + engine->Test(burnin_iter, pLoglikelihood); + } + + EXPORT_API(void) CleanData(LdaEngine* engine) + { + engine->ClearData(); + } + + EXPORT_API(void) CleanModel(LdaEngine* engine) + { + engine->ClearModel(); + } + + EXPORT_API(void) GetModelStat(LdaEngine* engine, int64_t &memBlockSize, int64_t &aliasMemBlockSize) + { + engine->GetModelStat(memBlockSize, aliasMemBlockSize); + } + + EXPORT_API(void) GetWordTopic(LdaEngine* engine, int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length) + { + engine->GetWordTopic(wordId, pTopic, pProb, length); + } + + EXPORT_API(void) SetWordTopic(LdaEngine* engine, int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length) + { + engine->SetWordTopic(wordId, pTopic, pProb, length); + } + + EXPORT_API(void) GetTopicSummary(LdaEngine* engine, int32_t topicId, int32_t* pWords, float* pProb, int32_t& length) + { + engine->GetTopicSummary(topicId, pWords, pProb, length); + } + + EXPORT_API(void) SetAlphaSum(LdaEngine* engine, float avgDocLength) + { + engine->SetAlphaSum(avgDocLength); + } + + EXPORT_API(int) FeedInData(LdaEngine* engine, int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size) + { + return engine->FeedInData(term_id, term_freq, term_num, vocab_size); + } + + EXPORT_API(int) FeedInDataDense(LdaEngine* engine, int* term_freq, int32_t term_num, int32_t vocab_size) + { + return engine->FeedInDataDense(term_freq, term_num, vocab_size); + } + + EXPORT_API(void) GetDocTopic(LdaEngine* engine, int 
docID, int* pTopic, int* pProb, int32_t& numTopicReturn) + { + engine->GetDocTopic(docID, pTopic, pProb, numTopicReturn); + } + + EXPORT_API(void) TestOneDoc(LdaEngine* engine, int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + engine->TestOneDoc(term_id, term_freq, term_num, pTopics, pProbs, numTopicsMax, numBurnIter, reset); + } + + EXPORT_API(void) TestOneDocDense(LdaEngine* engine, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + engine->TestOneDocDense(term_freq, term_num, pTopics, pProbs, numTopicsMax, numBurnIter, reset); + } + + EXPORT_API(void) InitializeBeforeTrain(LdaEngine* engine) + { + engine->InitializeBeforeTrain(); + } + + EXPORT_API(void) InitializeBeforeTest(LdaEngine* engine) + { + engine->InitializeBeforeTest(); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_doc_sampler.cpp b/src/Native/LdaNative/light_doc_sampler.cpp new file mode 100644 index 0000000000..ea628d3891 --- /dev/null +++ b/src/Native/LdaNative/light_doc_sampler.cpp @@ -0,0 +1,667 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include +#include +#include +#include +#include + +#include "lda_document.h" +#include "light_doc_sampler.hpp" + + +namespace lda +{ + LightDocSampler::LightDocSampler( + int32_t K, + int32_t V, + int32_t num_threads, + int32_t mh_step, + float beta, + float alpha_sum, + std::vector &word_topic_table, + std::vector &summary_row, + std::vector &alias_kv, + int32_t &beta_height, + float& beta_mass, + std::vector &beta_k_v) + : doc_topic_counter_(1024), + word_topic_table_(word_topic_table), summary_row_(summary_row), + alias_k_v_(alias_kv), + beta_height_(beta_height), + beta_mass_(beta_mass), + beta_k_v_(beta_k_v), + K_(K), + V_(V), + num_threads_(num_threads), + mh_step_for_gs_(mh_step), + beta_(beta), + alpha_sum_(alpha_sum) + { + beta_sum_ = beta_ * V_; + alpha_ = alpha_sum_ / K_; + + ll_alpha_ = (lda::real_t)0.01; + ll_alpha_sum_ = ll_alpha_ * K_; + + // Precompute LLH parameters + log_doc_normalizer_ = LogGamma(ll_alpha_ * K_) - K_ * LogGamma(ll_alpha_); + log_topic_normalizer_ = LogGamma(beta_sum_) - V_ * LogGamma(beta_); + + alias_rng_.Init(K_); + + q_w_proportion_.resize(K_); + delta_summary_row_.resize(K_); + word_topic_delta_.resize(num_threads_); + + rehashing_buf_ = new int32_t[K_ * 2]; + } + + LightDocSampler::~LightDocSampler() + { + delete[] rehashing_buf_; + } + + // Initialize word_topic_table and doc_topic_counter for each doc + int32_t LightDocSampler::GlobalInit(LDADocument *doc) + { + int32_t token_num = 0; + int32_t doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int32_t w = doc->Word(i); + int32_t t = doc->Topic(i); + + word_topic_delta wtd; + int32_t shard_id = w % num_threads_; + wtd.word = w; + wtd.topic = t; + wtd.delta = 1; + word_topic_delta_[shard_id].push_back(wtd); + + ++delta_summary_row_[t]; + + ++token_num; + } + return token_num; + } + + int32_t LightDocSampler::DocInit(LDADocument *doc) + { + int num_words = doc->size(); + + // compute the doc_topic_counter on the fly + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + doc_size_ = num_words; + n_td_sum_ = (lda::real_t)num_words; + + return 0; + } + + bool CompareFirstElement(const std::pair 
&p1, const std::pair &p2) + { + return p1.first < p2.first; + } + + void LightDocSampler::GetDocTopic(LDADocument *doc, int* pTopics, int* pProbs, int32_t& numTopicsMax) + { + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + // NOTE: do we have to assume this? + // probably first sort the topic vector according to the probs and keep the first numTopicsMax topics + // We assume the numTopicsMax is not less than the length of current document?? or it should be maxiumly the toipc number + // assert(numTopicsMax >= doc->size()); + + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + + std::vector> vec; + int32_t idx = 0; + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + std::pair pair; + pair.first = key[i] - 1; + pair.second = value[i]; + vec.push_back(pair); + idx++; + + if (idx == numTopicsMax) + break; + } + } + numTopicsMax = idx; + std::sort(vec.begin(), vec.end(), CompareFirstElement); + for (int i = 0; i < idx; i++) + { + pTopics[i] = vec[i].first; + pProbs[i] = vec[i].second; + } + } + + void LightDocSampler::EpocInit() + { + std::fill(delta_summary_row_.begin(), delta_summary_row_.end(), 0); + for (auto &shard : word_topic_delta_) + { + shard.clear(); + } + } + + void LightDocSampler::AdaptAlphaSum(bool is_train) + { + rng_.restart(); //reset the sampler so that we will get deterministic result by different runs, train-test, train-save-test, etc. + + if (is_train) + { + if (alpha_sum_ < 10) + { + alpha_sum_ = 100; + } + } + else + { + if (alpha_sum_ > 10) + { + alpha_sum_ = 1; + } + } + alpha_ = alpha_sum_ / K_; + } + + void LightDocSampler::build_alias_table(int32_t lower, int32_t upper, int thread_id) + { + for (int w = lower; w < upper; ++w) + { + GenerateAliasTableforWord(w); + } + } + void LightDocSampler::build_word_topic_table(int32_t thread_id, int32_t num_threads, lda::LDAModelBlock &model_block) + { + for (int i = 0; i < V_; ++i) + { + if (i % num_threads == thread_id) + { + word_topic_table_[i] = model_block.get_row(i, rehashing_buf_); + } + } + } + + int32_t LightDocSampler::SampleOneDoc(LDADocument *doc) + { + return OldProposalFreshSample(doc); + } + + int32_t LightDocSampler::InferOneDoc(LDADocument *doc) + { + return OldProposalFreshSampleInfer(doc); + } + int32_t LightDocSampler::Sample2WordFirst(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic) + { + int32_t w_t_cnt; + int32_t w_s_cnt; + + real_t n_td_alpha; + real_t n_sd_alpha; + real_t n_tw_beta; + real_t n_sw_beta; + real_t n_s_beta_sum; + real_t n_t_beta_sum; + + real_t proposal_s; + real_t proposal_t; + + real_t nominator; + real_t denominator; + + real_t rejection; + real_t pi; + int m; + + for (int i = 0; i < mh_step_for_gs_; ++i) + { + int32_t t; + + t = alias_k_v_[w].next(rng_, beta_height_, beta_mass_, beta_k_v_, false); + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + + w_s_cnt = get_word_topic(w, s); + w_t_cnt = get_word_topic(w, t); + + if (s != old_topic && t != old_topic) + { + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s != old_topic && t == old_topic) + { + n_td_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_ - 1; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if 
(s == old_topic && t != old_topic) + { + n_sd_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + n_sw_beta = w_s_cnt + beta_ - 1; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + else + { + n_td_alpha -= 1; + n_sd_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_ - 1; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + n_sw_beta = w_s_cnt + beta_ - 1; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + + proposal_s = (w_s_cnt + beta_) / (summary_row_[s] + beta_sum_); + proposal_t = (w_t_cnt + beta_) / (summary_row_[t] + beta_sum_); + + nominator = n_td_alpha + * n_tw_beta + * n_s_beta_sum + * proposal_s; + + denominator = n_sd_alpha + * n_sw_beta + * n_t_beta_sum + * proposal_t; + + + pi = std::min((real_t)1.0, nominator / denominator); + + // s = rejection < pi ? t : s; + m = -(rejection < pi); + s = (t & m) | (s & ~m); + + real_t n_td_or_alpha = rng_.rand_real() * (n_td_sum_ + alpha_sum_); + if (n_td_or_alpha < n_td_sum_) + { + int32_t t_idx = rng_.rand_k(doc_size_); + t = doc->Topic(t_idx); + } + else + { + t = rng_.rand_k(K_); + } + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + + if (s != old_topic && t != old_topic) + { + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s != old_topic && t == old_topic) + { + n_td_alpha -= 1; + + w_t_cnt = get_word_topic(w, t) - 1; + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s == old_topic && t != old_topic) + { + n_sd_alpha -= 1; + + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s) - 1; + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + else + { + n_td_alpha -= 1; + n_sd_alpha -= 1; + + w_t_cnt = get_word_topic(w, t) - 1; + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + w_s_cnt = get_word_topic(w, s) - 1; + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + + proposal_t = doc_topic_counter_[t] + alpha_; + proposal_s = doc_topic_counter_[s] + alpha_; + + nominator = n_td_alpha + * n_tw_beta + * n_s_beta_sum + * proposal_s; + + denominator = n_sd_alpha + * n_sw_beta + * n_t_beta_sum + * proposal_t; + + + pi = std::min((real_t)1.0, nominator / denominator); + + // s = rejection < pi ? 
t : s; + m = -(rejection < pi); + s = (t & m) | (s & ~m); + } + int32_t src = s; + return src; + } + + int32_t LightDocSampler::Sample2WordFirstInfer(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic) + { + int32_t w_t_cnt; + int32_t w_s_cnt; + + float n_td_alpha; + float n_sd_alpha; + float n_tw_beta; + float n_sw_beta; + float n_s_beta_sum; + float n_t_beta_sum; + + float nominator; + float denominator; + + float rejection; + float pi; + int m; + + for (int i = 0; i < mh_step_for_gs_; ++i) + { + int32_t t; + t = alias_k_v_[w].next(rng_, beta_height_, beta_mass_, beta_k_v_, false); + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + nominator = n_td_alpha; + denominator = n_sd_alpha; + + pi = std::min((float)1.0, nominator / denominator); + + m = -(rejection < pi); + s = (t & m) | (s & ~m); + + float n_td_or_alpha = rng_.rand_real() * (n_td_sum_ + alpha_sum_); + if (n_td_or_alpha < n_td_sum_) + { + int32_t t_idx = rng_.rand_k(doc_size_); + t = doc->Topic(t_idx); + } + else + { + t = rng_.rand_k(K_); + } + + rejection = rng_.rand_real(); + + + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + + nominator = n_tw_beta + * n_s_beta_sum; + + + denominator = n_sw_beta + * n_t_beta_sum; + + pi = std::min((float)1.0, nominator / denominator); + + m = -(rejection < pi); + s = (t & m) | (s & ~m); + } + int32_t src = s; + return src; + } + + int32_t LightDocSampler::OldProposalFreshSample(LDADocument *doc) + { + DocInit(doc); + int num_token = doc->size(); + int32_t &cursor = doc->get_cursor(); + + int32_t token_sweeped = 0; + cursor = 0; + + while (cursor < num_token) + { + ++token_sweeped; + + int32_t w = doc->Word(cursor); + int32_t s = doc->Topic(cursor); // old topic + + int t = Sample2WordFirst(doc, w, s, s); // new topic + + if (s != t) + { + word_topic_delta wtd; + int32_t shard_id = w % num_threads_; + wtd.word = w; + wtd.topic = s; + wtd.delta = -1; + word_topic_delta_[shard_id].push_back(wtd); + + wtd.topic = t; + wtd.delta = +1; + word_topic_delta_[shard_id].push_back(wtd); + + --delta_summary_row_[s]; + ++delta_summary_row_[t]; + + doc->SetTopic(cursor, t); + doc_topic_counter_.inc(s, -1); + doc_topic_counter_.inc(t, 1); + } + cursor++; + } + return token_sweeped; + } + + int32_t LightDocSampler::OldProposalFreshSampleInfer(LDADocument *doc) + { + + DocInit(doc); + int num_token = doc->size(); + int32_t &cursor = doc->get_cursor(); + + int32_t token_sweeped = 0; + cursor = 0; + + while (cursor < num_token) + { + ++token_sweeped; + + int32_t w = doc->Word(cursor); + int32_t s = doc->Topic(cursor); // old topic + + int t = Sample2WordFirstInfer(doc, w, s, s); // new topic + + if (s != t) + { + doc->SetTopic(cursor, t); + doc_topic_counter_.inc(s, -1); + doc_topic_counter_.inc(t, 1); + } + cursor++; + } + return token_sweeped; + } + + double LightDocSampler::NormalizeWordLLH() + { + double word_llh = K_ * log_topic_normalizer_; + for (int k = 0; k < K_; ++k) + { + word_llh -= LogGamma(summary_row_[k] + beta_sum_); + } + return word_llh; + } + + + double LightDocSampler::ComputeOneDocLLH(LDADocument* doc) + { + double doc_ll = 0; + double one_doc_llh = log_doc_normalizer_; + + // Compute doc-topic vector on the fly. 
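// In formula form, the per-document term accumulated below is
//   log Gamma(K * alpha) - K * log Gamma(alpha)      (log_doc_normalizer_)
//   + sum_k log Gamma(n_kd + alpha)                  (nonzero topics in the loop, plus
//                                                     (K - nonzero_num) * log Gamma(alpha))
//   - log Gamma(n_d + K * alpha)
// where alpha is ll_alpha_, n_kd is the number of tokens in this document assigned to
// topic k, and n_d is the document length.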
+ int num_tokens = doc->size(); + + if (num_tokens == 0) + { + return doc_ll; + } + + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + int32_t nonzero_num = 0; + + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + one_doc_llh += LogGamma(value[i] + ll_alpha_); + ++nonzero_num; + } + } + one_doc_llh += (K_ - nonzero_num) * LogGamma(ll_alpha_); + one_doc_llh -= LogGamma(num_tokens + ll_alpha_ * K_); + + doc_ll += one_doc_llh; + return doc_ll; + } + + double LightDocSampler::ComputeWordLLH(int32_t lower, int32_t upper) + { + // word_llh is P(w|z). + double word_llh = 0; + double zero_entry_llh = LogGamma(beta_); + + // Since some vocabs are not present in the corpus, use num_words_seen to + // count # of words in corpus. + int num_words_seen = 0; + for (int w = lower; w < upper; ++w) + { + auto word_topic_row = get_word_row(w); + int32_t total_count = 0; + double delta = 0; + if (word_topic_row.is_dense()) + { + int32_t* memory = word_topic_row.memory(); + int32_t capacity = word_topic_row.capacity(); + int32_t count; + for (int i = 0; i < capacity; ++i) + { + count = memory[i]; + total_count += count; + delta += LogGamma(count + beta_); + } + } + else + { + int32_t* key = word_topic_row.key(); + int32_t* value = word_topic_row.value(); + int32_t capacity = word_topic_row.capacity(); + int32_t count; + int32_t nonzero_num = 0; + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + count = value[i]; + total_count += count; + delta += LogGamma(count + beta_); + ++nonzero_num; + } + } + delta += (K_ - nonzero_num) * zero_entry_llh; + } + + if (total_count) + { + word_llh += delta; + } + } + + return word_llh; + } + + void LightDocSampler::Dump(const std::string &dump_name, int32_t lower, int32_t upper) + { + std::ofstream wt_stream; + wt_stream.open(dump_name, std::ios::out); + + for (int w = lower; w < upper; ++w) + { + //why not just a serialization of current hybrid_map? do we need to do a search? + int nonzero_num = word_topic_table_[w].nonzero_num(); + if (nonzero_num) + { + wt_stream << w; + for (int t = 0; t < K_; ++t) + { + if (word_topic_table_[w][t] > 0) + { + wt_stream << " " << t << ":" << word_topic_table_[w][t]; + } + } + wt_stream << std::endl; + } + } + wt_stream.close(); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_doc_sampler.hpp b/src/Native/LdaNative/light_doc_sampler.hpp new file mode 100644 index 0000000000..82e37b3bc5 --- /dev/null +++ b/src/Native/LdaNative/light_doc_sampler.hpp @@ -0,0 +1,187 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
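// Design note: in the training sweep (OldProposalFreshSample) topic reassignments are not
// applied to the shared word_topic_table_ immediately; they are buffered as word_topic_delta
// entries, sharded by word % num_threads_, alongside delta_summary_row_. The accessors
// get_word_topic_delta / get_delta_summary_row expose these buffers so the engine can fold
// them back into the global tables, and EpocInit() clears them for the next pass.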
+ +#pragma once + +#include "type_common.h" +#include "lda_document.h" +#include "rand_int_rng.h" +#include +#include +#include +#include +#include +#include +#include "alias_multinomial_rng_int.hpp" +#include "light_hash_map.h" +#include "utils.hpp" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" +#include "model_block.h" + +namespace lda +{ + struct word_topic_delta + { + int32_t word; + int32_t topic; + int32_t delta; + }; + + class LightDocSampler + { + public: + LightDocSampler( + int32_t K, + int32_t V, + int32_t num_threads, + int32_t mh_step, + float beta, + float alpha_sum, + std::vector &word_topic_table, + std::vector &summary_row, + std::vector &alias_kv, + int32_t &beta_height, + float &beta_mass, + std::vector &beta_k_v + ); + + ~LightDocSampler(); + + int32_t GlobalInit(LDADocument *doc); + int32_t DocInit(LDADocument *doc); + void EpocInit(); + void AdaptAlphaSum(bool is_train); + void GetDocTopic(LDADocument *doc, int* pTopics, int* pProbs, int32_t& numTopicsMax); + + + int32_t SampleOneDoc(LDADocument *doc); + int32_t InferOneDoc(LDADocument *doc); + + // The i-th complete-llh calculation will use row i in llh_able_. This is + // part of log P(z) in eq.[3]. + double ComputeOneDocLLH(LDADocument* doc); + double ComputeWordLLH(int32_t lower, int32_t upper); + double NormalizeWordLLH(); + + inline void rng_restart() + { + rng_.restart(); + } + + + void Dump(const std::string &dump_name, int32_t lower, int32_t upper); + + void build_alias_table(int32_t lower, int32_t upper, int thread_id); + void build_word_topic_table(int32_t thread_id, int32_t num_threads, lda::LDAModelBlock &model_block); + + inline int32_t rand_k(); + inline wood::xorshift_rng& rng(); + inline lda::hybrid_map& get_word_row(int32_t word); + inline std::vector &get_summary_row(); + inline std::vector& get_word_topic_delta(int32_t thread_id); + inline std::vector& get_delta_summary_row(); + + private: + int32_t Sample2WordFirst(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic); + int32_t Sample2WordFirstInfer(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic); + inline void GenerateAliasTableforWord(int32_t word); + inline int32_t get_word_topic(int32_t word, int32_t topic); + inline void word_topic_dec(int32_t word, int32_t topic); + inline void word_topic_inc(int32_t word, int32_t topic); + int32_t OldProposalFreshSample(LDADocument *doc); + int32_t OldProposalFreshSampleInfer(LDADocument *doc); + + private: + int32_t num_tokens_; + int32_t num_unique_words_; + + int32_t K_; + int32_t V_; + real_t beta_; + real_t beta_sum_; + real_t alpha_; + real_t alpha_sum_; + + real_t ll_alpha_; + real_t ll_alpha_sum_; + + real_t delta_alpha_sum_; + + std::vector q_w_proportion_; + wood::AliasMultinomialRNGInt alias_rng_; + wood::xorshift_rng rng_; + std::vector &alias_k_v_; + + int32_t doc_size_; + + // the number of Metropolis Hastings step + int32_t mh_step_for_gs_; + real_t n_td_sum_; + + // model + std::vector &summary_row_; + std::vector &word_topic_table_; + int32_t *rehashing_buf_; + + int32_t &beta_height_; + float &beta_mass_; + std::vector &beta_k_v_; + + // delta + std::vector delta_summary_row_; + + int32_t num_threads_; + std::vector> word_topic_delta_; + + // ================ Precompute LLH Parameters ================= + // Log of normalization constant (per docoument) from eq.[3]. + double log_doc_normalizer_; + + // Log of normalization constant (per topic) from eq.[2]. 
+ double log_topic_normalizer_; + lda::light_hash_map doc_topic_counter_; + }; + + inline int32_t LightDocSampler::rand_k() + { + return rng_.rand_k(K_); + } + inline wood::xorshift_rng& LightDocSampler::rng() + { + return rng_; + } + inline lda::hybrid_map& LightDocSampler::get_word_row(int32_t word) + { + return word_topic_table_[word]; + } + inline std::vector& LightDocSampler::get_summary_row() + { + return summary_row_; + } + inline std::vector& LightDocSampler::get_word_topic_delta(int32_t thread_id) + { + return word_topic_delta_[thread_id]; + } + inline std::vector& LightDocSampler::get_delta_summary_row() + { + return delta_summary_row_; + } + inline int32_t LightDocSampler::get_word_topic(int32_t word, int32_t topic) + { + return word_topic_table_[word][topic]; + } + inline void LightDocSampler::word_topic_dec(int32_t word, int32_t topic) + { + word_topic_table_[word].inc(topic, -1); + } + inline void LightDocSampler::word_topic_inc(int32_t word, int32_t topic) + { + word_topic_table_[word].inc(topic, 1); + } + inline void LightDocSampler::GenerateAliasTableforWord(int32_t word) + { + alias_k_v_[word].build_table(alias_rng_, word_topic_table_[word], summary_row_, q_w_proportion_, beta_, beta_sum_, word, rng_); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_hash_map.cpp b/src/Native/LdaNative/light_hash_map.cpp new file mode 100644 index 0000000000..ae070c5e1c --- /dev/null +++ b/src/Native/LdaNative/light_hash_map.cpp @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include "light_hash_map.h" + +namespace lda +{ + light_hash_map::light_hash_map(int32_t *mem_block, int32_t capacity) : + own_memory_(false), + capacity_(capacity), + mem_block_(mem_block), + empty_key_(0), + deleted_key_(-2) + { + key_ = mem_block_; + value_ = mem_block_ + capacity_; + clear(); + } + + light_hash_map::light_hash_map(int32_t capacity) : + own_memory_(true), + capacity_(capacity), + empty_key_(0), + deleted_key_(-2) + { + mem_block_ = new int32_t[capacity_ * 2]; + key_ = mem_block_; + value_ = mem_block_ + capacity_; + clear(); + } + + // must call set_memory after construction before use + light_hash_map::light_hash_map() : + capacity_(1024), + own_memory_(false), + empty_key_(0), + deleted_key_(-2), + mem_block_(nullptr), + key_(nullptr), + value_(nullptr) + { + } + + light_hash_map::~light_hash_map() + { + capacity_ = 0; + if (own_memory_ && mem_block_ != nullptr) + { + delete[]mem_block_; + } + + mem_block_ = nullptr; + key_ = nullptr; + value_ = nullptr; + } + + void light_hash_map::clear() + { + memset(mem_block_, 0, capacity_ * 2 * sizeof(int32_t)); + } + + void light_hash_map::sort() + { + //key is probablly empty in key_, sort by value_ + //this is just for the output process like getting the topic of document or a topic of term + } + + void light_hash_map::set_memory(int32_t *mem_block) + { + mem_block_ = mem_block; + key_ = mem_block_; + value_ = mem_block_ + capacity_; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_hash_map.h b/src/Native/LdaNative/light_hash_map.h new file mode 100644 index 0000000000..6e07c4ce58 --- /dev/null +++ b/src/Native/LdaNative/light_hash_map.h @@ -0,0 +1,189 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include + +/* +A light-weight hash table, borrowing the idea from google::dense_hash_map +0, pair must be +1, It can or can not own memory, +2, It has a fixed capacity, needless to resize or shrink, +3, capacity_ should at lease be twice of the maximum number of inserted items, guaranteeing a low load factor, +4, capacity_ should be an integer power of 2 +5, emptry_key_ is fixed to 0 +6, deleted_key_ is fixed to -2 +*/ + +namespace lda +{ +// The probing method: +// Linear probing +// #define JUMP_(key, num_probes) ( 1 ) + +// Quadratic probing +#define JUMP_(key, num_probes) ( num_probes ) + +#define ILLEGAL_BUCKET -1 + + class light_hash_map + { + public: + + // must call set_memory after construction before use + light_hash_map(); + // NOTE: the size of mem_block_ = 2 * capacity_ + light_hash_map(int32_t *mem_block, int32_t capacity); + light_hash_map(int32_t capacity); + + ~light_hash_map(); + + void clear(); + void set_memory(int32_t *mem_block); + void sort(); + + inline int32_t capacity() const; + inline int32_t size() const; + inline int32_t* key() const; + inline int32_t* value() const; + // whether we can find the |key| in this hash table + inline bool has(int32_t key) const; + + // if |key| is already in table, increase its coresponding |value| with |delta| + // if not, insert |key| into the table and set |delta| as the |value| of |key| + inline void inc(int32_t key, int32_t delta); + + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t operator[](int32_t key); + + private: + + light_hash_map(const light_hash_map &other) = delete; + light_hash_map& operator=(const light_hash_map &other) = delete; + + // Returns a pair of positions: 1st where the object is, 2nd where + // it would go if you wanted to insert it. 1st is ILLEGAL_BUCKET + // if object is not found; 2nd is ILLEGAL_BUCKET if it is. 
+ // NOTE: because of deletions where-to-insert is not trivial: it's the + // first deleted bucket we see, as long as we don't find the key later + inline std::pair find_position(const int32_t key) const; + + bool own_memory_; + int32_t capacity_; + int32_t *mem_block_; + int32_t *key_; + int32_t *value_; + + int32_t empty_key_; + int32_t deleted_key_; + }; + + inline int32_t light_hash_map::capacity() const + { + return capacity_; + } + inline int32_t light_hash_map::size() const + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + ++size; + } + } + return size; + } + + inline int32_t* light_hash_map::key() const + { + return key_; + } + inline int32_t* light_hash_map::value() const + { + return value_; + } + + inline bool light_hash_map::has(int32_t key) const + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + return pos.first != ILLEGAL_BUCKET; + } + + inline void light_hash_map::inc(int32_t key, int32_t delta) + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + value_[pos.first] += delta; + if (value_[pos.first] == 0) // the value becomes zero, delete the key + { + key_[pos.first] = deleted_key_; + } + } + else // not found the key, insert it with delta as value + { + key_[pos.second] = internal_key; + value_[pos.second] = delta; + } + } + + inline int32_t light_hash_map::operator[](int32_t key) + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + return value_[pos.first]; + } + else + { + return 0; + } + } + + inline std::pair light_hash_map::find_position(const int32_t key) const + { + int num_probes = 0; + int32_t capacity_minus_one = capacity_ - 1; + int32_t idx = key % capacity_; + int32_t insert_pos = ILLEGAL_BUCKET; + while (1) // probe until something happens + { + if (key_[idx] == empty_key_) // bucket is empty + { + if (insert_pos == ILLEGAL_BUCKET) // found no prior place to insert + { + return std::pair(ILLEGAL_BUCKET, idx); + } + else // previously, there is a position to insert + { + return std::pair(ILLEGAL_BUCKET, insert_pos); + } + } + else if (key_[idx] == deleted_key_) // keep searching, but makr to insert + { + if (insert_pos == ILLEGAL_BUCKET) + { + insert_pos = idx; + } + } + else if (key_[idx] == key) + { + return std::pair(idx, ILLEGAL_BUCKET); + } + ++num_probes; // we are doing another probe + idx = (idx + JUMP_(key, num_probes) & capacity_minus_one); + assert(num_probes < capacity_ + && "Hashtable is full: an error in key_equal<> or hash<>"); + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/model_block.cpp b/src/Native/LdaNative/model_block.cpp new file mode 100644 index 0000000000..ec15834aca --- /dev/null +++ b/src/Native/LdaNative/model_block.cpp @@ -0,0 +1,463 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ + +#include +#include +#include +#include "utils.hpp" +#include +#include "model_block.h" +#include "lda_document.h" + +namespace lda +{ + int64_t upper_bound(int64_t x) + { + if (x == 0) + { + return 0; + } + int64_t shift = 0; + int64_t y = 1; + x--; + while (x) + { + x = x >> 1; + y = y << 1; + ++shift; + } + return y; + } + + int32_t align64(int32_t size) + { + if (size % 64 == 0) + { + return size; + } + else + { + size = 64 * (size / 64) + 64; + return size; + } + } + + + LDAModelBlock::LDAModelBlock() + : dict_(nullptr), + num_vocabs_(0), + mem_block_size_(0), + mem_block_(nullptr), + alias_mem_block_size_(0), + alias_mem_block_(nullptr) + { + } + LDAModelBlock::~LDAModelBlock() + { + Clear(); + } + + void LDAModelBlock::Clear() + { + if (dict_) + { + delete[]dict_; + dict_ = nullptr; + } + if (mem_block_) + { + delete[]mem_block_; + mem_block_ = nullptr; + } + if (alias_mem_block_) + { + delete[]alias_mem_block_; + alias_mem_block_ = nullptr; + } + + num_vocabs_ = -1; + num_topics_ = -1; + + mem_block_size_ = 0; + alias_mem_block_size_ = 0; + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics, int64_t nonzero_num) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive. Supressing it similar to the existing one on Line 140 below. +#pragma warning(suppress: 6386) + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + + mem_block_size_ = 2 * upper_bound(load_factor_ * nonzero_num); + alias_mem_block_size_ = nonzero_num * 3; + + mem_block_ = new int32_t[mem_block_size_](); // NOTE: force to initialize the values to be zero + alias_mem_block_ = new int32_t[alias_mem_block_size_](); // NOTE: force to initialize the values to be zero + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics, int64_t mem_block_size, int64_t alias_mem_block_size) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive. Supressing it similar to the existing one on Line 140 below. +#pragma warning(suppress: 6386) + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + + mem_block_size_ = mem_block_size; + mem_block_ = new int32_t[mem_block_size_](); // NOTE : force to initialize the values to be zero + + alias_mem_block_size_ = alias_mem_block_size; + alias_mem_block_ = new int32_t[alias_mem_block_size_](); //NOTE: force to initialize the values to be zero + + std::cout << "mem_block_size = " << mem_block_size_ * 4 << std::endl; + std::cout << "alias_mem_block_size = " << alias_mem_block_size_ * 4 << std::endl; + + offset_ = 0; + alias_offset_ = 0; + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive caused by an old bug in PREfast. It is fixed in VS 2015. 
+#pragma warning(suppress: 6386) + dict_[i].tf = 0; + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + } + + void LDAModelBlock::SetWordInfo(int word_id, int32_t nonzero_num, bool fullSparse) + { + dict_[word_id].word_id_ = word_id; + dict_[word_id].tf = nonzero_num; + + int32_t hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + hot_thresh = num_topics_ / (2 * load_factor_); //hybrid + } + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t capacity = 0; + int32_t row_size = 0; + int32_t alias_capacity = 0; + int32_t alias_row_size = 0; + + if (dict_[word_id].tf >= hot_thresh) + { + dict_[word_id].is_dense_ = 1; + capacity = num_topics_; + row_size = capacity; + } + else if (dict_[word_id].tf > 0) + { + dict_[word_id].is_dense_ = 0; + int capacity_lower_bound = load_factor_ * dict_[word_id].tf; + capacity = (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + { + dict_[word_id].is_dense_ = 1; + row_size = 0; + capacity = 0; + } + + dict_[word_id].offset_ = offset_; + dict_[word_id].end_offset_ = offset_ + row_size; + dict_[word_id].capacity_ = capacity; + + offset_ += row_size; + + if (dict_[word_id].tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + dict_[word_id].is_alias_dense_ = 1; + } + else if (dict_[word_id].tf > 0) + { + alias_capacity = dict_[word_id].tf; + alias_row_size = 3 * dict_[word_id].tf; + dict_[word_id].is_alias_dense_ = 0; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + dict_[word_id].is_alias_dense_ = 1; + } + dict_[word_id].alias_capacity_ = alias_capacity; + dict_[word_id].alias_offset_ = alias_offset_; + dict_[word_id].alias_end_offset_ = alias_offset_ + alias_row_size; + + alias_offset_ += alias_row_size; + } + + // NOTE: sometimes, we use totally sparse representation (in testing phase), fullSparse == true + // in other times, we use hybrid structure (in training phase), fullSparse == false + void LDAModelBlock::InitModelBlockByTFS(bool fullSparse) + { + const int32_t max_tf_thresh = std::numeric_limits::max(); + int32_t hot_thresh; + if (fullSparse) + { + // totally sparse + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + // hybrid + hot_thresh = num_topics_ / (2 * load_factor_); + } + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t word_id; + int32_t capacity = 0; + int32_t row_size = 0; + int32_t alias_capacity = 0; + int32_t alias_row_size = 0; + + int64_t offset = 0; + int64_t alias_offset = 0; + + for (word_id = 0; word_id < num_vocabs_; ++word_id) + { + int32_t tf = dict_[word_id].tf; + + dict_[word_id].word_id_ = word_id; + dict_[word_id].tf = tf; + + if (tf >= hot_thresh) + { + dict_[word_id].is_dense_ = 1; + capacity = num_topics_; + row_size = capacity; + } + else if (tf > 0) + { + dict_[word_id].is_dense_ = 0; + int capacity_lower_bound = load_factor_ * tf; + capacity 
= (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + { + dict_[word_id].is_dense_ = 1; + capacity = 0; + row_size = 0; + } + + dict_[word_id].offset_ = offset; + dict_[word_id].end_offset_ = offset + row_size; + dict_[word_id].capacity_ = capacity; + + offset += row_size; + + if (tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + dict_[word_id].is_alias_dense_ = 1; + } + else if (tf > 0) + { + alias_capacity = tf; + alias_row_size = 3 * tf; + dict_[word_id].is_alias_dense_ = 0; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + dict_[word_id].is_alias_dense_ = 1; + } + dict_[word_id].alias_capacity_ = alias_capacity; + dict_[word_id].alias_offset_ = alias_offset; + dict_[word_id].alias_end_offset_ = alias_offset + alias_row_size; + alias_offset += alias_row_size; + } + + mem_block_size_ = dict_[num_vocabs_ - 1].end_offset_; + mem_block_ = new int32_t[mem_block_size_](); // NOTE: force to initialize the values to be zero + + alias_mem_block_size_ = dict_[num_vocabs_ - 1].alias_end_offset_; + alias_mem_block_ = new int32_t[alias_mem_block_size_](); //NOTE: force to initialize the values to be zero + + std::cout << "mem_block_size = " << mem_block_size_ * 4 << std::endl; + std::cout << "alias_mem_block_size = " << alias_mem_block_size_ * 4 << std::endl; + } + + void LDAModelBlock::InitFromDataBlock(const LDADataBlock *data_block, int32_t num_vocabs, int32_t num_topics) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + int32_t doc_num = data_block->num_documents(); + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + dict_[i].tf = 0; + } + + for (int i = 0; i < doc_num; ++i) + { + std::shared_ptr doc = data_block->GetOneDoc(i); + int32_t doc_size = doc->size(); + for (int j = 0; j < doc_size; ++j) + { + int32_t w = doc->Word(j); + dict_[w].tf++; + } + } + + InitModelBlockByTFS(false); + } + // Count the number of nonzero values in each row + void LDAModelBlock::CountNonZero(std::vector &tfs) + { + for (int i = 0; i < num_vocabs_; ++i) + { + hybrid_map row(mem_block_ + dict_[i].offset_, + dict_[i].is_dense_, + dict_[i].capacity_, + 0, + nullptr); + tfs[i] = row.nonzero_num(); + } + } + + void LDAModelBlock::GetModelSizeByTFS(bool fullSparse, std::vector &tfs, int64_t &mem_block_size, int64_t &alias_mem_block_size) + { + const int32_t max_tf_thresh = std::numeric_limits::max(); + int32_t hot_thresh; + if (fullSparse) + { + // totally sparse + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + // hybrid + hot_thresh = num_topics_ / (2 * load_factor_); + } + // hot_thresh = 0; // totally dense + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t word_id; + int32_t capacity = 0; + int32_t alias_capacity = 0; + int32_t row_size = 0; + int32_t alias_row_size = 0; + + mem_block_size = 0; + alias_mem_block_size = 0; + + for (word_id = 0; word_id < num_vocabs_; ++word_id) + { + int32_t tf = tfs[word_id]; + + if (tf >= hot_thresh) + { + capacity = num_topics_; + row_size = capacity; + } + else if (tf > 0) + { + int capacity_lower_bound = load_factor_ * tf; + capacity = (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + 
{ + capacity = 0; + row_size = 0; + } + mem_block_size += row_size; + + if (tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + } + else if (tf > 0) + { + alias_capacity = tf; + alias_row_size = 3 * tf; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + } + alias_mem_block_size += alias_row_size; + } + } + + // NOTE: we can re-use the dict_ variable here, but we deliberately not use it. + // This function should not change the internal state of model_block_ + void LDAModelBlock::GetModelStat(int64_t &mem_block_size, int64_t &alias_mem_block_size) + { + std::vector tfs(num_vocabs_, 0); + CountNonZero(tfs); + + // calculate the mem_block_size, alias_mem_block_size + GetModelSizeByTFS(true, tfs, mem_block_size, alias_mem_block_size); + } +} diff --git a/src/Native/LdaNative/model_block.h b/src/Native/LdaNative/model_block.h new file mode 100644 index 0000000000..2160be1d7f --- /dev/null +++ b/src/Native/LdaNative/model_block.h @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include "data_block.h" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" + +namespace lda +{ + struct WordEntry + { + int32_t word_id_; + int64_t offset_; + int64_t end_offset_; + int32_t capacity_; + int32_t is_dense_; + + int32_t tf; + int64_t alias_offset_; + int64_t alias_end_offset_; + int32_t alias_capacity_; + int32_t is_alias_dense_; + }; + + class LDAModelBlock + { + public: + LDAModelBlock(); + ~LDAModelBlock(); + + inline hybrid_map get_row(int word_id, int32_t *external_buf); + inline hybrid_alias_map get_alias_row(int word_id); + void SetWordInfo(int word_id, int32_t nonzero_num, bool fullSparse); + + void Clear(); + void Init(int32_t num_vocabs, int32_t num_topics); + void Init(int32_t num_vocabs, int32_t num_topics, int64_t nonzero_num); + void Init(int32_t num_vocabs, int32_t num_topics, int64_t mem_block_size, int64_t alias_mem_block_size); + + void InitFromDataBlock(const LDADataBlock *data_block, int32_t num_vocabs, int32_t num_topics); + + void GetModelStat(int64_t &mem_block_size, int64_t &alias_mem_block_size); + + private: + + LDAModelBlock(const LDAModelBlock &other) = delete; + LDAModelBlock& operator=(const LDAModelBlock &other) = delete; + + void CountNonZero(std::vector &tfs); + void InitModelBlockByTFS(bool fullSparse); + void GetModelSizeByTFS(bool fullSparse, std::vector &tfs, int64_t &mem_block_size, int64_t &alias_mem_block_size); + + int32_t num_vocabs_; + int32_t num_topics_; + WordEntry *dict_; + int32_t *mem_block_; + int64_t mem_block_size_; + + int32_t *alias_mem_block_; + int64_t alias_mem_block_size_; + + int64_t offset_; + int64_t alias_offset_; + + const int32_t load_factor_ = 2; + const int32_t sparse_factor_ = 5; + }; + inline hybrid_map LDAModelBlock::get_row(int word_id, int32_t *external_buf) + { + hybrid_map row(mem_block_ + dict_[word_id].offset_, + dict_[word_id].is_dense_, + dict_[word_id].capacity_, + 0, + external_buf); + return row; + } + inline hybrid_alias_map LDAModelBlock::get_alias_row(int word_id) + { + hybrid_alias_map row(alias_mem_block_ + dict_[word_id].alias_offset_, + dict_[word_id].is_alias_dense_, + dict_[word_id].alias_capacity_); + return row; + } + +} \ No newline at end of file diff --git a/src/Native/LdaNative/rand_int_rng.h 
b/src/Native/LdaNative/rand_int_rng.h new file mode 100644 index 0000000000..c51943e11f --- /dev/null +++ b/src/Native/LdaNative/rand_int_rng.h @@ -0,0 +1,45 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include + +namespace wood +{ + class xorshift_rng + { + public: + xorshift_rng() + { + jxr = 1234567; + } + ~xorshift_rng() {} + + inline void restart() + { + jxr = 1234567; + } + + inline int32_t rand() + { + jxr ^= (jxr << 13); jxr ^= (jxr >> 17); jxr ^= (jxr << 5); //get random (xorshift) 32-bit integer + return jxr & 0x7fffffff; + } + inline int32_t rand_k(int K) + { + return (int32_t)(rand() * 4.6566125e-10 * K); + } + inline float rand_real() + { + return (float)(rand() * 4.6566125e-10); + } + private: + + xorshift_rng(const xorshift_rng &other) = delete; + xorshift_rng& operator=(const xorshift_rng &other) = delete; + + unsigned int jxr; + }; +} \ No newline at end of file diff --git a/src/Native/LdaNative/simple_barrier.h b/src/Native/LdaNative/simple_barrier.h new file mode 100644 index 0000000000..55f8d601b9 --- /dev/null +++ b/src/Native/LdaNative/simple_barrier.h @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#ifndef _SIMPLE_BARRIER_H_ +#define _SIMPLE_BARRIER_H_ + +#include +#include +#include +namespace lda +{ + class SimpleBarrier + { + public: + SimpleBarrier(unsigned int n) :barrier_size_(n), num_of_waiting_(0), rounds_(0) + {}; + + void reset() + { + throw "not implemented yet."; + } + + bool wait() + { + std::unique_lock lock(mutex_); + if (num_of_waiting_.fetch_add(1) >= barrier_size_ - 1) + { + cond_.notify_all(); + num_of_waiting_.store(0); + rounds_.fetch_add(1); + return true; + } + else + { + + unsigned int i = rounds_.load(); + cond_.wait(lock, [&]{return i != rounds_.load(); }); + return false; + } + } + + ~SimpleBarrier() + { + num_of_waiting_ = 0; + rounds_ = 0; + } + + + + protected: + const unsigned int barrier_size_; + + std::atomic num_of_waiting_; + std::atomic rounds_; + std::condition_variable cond_; + std::mutex mutex_; + }; +} + + + + + +#endif // _SIMPLE_BARRIER_H_ + diff --git a/src/Native/LdaNative/timer.h b/src/Native/LdaNative/timer.h new file mode 100644 index 0000000000..ac9aff94b2 --- /dev/null +++ b/src/Native/LdaNative/timer.h @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#pragma once +#include +using namespace std::chrono; +class CTimer +{ + +private: + + steady_clock::time_point startPerfomanceCount; + steady_clock::time_point endPerfomanceCount; + duration totalElapsed; +public: + char m_szMessage[1024]; + +public: + CTimer() + { + Initialize(); + } + + CTimer(bool bStartOnCreate) + { + Initialize(); + + if (bStartOnCreate) + { + Start(); + } + } + + void Initialize() + { + totalElapsed = duration(); + } + + void Start() + { + startPerfomanceCount = std::chrono::steady_clock::now(); + } + + // time unit: seconds + void Tag(const char* pszMsg = NULL) + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + OutputStatistics(pszMsg); + //start next round + Start(); + } + + // time unit: seconds + void InnerTag() + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + + OutputStatistics(m_szMessage); + + //start next round + Start(); + } + + float GetTotalElaps() + { + return totalElapsed.count(); + } + float GetTimeSpan() + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + float timespent = totalElapsed.count(); + + //start next round + Start(); + + return timespent; + } + + float GetTaggedTimeSpan() + { + return duration_cast> (endPerfomanceCount - startPerfomanceCount).count(); + } + + void OutputStatistics(const char* pszMsg = NULL) + { + printf("Time Cost totally: %f, last time span(%s): %f seconds.\n", GetTotalElaps(), pszMsg, GetTaggedTimeSpan()); + } + +private: + CTimer(const CTimer& obj); +}; \ No newline at end of file diff --git a/src/Native/LdaNative/type_common.h b/src/Native/LdaNative/type_common.h new file mode 100644 index 0000000000..a7043b28b1 --- /dev/null +++ b/src/Native/LdaNative/type_common.h @@ -0,0 +1,9 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +namespace lda { + typedef float real_t; +} \ No newline at end of file diff --git a/src/Native/LdaNative/utils.cpp b/src/Native/LdaNative/utils.cpp new file mode 100644 index 0000000000..c1c5cee076 --- /dev/null +++ b/src/Native/LdaNative/utils.cpp @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#include "utils.hpp" + +#include "math.h" +#include + +namespace { + const double cof[6] = { 76.18009172947146, -86.50532032941677, + 24.01409824083091, -1.231739572450155, + 0.1208650973866179e-2, -0.5395239384953e-5 + }; +} + +namespace lda { + + double LogGamma(double xx) + { + int j; + double x, y, tmp1, ser; + y = xx; + x = xx; + tmp1 = x + 5.5; + tmp1 -= (x + 0.5)*log(tmp1); + ser = 1.000000000190015; + for (j = 0; j < 6; j++) ser += cof[j] / ++y; + return -tmp1 + log(2.5066282746310005*ser / x); + } + + + double get_time() { + auto start = std::chrono::high_resolution_clock::now(); + auto since_epoch = start.time_since_epoch(); + return std::chrono::duration_cast>>(since_epoch).count(); + } + + void CBlockedIntQueue::clear() + { + std::lock_guard lock(_mutex); + _queue.clear(); + } + + int CBlockedIntQueue::pop() + { + std::unique_lock lock(_mutex); + _condition.wait(lock, [this] { return !_queue.empty(); }); + auto val = _queue.front(); + _queue.pop_front(); + return val; + } + + void CBlockedIntQueue::push(int value) + { + { + std::lock_guard lock(_mutex); + _queue.push_back(value); + } + _condition.notify_one(); + } +} diff --git a/src/Native/LdaNative/utils.hpp b/src/Native/LdaNative/utils.hpp new file mode 100644 index 0000000000..7b71ec67da --- /dev/null +++ b/src/Native/LdaNative/utils.hpp @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#define NOMINMAX + +#include +#include +#include +#include +#include + + +namespace lda { + + double LogGamma(double xx); + double get_time(); + + struct LDAEngineAtomics + { + LDAEngineAtomics() :doc_ll_(0), word_ll_(0), num_tokens_clock_(0), thread_counter_(0){} + ~LDAEngineAtomics() {} + + std::atomic doc_ll_; + std::atomic word_ll_; + + // # of tokens processed in a Clock() call. + std::atomic num_tokens_clock_; + std::atomic thread_counter_; + + std::mutex global_mutex_; + }; + + class CBlockedIntQueue + { + public: + void clear(); + int pop(); + void push(int value); + + private: + std::mutex _mutex; + std::condition_variable _condition; + std::deque _queue; + }; + + +} diff --git a/src/Native/build.proj b/src/Native/build.proj index 5074517fda..c091a78c43 100644 --- a/src/Native/build.proj +++ b/src/Native/build.proj @@ -74,6 +74,9 @@ + + diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index a6d1f50668..a5b66052f7 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -76,6 +76,7 @@ Transforms.KeyToTextConverter KeyToValueTransform utilizes KeyValues metadata to Transforms.LabelColumnKeyBooleanConverter Transforms the label to either key or bool (if needed) to make it suitable for classification. Microsoft.ML.Runtime.EntryPoints.FeatureCombiner PrepareClassificationLabel Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+ClassificationLabelInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelIndicator Label remapper used by OVA Microsoft.ML.Runtime.Data.LabelIndicatorTransform LabelIndicator Microsoft.ML.Runtime.Data.LabelIndicatorTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelToFloatConverter Transforms the label to float to make it suitable for regression. 
Microsoft.ML.Runtime.EntryPoints.FeatureCombiner PrepareRegressionLabel Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+RegressionLabelInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput +Transforms.LightLda The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. Microsoft.ML.Runtime.Transforms.TextAnalytics LightLda Microsoft.ML.Runtime.TextAnalytics.LdaTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LogMeanVarianceNormalizer Normalizes the data based on the computed mean and variance of the logarithm of the data. Microsoft.ML.Runtime.Data.Normalize LogMeanVar Microsoft.ML.Runtime.Data.NormalizeTransform+LogMeanVarArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LpNormalizer Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. Microsoft.ML.Runtime.Data.LpNormalization Normalize Microsoft.ML.Runtime.Data.LpNormNormalizerTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.ManyHeterogeneousModelCombiner Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel. Microsoft.ML.Runtime.EntryPoints.ModelOperations CombineModels Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index a5cb656da9..b15d04c860 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -14831,6 +14831,388 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.LightLda", + "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.", + "FriendlyName": "Latent Dirichlet Allocation Transform", + "ShortName": "LightLda", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NumTopic", + "Type": "Int", + "Desc": "The number of topics in the LDA", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", + "Aliases": [ + "llInterval" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": 
true, + "Default": null + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of training threads", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", + "Aliases": [ + "maxNumToken" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": 10 + }, + { + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", + "Aliases": [ + "reset" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:srcs)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 49.0, + "IsNullable": false + }, + { + "Name": "NumTopic", + "Type": "Int", + "Desc": "The number of topics in the LDA", + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 40, + 100, + 200 + ] + } + }, + { + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", + "Aliases": [ + "maxNumToken" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 512 + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of training threads. 
Default value depends on number of logical processors.", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 100.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 100, + 200 + ] + } + }, + { + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.015, + 0.07, + 0.02 + ] + } + }, + { + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 2, + 4, + 8, + 16 + ] + } + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 200, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 100, + 200, + 300, + 400 + ] + } + }, + { + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", + "Aliases": [ + "llInterval" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5 + }, + { + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10 + }, + { + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 10, + 20, + 30, + 40 + ] + } + }, + { + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", + "Aliases": [ + "reset" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "OutputTopicWordSummary", + "Type": "Bool", + "Desc": "Whether to output the topic-word summary in text format", + "Aliases": [ + "summary" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.LogMeanVarianceNormalizer", "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", diff --git a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj index 9f38858721..5a66e5bbcd 100644 --- a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj +++ b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj @@ -20,6 +20,7 @@ + \ No newline at end of file diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 60e79a943d..b76c4ba24c 100644 --- 
a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
+++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -1116,6 +1116,39 @@ public void EntryPointPcaTransform()
             });
         }
 
+        [Fact]
+        public void EntryPointLightLdaTransform()
+        {
+            string dataFile = DeleteOutputPath("SavePipe", "SavePipeTextLightLda-SampleText.txt");
+            File.WriteAllLines(dataFile, new[] {
+                "The quick brown fox jumps over the lazy dog.",
+                "The five boxing wizards jump quickly."
+            });
+
+            TestEntryPointPipelineRoutine(dataFile, "sep={ } col=T:TX:0-**",
+                new[]
+                {
+                    "Transforms.TextFeaturizer",
+                    "Transforms.LightLda"
+                },
+                new[]
+                {
+                    @"'Column': {
+                        'Name': 'T',
+                        'Source': [
+                            'T'
+                        ]
+
+                    },
+                    'VectorNormalizer': 'None'",
+                    @"'Column': [
+                        {
+                            'Name': 'T',
+                            'Source': 'T'
+                        }]"
+                });
+        }
+
         [Fact]
         public void EntryPointAveragePerceptron()
         {
diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
new file mode 100644
index 0000000000..c598879795
--- /dev/null
+++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
@@ -0,0 +1,148 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Float = System.Single;
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Microsoft.ML.Runtime.CommandLine;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Data.IO;
+using Microsoft.ML.Runtime.Internal.Utilities;
+using Microsoft.ML.Runtime.Model;
+using Microsoft.ML.Runtime.TextAnalytics;
+using Xunit;
+
+namespace Microsoft.ML.Runtime.RunTests
+{
+    /// <summary>
+    /// A class for non-baseline data pipe tests.
+    /// </summary>
+    public sealed partial class TestDataPipeNoBaseline : TestDataViewBase
+    {
+        [Fact]
+        public void TestLDATransform()
+        {
+            var builder = new ArrayDataViewBuilder(Env);
+            var data = new[]
+            {
+                new[] { (Float)1.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)1.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)1.0 },
+            };
+
+            builder.AddColumn("F1V", NumberType.Float, data);
+
+            var srcView = builder.GetDataView();
+
+            LdaTransform.Column col = new LdaTransform.Column();
+            col.Source = "F1V";
+            col.NumTopic = 20;
+            col.NumTopic = 3;
+            col.NumSummaryTermPerTopic = 3;
+            col.AlphaSum = 3;
+            col.NumThreads = 1;
+            col.ResetRandomGenerator = true;
+            LdaTransform.Arguments args = new LdaTransform.Arguments();
+            args.Column = new LdaTransform.Column[] { col };
+
+            LdaTransform ldaTransform = new LdaTransform(Env, args, srcView);
+
+            using (var cursor = ldaTransform.GetRowCursor(c => true))
+            {
+                var resultGetter = cursor.GetGetter<VBuffer<Float>>(1);
+                VBuffer<Float> resultFirstRow = new VBuffer<Float>();
+                VBuffer<Float> resultSecondRow = new VBuffer<Float>();
+                VBuffer<Float> resultThirdRow = new VBuffer<Float>();
+
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultFirstRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultSecondRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultThirdRow);
+                Assert.False(cursor.MoveNext());
+
+                Assert.True(resultFirstRow.Length == 3);
+                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultSecondRow.Length == 3);
+                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultThirdRow.Length == 3);
+                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
+            }
+
+            using (var cursor = ldaTransform.GetRowCursor(c => true))
+            {
+                var resultGetter = cursor.GetGetter<VBuffer<Float>>(1);
+                VBuffer<Float> resultFirstRow = new VBuffer<Float>();
+                VBuffer<Float> resultSecondRow = new VBuffer<Float>();
+                VBuffer<Float> resultThirdRow = new VBuffer<Float>();
+
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultFirstRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultSecondRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultThirdRow);
+                Assert.False(cursor.MoveNext());
+
+                Assert.True(resultFirstRow.Length == 3);
+                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultSecondRow.Length == 3);
+                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultThirdRow.Length == 3);
+                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
+            }
+        }
+
+        [Fact]
+        public void TestLdaTransformEmptyDocumentException()
+        {
+            var builder = new ArrayDataViewBuilder(Env);
+            var data = new[]
+            {
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+            };
+
+            builder.AddColumn("Zeros", NumberType.Float, data);
+
+            var srcView = builder.GetDataView();
+            var col = new LdaTransform.Column()
+            {
+                Source = "Zeros"
+            };
+            var args = new LdaTransform.Arguments()
+            {
+                Column = new[] { col }
+            };
+
+            try
+            {
+                var lda = new LdaTransform(Env, args, srcView);
+            }
+            catch (InvalidOperationException ex)
+            {
+                Assert.Equal(ex.Message, string.Format("The specified documents are all empty in column '{0}'.", col.Source));
+                return;
+            }
+
+            Assert.True(false, "The LDA transform does not throw expected error on empty documents.");
+        }
+    }
+}
diff --git a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
index d9cf8a2f29..6b8c67b6ff 100644
--- a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
+++ b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
@@ -13,4 +13,9 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
index c4ccb76ae9..d9b1e9e18b 100644
--- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
+++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
@@ -18,5 +18,6 @@
+
\ No newline at end of file
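
For reference, the column-level surface that the tests above drive through the test harness can also be wired up directly. The snippet below is a minimal sketch, not part of this change: it assumes an existing IHostEnvironment (env) and an IDataView (docs) that already carries a numeric bag-of-words vector column; the column names "Tokens" and "Topics" are hypothetical, while every member it touches (Column.Source, Column.Name, NumTopic, AlphaSum, NumThreads, ResetRandomGenerator, Arguments.Column, and the LdaTransform constructor) appears in the hunks or manifest above.

    // Sketch only: configure one LDA column the way TestLDATransform does,
    // assuming `env` (IHostEnvironment) and `docs` (IDataView) already exist.
    var col = new LdaTransform.Column()
    {
        Source = "Tokens",            // input bag-of-words column (hypothetical name)
        Name = "Topics",              // output topic-distribution column (hypothetical name)
        NumTopic = 3,                 // same settings the TestLDATransform test uses
        AlphaSum = 3,
        NumThreads = 1,
        ResetRandomGenerator = true   // re-seed per document, as in the tests
    };
    var args = new LdaTransform.Arguments() { Column = new[] { col } };

    // The resulting transform can be cursored like any data view (the tests call
    // GetRowCursor on it); each row of "Topics" is a VBuffer<float> of length NumTopic.
    var transformed = new LdaTransform(env, args, docs);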