diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs index 3ce5102fa1..d6de490d52 100644 --- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs +++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs @@ -92,5 +92,20 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, C OutputData = view }; } + + [TlcModule.EntryPoint(Name = "Transforms.LightLda", Desc = LdaTransform.Summary, UserName = LdaTransform.UserName, ShortName = LdaTransform.ShortName)] + public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTransform.Arguments input) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(input, nameof(input)); + + var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LightLda", input); + var view = new LdaTransform(h, input, input.Data); + return new CommonOutputs.TransformOutput() + { + Model = new TransformModel(h, view, input.Data), + OutputData = view + }; + } } } diff --git a/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs new file mode 100644 index 0000000000..36e55099d9 --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs @@ -0,0 +1,357 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; +using System.Security; + +namespace Microsoft.ML.Runtime.TextAnalytics +{ + + internal static class LdaInterface + { + public struct LdaEngine + { + public IntPtr Ptr; + } + + private const string NativeDll = "LdaNative"; + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern LdaEngine CreateEngine(int numTopic, int numVocab, float alphaSum, float beta, int numIter, + int likelihoodInterval, int numThread, int mhstep, int maxDocToken); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void AllocateModelMemory(LdaEngine engine, int numTopic, int numVocab, long tableSize, long aliasTableSize); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void AllocateDataMemory(LdaEngine engine, int docNum, long corpusSize); + + [DllImport(NativeDll, CharSet = CharSet.Ansi), SuppressUnmanagedCodeSecurity] + internal static extern void Train(LdaEngine engine, string trainOutput); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetModelStat(LdaEngine engine, out long memBlockSize, out long aliasMemBlockSize); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void Test(LdaEngine engine, int numBurninIter, float[] pLogLikelihood); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void CleanData(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void CleanModel(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void DestroyEngine(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetWordTopic(LdaEngine engine, int wordId, int[] pTopic, int[] pProb, ref int length); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void SetWordTopic(LdaEngine engine, int wordId, 
int[] pTopic, int[] pProb, int length); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void SetAlphaSum(LdaEngine engine, float avgDocLength); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern int FeedInData(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int numVocab); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern int FeedInDataDense(LdaEngine engine, int[] termFreq, int termNum, int numVocab); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetDocTopic(LdaEngine engine, int docId, int[] pTopic, int[] pProb, ref int numTopicReturn); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void GetTopicSummary(LdaEngine engine, int topicId, int[] pWords, float[] pProb, ref int numTopicReturn); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void TestOneDoc(LdaEngine engine, int[] termId, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurnIter, bool reset); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void TestOneDocDense(LdaEngine engine, int[] termFreq, int termNum, int[] pTopics, int[] pProbs, ref int numTopicsMax, int numBurninIter, bool reset); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void InitializeBeforeTrain(LdaEngine engine); + + [DllImport(NativeDll), SuppressUnmanagedCodeSecurity] + internal static extern void InitializeBeforeTest(LdaEngine engine); + } + + internal sealed class LdaSingleBox : IDisposable + { + private LdaInterface.LdaEngine _engine; + private bool _isDisposed; + private int[] _topics; + private int[] _probabilities; + private int[] _summaryTerm; + private float[] _summaryTermProb; + private readonly int _likelihoodInterval; + private readonly float _alpha; + private readonly float _beta; + private readonly int _mhStep; + private readonly int _numThread; + private readonly int _numSummaryTerms; + private readonly bool _denseOutput; + + public readonly int NumTopic; + public readonly int NumVocab; + public LdaSingleBox(int numTopic, int numVocab, float alpha, + float beta, int numIter, int likelihoodInterval, int numThread, + int mhstep, int numSummaryTerms, bool denseOutput, int maxDocToken) + { + NumTopic = numTopic; + NumVocab = numVocab; + _alpha = alpha; + _beta = beta; + _mhStep = mhstep; + _numSummaryTerms = numSummaryTerms; + _denseOutput = denseOutput; + _likelihoodInterval = likelihoodInterval; + _numThread = numThread; + + _topics = new int[numTopic]; + _probabilities = new int[numTopic]; + + _summaryTerm = new int[_numSummaryTerms]; + _summaryTermProb = new float[_numSummaryTerms]; + + _engine = LdaInterface.CreateEngine(numTopic, numVocab, alpha, beta, numIter, likelihoodInterval, numThread, mhstep, maxDocToken); + } + + public void AllocateModelMemory(int numTopic, int numVocab, long tableSize, long aliasTableSize) + { + Contracts.Check(numTopic >= 0); + Contracts.Check(numVocab >= 0); + Contracts.Check(tableSize >= 0); + Contracts.Check(aliasTableSize >= 0); + LdaInterface.AllocateModelMemory(_engine, numVocab, numTopic, tableSize, aliasTableSize); + } + + public void AllocateDataMemory(int docNum, long corpusSize) + { + Contracts.Check(docNum >= 0); + Contracts.Check(corpusSize >= 0); + LdaInterface.AllocateDataMemory(_engine, docNum, corpusSize); + } + + public void Train(string trainOutput) + { + if 
(string.IsNullOrWhiteSpace(trainOutput)) + LdaInterface.Train(_engine, null); + else + LdaInterface.Train(_engine, trainOutput); + } + + public void GetModelStat(out long memBlockSize, out long aliasMemBlockSize) + { + LdaInterface.GetModelStat(_engine, out memBlockSize, out aliasMemBlockSize); + } + + public void Test(int numBurninIter, float[] logLikelihood) + { + Contracts.Check(numBurninIter >= 0); + var pLogLikelihood = new float[numBurninIter]; + LdaInterface.Test(_engine, numBurninIter, pLogLikelihood); + logLikelihood = pLogLikelihood.Select(item => (float)item).ToArray(); + } + + public void CleanData() + { + LdaInterface.CleanData(_engine); + } + + public void CleanModel() + { + LdaInterface.CleanModel(_engine); + } + + public void CopyModel(LdaSingleBox trainer, int wordId) + { + int length = NumTopic; + LdaInterface.GetWordTopic(trainer._engine, wordId, _topics, _probabilities, ref length); + LdaInterface.SetWordTopic(_engine, wordId, _topics, _probabilities, length); + } + + public void SetAlphaSum(float averageDocLength) + { + LdaInterface.SetAlphaSum(_engine, averageDocLength); + } + + public int LoadDoc(int[] termID, double[] termVal, int termNum, int numVocab) + { + Contracts.Check(numVocab == NumVocab); + Contracts.Check(termNum > 0); + Contracts.Check(termID.Length >= termNum); + Contracts.Check(termVal.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + Array.Copy(termID, pID, termNum); + return LdaInterface.FeedInData(_engine, pID, pVal, termNum, NumVocab); + } + + public int LoadDocDense(double[] termVal, int termNum, int numVocab) + { + Contracts.Check(numVocab == NumVocab); + Contracts.Check(termNum > 0); + + Contracts.Check(termVal.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + return LdaInterface.FeedInDataDense(_engine, pVal, termNum, NumVocab); + + } + + public List> GetDocTopicVector(int docID) + { + int numTopicReturn = NumTopic; + LdaInterface.GetDocTopic(_engine, docID, _topics, _probabilities, ref numTopicReturn); + var topicRet = new List>(); + int currentTopic = 0; + for (int i = 0; i < numTopicReturn; i++) + { + if (_denseOutput) + { + while (currentTopic < _topics[i]) + { + //use a value to smooth the count so that we get dense output on each topic + //the smooth value is usually set to 0.1 + topicRet.Add(new KeyValuePair(currentTopic, (float)_alpha)); + currentTopic++; + } + topicRet.Add(new KeyValuePair(_topics[i], _probabilities[i] + (float)_alpha)); + currentTopic++; + } + else + { + topicRet.Add(new KeyValuePair(_topics[i], (float)_probabilities[i])); + } + } + + if (_denseOutput) + { + while (currentTopic < NumTopic) + { + topicRet.Add(new KeyValuePair(currentTopic, (float)_alpha)); + currentTopic++; + } + } + return topicRet; + } + + public List> TestDoc(int[] termID, double[] termVal, int termNum, int numBurninIter, bool reset) + { + Contracts.Check(termNum > 0); + Contracts.Check(termVal.Length >= termNum); + Contracts.Check(termID.Length >= termNum); + + int[] pID = new int[termNum]; + int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pTopic = new int[NumTopic]; + int[] pProb = new int[NumTopic]; + Array.Copy(termID, pID, termNum); + + int numTopicReturn = NumTopic; + + LdaInterface.TestOneDoc(_engine, pID, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset); + + // PREfast suspects that the value of numTopicReturn could be changed in _engine->TestOneDoc, which might result in 
read overrun in the following loop. + if (numTopicReturn > NumTopic) + { + Contracts.Check(false); + numTopicReturn = NumTopic; + } + + var topicRet = new List>(); + for (int i = 0; i < numTopicReturn; i++) + topicRet.Add(new KeyValuePair(pTopic[i], (float)pProb[i])); + return topicRet; + } + + public List> TestDocDense(double[] termVal, int termNum, int numBurninIter, bool reset) + { + Contracts.Check(termNum > 0); + Contracts.Check(numBurninIter > 0); + Contracts.Check(termVal.Length >= termNum); + int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pTopic = new int[NumTopic]; + int[] pProb = new int[NumTopic]; + + int numTopicReturn = NumTopic; + + // There are two versions of TestOneDoc interfaces + // (1) TestOneDoc + // (2) TestOneDocRestart + // The second one is the same as the first one except that it will reset + // the states of the internal random number generator, so that it yields reproducable results for the same input + LdaInterface.TestOneDocDense(_engine, pVal, termNum, pTopic, pProb, ref numTopicReturn, numBurninIter, reset); + + // PREfast suspects that the value of numTopicReturn could be changed in _engine->TestOneDoc, which might result in read overrun in the following loop. + if (numTopicReturn > NumTopic) + { + Contracts.Check(false); + numTopicReturn = NumTopic; + } + + var topicRet = new List>(); + for (int i = 0; i < numTopicReturn; i++) + topicRet.Add(new KeyValuePair(pTopic[i], (float)pProb[i])); + return topicRet; + } + + public void InitializeBeforeTrain() + { + LdaInterface.InitializeBeforeTrain(_engine); + } + + public void InitializeBeforeTest() + { + LdaInterface.InitializeBeforeTest(_engine); + } + + public KeyValuePair[] GetModel(int wordId) + { + int length = NumTopic; + LdaInterface.GetWordTopic(_engine, wordId, _topics, _probabilities, ref length); + var wordTopicVector = new KeyValuePair[length]; + + for (int i = 0; i < length; i++) + wordTopicVector[i] = new KeyValuePair(_topics[i], _probabilities[i]); + return wordTopicVector; + } + + public KeyValuePair[] GetTopicSummary(int topicId) + { + int length = _numSummaryTerms; + LdaInterface.GetTopicSummary(_engine, topicId, _summaryTerm, _summaryTermProb, ref length); + var topicSummary = new KeyValuePair[length]; + + for (int i = 0; i < length; i++) + topicSummary[i] = new KeyValuePair(_summaryTerm[i], _summaryTermProb[i]); + return topicSummary; + } + + public void SetModel(int termID, int[] topicID, int[] topicProb, int topicNum) + { + Contracts.Check(termID >= 0); + Contracts.Check(topicNum <= NumTopic); + Array.Copy(topicID, _topics, topicNum); + Array.Copy(topicProb, _probabilities, topicNum); + LdaInterface.SetWordTopic(_engine, termID, _topics, _probabilities, topicNum); + } + + public void Dispose() + { + if (_isDisposed) + return; + _isDisposed = true; + LdaInterface.DestroyEngine(_engine); + _engine.Ptr = IntPtr.Zero; + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs new file mode 100644 index 0000000000..1267f634cd --- /dev/null +++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs @@ -0,0 +1,962 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Float = System.Single; + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.TextAnalytics; + +[assembly: LoadableClass(typeof(LdaTransform), typeof(LdaTransform.Arguments), typeof(SignatureDataTransform), + LdaTransform.UserName, LdaTransform.LoaderSignature, LdaTransform.ShortName, DocName = "transform/LdaTransform.md")] + +[assembly: LoadableClass(typeof(LdaTransform), null, typeof(SignatureLoadDataTransform), + LdaTransform.UserName, LdaTransform.LoaderSignature)] + +namespace Microsoft.ML.Runtime.TextAnalytics +{ + /// + /// The latent Dirichlet allocation (LDA) transform. + /// http://arxiv.org/abs/1412.1576 + /// + public sealed class LdaTransform : OneToOneTransformBase + { + public sealed class Arguments : TransformInputBase + { + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:srcs)", ShortName = "col", SortOrder = 49)] + public Column[] Column; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of topics in the LDA", SortOrder = 50)] + [TGUI(SuggestedSweeps = "20,40,100,200")] + [TlcModule.SweepableDiscreteParam("NumTopic", new object[] { 20, 40, 100, 200 })] + public int NumTopic = 100; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on document-topic vectors")] + [TGUI(SuggestedSweeps = "1,10,100,200")] + [TlcModule.SweepableDiscreteParam("AlphaSum", new object[] { 1, 10, 100, 200 })] + public Single AlphaSum = 100; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on vocab-topic vectors")] + [TGUI(SuggestedSweeps = "0.01,0.015,0.07,0.02")] + [TlcModule.SweepableDiscreteParam("Beta", new object[] { 0.01f, 0.015f, 0.07f, 0.02f })] + public Single Beta = 0.01f; + + [Argument(ArgumentType.Multiple, HelpText = "Number of Metropolis Hasting step")] + [TGUI(SuggestedSweeps = "2,4,8,16")] + [TlcModule.SweepableDiscreteParam("Mhstep", new object[] { 2, 4, 8, 16 })] + public int Mhstep = 4; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter")] + [TGUI(SuggestedSweeps = "100,200,300,400")] + [TlcModule.SweepableDiscreteParam("NumIterations", new object[] { 100, 200, 300, 400 })] + public int NumIterations = 200; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Compute log likelihood over local dataset on this iteration interval", ShortName = "llInterval")] + public int LikelihoodInterval = 5; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The threshold of maximum count of tokens per doc", ShortName = "maxNumToken", SortOrder = 50)] + public int NumMaxDocToken = 512; + + // REVIEW: Should change the default when multi-threading support is optimized. + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of training threads. Default value depends on number of logical processors.", ShortName = "t", SortOrder = 50)] + public int? 
NumThreads; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of words to summarize the topic", ShortName = "ns")] + public int NumSummaryTermPerTopic = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of burn-in iterations", ShortName = "burninIter")] + [TGUI(SuggestedSweeps = "10,20,30,40")] + [TlcModule.SweepableDiscreteParam("NumBurninIterations", new object[] { 10, 20, 30, 40 })] + public int NumBurninIterations = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")] + public bool ResetRandomGenerator; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to output the topic-word summary in text format", ShortName = "summary")] + public bool OutputTopicWordSummary; + } + + public sealed class Column : OneToOneColumn + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of topics in the LDA")] + public int? NumTopic; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on document-topic vectors")] + public Single? AlphaSum; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on vocab-topic vectors")] + public Single? Beta; + + [Argument(ArgumentType.Multiple, HelpText = "Number of Metropolis Hasting step")] + public int? Mhstep; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter")] + public int? NumIterations; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Compute log likelihood over local dataset on this iteration interval", ShortName = "llInterval")] + public int? LikelihoodInterval; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of training threads", ShortName = "t")] + public int? NumThreads; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The threshold of maximum count of tokens per doc", ShortName = "maxNumToken")] + public int? NumMaxDocToken; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of words to summarize the topic", ShortName = "ns")] + public int? NumSummaryTermPerTopic; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of burn-in iterations", ShortName = "burninIter")] + public int? NumBurninIterations = 10; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")] + public bool? ResetRandomGenerator; + + public static Column Parse(string str) + { + Contracts.AssertNonEmpty(str); + + var res = new Column(); + if (res.TryParse(str)) + return res; + return null; + } + + public bool TryUnparse(StringBuilder sb) + { + Contracts.AssertValue(sb); + if (NumTopic != null || AlphaSum != null || Beta != null || Mhstep != null || NumIterations != null || LikelihoodInterval != null || + NumThreads != null || NumMaxDocToken != null || NumSummaryTermPerTopic != null || ResetRandomGenerator != null) + return false; + return TryUnparseCore(sb); + } + } + + private sealed class ColInfoEx + { + public readonly int NumTopic; + public readonly Single AlphaSum; + public readonly Single Beta; + public readonly int MHStep; + public readonly int NumIter; + public readonly int LikelihoodInterval; + public readonly int NumThread; + public readonly int NumMaxDocToken; + public readonly int NumSummaryTermPerTopic; + public readonly int NumBurninIter; + public readonly bool ResetRandomGenerator; + + public ColInfoEx(IExceptionContext ectx, Column item, Arguments args) + { + Contracts.AssertValue(ectx); + + NumTopic = item.NumTopic ?? 
args.NumTopic; + Contracts.CheckUserArg(NumTopic > 0, nameof(item.NumTopic), "Must be positive."); + + AlphaSum = item.AlphaSum ?? args.AlphaSum; + + Beta = item.Beta ?? args.Beta; + + MHStep = item.Mhstep ?? args.Mhstep; + ectx.CheckUserArg(MHStep > 0, nameof(item.Mhstep), "Must be positive."); + + NumIter = item.NumIterations ?? args.NumIterations; + ectx.CheckUserArg(NumIter > 0, nameof(item.NumIterations), "Must be positive."); + + LikelihoodInterval = item.LikelihoodInterval ?? args.LikelihoodInterval; + ectx.CheckUserArg(LikelihoodInterval > 0, nameof(item.LikelihoodInterval), "Must be positive."); + + NumThread = item.NumThreads ?? args.NumThreads ?? 0; + ectx.CheckUserArg(NumThread >= 0, nameof(item.NumThreads), "Must be positive or zero."); + + NumMaxDocToken = item.NumMaxDocToken ?? args.NumMaxDocToken; + ectx.CheckUserArg(NumMaxDocToken > 0, nameof(item.NumMaxDocToken), "Must be positive."); + + NumSummaryTermPerTopic = item.NumSummaryTermPerTopic ?? args.NumSummaryTermPerTopic; + ectx.CheckUserArg(NumSummaryTermPerTopic > 0, nameof(item.NumSummaryTermPerTopic), "Must be positive"); + + NumBurninIter = item.NumBurninIterations ?? args.NumBurninIterations; + ectx.CheckUserArg(NumBurninIter >= 0, nameof(item.NumBurninIterations), "Must be non-negative."); + + ResetRandomGenerator = item.ResetRandomGenerator ?? args.ResetRandomGenerator; + } + + public ColInfoEx(IExceptionContext ectx, ModelLoadContext ctx) + { + Contracts.AssertValue(ectx); + ectx.AssertValue(ctx); + + // *** Binary format *** + // int NumTopic; + // Single AlphaSum; + // Single Beta; + // int MHStep; + // int NumIter; + // int LikelihoodInterval; + // int NumThread; + // int NumMaxDocToken; + // int NumSummaryTermPerTopic; + // int NumBurninIter; + // byte ResetRandomGenerator; + + NumTopic = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumTopic > 0); + + AlphaSum = ctx.Reader.ReadSingle(); + + Beta = ctx.Reader.ReadSingle(); + + MHStep = ctx.Reader.ReadInt32(); + ectx.CheckDecode(MHStep > 0); + + NumIter = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumIter > 0); + + LikelihoodInterval = ctx.Reader.ReadInt32(); + ectx.CheckDecode(LikelihoodInterval > 0); + + NumThread = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumThread >= 0); + + NumMaxDocToken = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumMaxDocToken > 0); + + NumSummaryTermPerTopic = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumSummaryTermPerTopic > 0); + + NumBurninIter = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumBurninIter >= 0); + + ResetRandomGenerator = ctx.Reader.ReadBoolByte(); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int NumTopic; + // Single AlphaSum; + // Single Beta; + // int MHStep; + // int NumIter; + // int LikelihoodInterval; + // int NumThread; + // int NumMaxDocToken; + // int NumSummaryTermPerTopic; + // int NumBurninIter; + // byte ResetRandomGenerator; + + ctx.Writer.Write(NumTopic); + ctx.Writer.Write(AlphaSum); + ctx.Writer.Write(Beta); + ctx.Writer.Write(MHStep); + ctx.Writer.Write(NumIter); + ctx.Writer.Write(LikelihoodInterval); + ctx.Writer.Write(NumThread); + ctx.Writer.Write(NumMaxDocToken); + ctx.Writer.Write(NumSummaryTermPerTopic); + ctx.Writer.Write(NumBurninIter); + ctx.Writer.WriteBoolByte(ResetRandomGenerator); + } + } + + public const string LoaderSignature = "LdaTransform"; + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "LIGHTLDA", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 
0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + private readonly ColInfoEx[] _exes; + private readonly LdaState[] _ldas; + private readonly ColumnType[] _types; + private readonly bool _saveText; + + private const string RegistrationName = "LightLda"; + private const string WordTopicModelFilename = "word_topic_summary.txt"; + internal const string Summary = "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation."; + internal const string UserName = "Latent Dirichlet Allocation Transform"; + internal const string ShortName = "LightLda"; + + public LdaTransform(IHostEnvironment env, Arguments args, IDataView input) + : base(env, RegistrationName, args.Column, input, TestType) + { + Host.CheckValue(args, nameof(args)); + Host.CheckUserArg(args.NumTopic > 0, nameof(args.NumTopic), "Must be positive."); + Host.CheckValue(input, nameof(input)); + Host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column)); + _exes = new ColInfoEx[Infos.Length]; + _types = new ColumnType[Infos.Length]; + _ldas = new LdaState[Infos.Length]; + _saveText = args.OutputTopicWordSummary; + for (int i = 0; i < Infos.Length; i++) + { + var ex = new ColInfoEx(Host, args.Column[i], args); + _exes[i] = ex; + _types[i] = new VectorType(NumberType.Float, ex.NumTopic); + } + using (var ch = Host.Start("Train")) + { + Train(ch, input, _ldas); + ch.Done(); + } + Metadata.Seal(); + } + + private void Dispose(bool disposing) + { + if (_ldas != null) + { + foreach (var state in _ldas) + state?.Dispose(); + } + if (disposing) + GC.SuppressFinalize(this); + } + + public void Dispose() + { + Dispose(true); + } + + ~LdaTransform() + { + Dispose(false); + } + + private LdaTransform(IHost host, ModelLoadContext ctx, IDataView input) + : base(host, ctx, input, TestType) + { + Host.AssertValue(ctx); + + // *** Binary format *** + // + // + // ldaState[num infos]: The LDA parameters + + // Note: infos.length would be just one in most cases. 
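+            // Each LdaState is deserialized in column order; its ColInfoEx and the output vector type (one float slot per topic) are recovered from the saved state below.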
+ _exes = new ColInfoEx[Infos.Length]; + _ldas = new LdaState[Infos.Length]; + _types = new ColumnType[Infos.Length]; + for (int i = 0; i < _ldas.Length; i++) + { + _ldas[i] = new LdaState(Host, ctx); + _exes[i] = _ldas[i].InfoEx; + _types[i] = new VectorType(NumberType.Float, _ldas[i].InfoEx.NumTopic); + } + using (var ent = ctx.Repository.OpenEntryOrNull("model", WordTopicModelFilename)) + { + _saveText = ent != null; + } + Metadata.Seal(); + } + + public static LdaTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + var h = env.Register(RegistrationName); + + h.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + h.CheckValue(input, nameof(input)); + + return h.Apply( + "Loading Model", + ch => + { + // *** Binary Format *** + // int: sizeof(Float) + // + int cbFloat = ctx.Reader.ReadInt32(); + h.CheckDecode(cbFloat == sizeof(Float)); + return new LdaTransform(h, ctx, input); + }); + } + + public string GetTopicSummary() + { + StringWriter writer = new StringWriter(); + VBuffer slotNames = default(VBuffer); + for (int i = 0; i < _ldas.Length; i++) + { + GetSlotNames(i, ref slotNames); + _ldas[i].GetTopicSummaryWriter(slotNames)(writer); + writer.WriteLine(); + } + return writer.ToString(); + } + + public override void Save(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // int: sizeof(Float) + // + // ldaState[num infos]: The LDA parameters + + ctx.Writer.Write(sizeof(Float)); + SaveBase(ctx); + Host.Assert(_ldas.Length == Infos.Length); + VBuffer slotNames = default(VBuffer); + for (int i = 0; i < _ldas.Length; i++) + { + GetSlotNames(i, ref slotNames); + _ldas[i].Save(ctx, _saveText, slotNames); + } + } + + private void GetSlotNames(int iinfo, ref VBuffer dst) + { + Host.Assert(0 <= iinfo && iinfo < Infos.Length); + if (Source.Schema.HasSlotNames(Infos[iinfo].Source, Infos[iinfo].TypeSrc.ValueCount)) + Source.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, Infos[iinfo].Source, ref dst); + else + dst = default(VBuffer); + } + + private static string TestType(ColumnType t) + { + // LDA consumes term frequency vectors, so I am assuming VBuffer is an appropriate input type. + // It must also be of known size for the sake of the LDA trainer initialization. + if (t.IsKnownSizeVector && t.ItemType is NumberType) + return null; + return "Expected vector of number type of known size."; + } + + private static int GetFrequency(double value) + { + int result = (int)value; + if (!(result == value && result >= 0)) + return -1; + return result; + } + + private void Train(IChannel ch, IDataView trainingData, LdaState[] states) + { + Host.AssertValue(ch); + ch.AssertValue(trainingData); + ch.AssertValue(states); + ch.Assert(states.Length == Infos.Length); + + bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; + int[] numVocabs = new int[Infos.Length]; + + for (int i = 0; i < Infos.Length; i++) + { + activeColumns[Infos[i].Source] = true; + numVocabs[i] = 0; + } + + //the current lda needs the memory allocation before feedin data, so needs two sweeping of the data, + //one for the pre-calc memory, one for feedin data really + //another solution can be prepare these two value externally and put them in the beginning of the input file. 
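+            // First sweep: per column, count the documents and the corpus size (2 * tokens + 1 per document, the extra slot being the per-doc cursor) so the native engine can allocate its buffers up front.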
+ long[] corpusSize = new long[Infos.Length]; + int[] numDocArray = new int[Infos.Length]; + + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) + { + var getters = new ValueGetter>[Utils.Size(Infos)]; + for (int i = 0; i < Infos.Length; i++) + { + corpusSize[i] = 0; + numDocArray[i] = 0; + getters[i] = RowCursorUtils.GetVecGetterAs(NumberType.R8, cursor, Infos[i].Source); + } + VBuffer src = default(VBuffer); + long rowCount = 0; + + while (cursor.MoveNext()) + { + ++rowCount; + for (int i = 0; i < Infos.Length; i++) + { + int docSize = 0; + getters[i](ref src); + + // compute term, doc instance#. + for (int termID = 0; termID < src.Count; termID++) + { + int termFreq = GetFrequency(src.Values[termID]); + if (termFreq < 0) + { + // Ignore this row. + docSize = 0; + break; + } + + if (docSize >= _exes[i].NumMaxDocToken - termFreq) + break; //control the document length + + //if legal then add the term + docSize += termFreq; + } + + // Ignore empty doc + if (docSize == 0) + continue; + + numDocArray[i]++; + corpusSize[i] += docSize * 2 + 1; // in the beggining of each doc, there is a cursor variable + + // increase numVocab if needed. + if (numVocabs[i] < src.Length) + numVocabs[i] = src.Length; + } + } + + for (int i = 0; i < Infos.Length; ++i) + { + if (numDocArray[i] != rowCount) + { + ch.Assert(numDocArray[i] < rowCount); + ch.Warning($"Column '{Infos[i].Name}' has skipped {rowCount - numDocArray[i]} of {rowCount} rows either empty or with negative, non-finite, or fractional values."); + } + } + } + + // Initialize all LDA states + for (int i = 0; i < Infos.Length; i++) + { + var state = new LdaState(Host, _exes[i], numVocabs[i]); + if (numDocArray[i] == 0 || corpusSize[i] == 0) + throw ch.Except("The specified documents are all empty in column '{0}'.", Infos[i].Name); + + state.AllocateDataMemory(numDocArray[i], corpusSize[i]); + states[i] = state; + } + + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) + { + int[] docSizeCheck = new int[Infos.Length]; + // This could be optimized so that if multiple trainers consume the same column, it is + // fed into the train method once. 
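+                // Second sweep: feed every document into its column's LdaState; docSizeCheck verifies that the totals match the sizes computed during the first sweep.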
+ var getters = new ValueGetter>[Utils.Size(Infos)]; + for (int i = 0; i < Infos.Length; i++) + { + docSizeCheck[i] = 0; + getters[i] = RowCursorUtils.GetVecGetterAs(NumberType.R8, cursor, Infos[i].Source); + } + + VBuffer src = default(VBuffer); + + while (cursor.MoveNext()) + { + for (int i = 0; i < Infos.Length; i++) + { + getters[i](ref src); + docSizeCheck[i] += states[i].FeedTrain(Host, ref src); + } + } + for (int i = 0; i < Infos.Length; i++) + { + Host.Assert(corpusSize[i] == docSizeCheck[i]); + states[i].CompleteTrain(); + } + } + } + + private sealed class LdaState : IDisposable + { + public readonly ColInfoEx InfoEx; + private readonly int _numVocab; + private readonly object _preparationSyncRoot; + private readonly object _testSyncRoot; + private bool _predictionPreparationDone; + private LdaSingleBox _ldaTrainer; + + private LdaState() + { + _preparationSyncRoot = new object(); + _testSyncRoot = new object(); + } + + public LdaState(IExceptionContext ectx, ColInfoEx ex, int numVocab) + : this() + { + Contracts.AssertValue(ectx); + ectx.AssertValue(ex, "ex"); + + ectx.Assert(numVocab >= 0); + InfoEx = ex; + _numVocab = numVocab; + + _ldaTrainer = new LdaSingleBox( + InfoEx.NumTopic, + numVocab, /* Need to set number of vocabulary here */ + InfoEx.AlphaSum, + InfoEx.Beta, + InfoEx.NumIter, + InfoEx.LikelihoodInterval, + InfoEx.NumThread, + InfoEx.MHStep, + InfoEx.NumSummaryTermPerTopic, + false, + InfoEx.NumMaxDocToken); + } + + public LdaState(IExceptionContext ectx, ModelLoadContext ctx) + : this() + { + ectx.AssertValue(ctx); + + // *** Binary format *** + // + // int: vocabnum + // long: memblocksize + // long: aliasMemBlockSize + // (serializing term by term, for one term) + // int: term_id, int: topic_num, KeyValuePair[]: termTopicVector + + InfoEx = new ColInfoEx(ectx, ctx); + + _numVocab = ctx.Reader.ReadInt32(); + ectx.CheckDecode(_numVocab > 0); + + long memBlockSize = ctx.Reader.ReadInt64(); + ectx.CheckDecode(memBlockSize > 0); + + long aliasMemBlockSize = ctx.Reader.ReadInt64(); + ectx.CheckDecode(aliasMemBlockSize > 0); + + _ldaTrainer = new LdaSingleBox( + InfoEx.NumTopic, + _numVocab, /* Need to set number of vocabulary here */ + InfoEx.AlphaSum, + InfoEx.Beta, + InfoEx.NumIter, + InfoEx.LikelihoodInterval, + InfoEx.NumThread, + InfoEx.MHStep, + InfoEx.NumSummaryTermPerTopic, + false, + InfoEx.NumMaxDocToken); + + _ldaTrainer.AllocateModelMemory(_numVocab, InfoEx.NumTopic, memBlockSize, aliasMemBlockSize); + + for (int i = 0; i < _numVocab; i++) + { + int termID = ctx.Reader.ReadInt32(); + ectx.CheckDecode(termID >= 0); + int termTopicNum = ctx.Reader.ReadInt32(); + ectx.CheckDecode(termTopicNum >= 0); + + int[] topicId = new int[termTopicNum]; + int[] topicProb = new int[termTopicNum]; + + for (int j = 0; j < termTopicNum; j++) + { + topicId[j] = ctx.Reader.ReadInt32(); + topicProb[j] = ctx.Reader.ReadInt32(); + } + + //set the topic into _ldaTrainer inner topic table + _ldaTrainer.SetModel(termID, topicId, topicProb, termTopicNum); + } + + //do the preparation + if (!_predictionPreparationDone) + { + _ldaTrainer.InitializeBeforeTest(); + _predictionPreparationDone = true; + } + } + + public Action GetTopicSummaryWriter(VBuffer mapping) + { + Action writeAction; + + if (mapping.Length == 0) + { + writeAction = + writer => + { + for (int i = 0; i < _ldaTrainer.NumTopic; i++) + { + KeyValuePair[] topicSummaryVector = _ldaTrainer.GetTopicSummary(i); + writer.Write("{0}\t{1}\t", i, topicSummaryVector.Length); + foreach (KeyValuePair p in topicSummaryVector) + 
writer.Write("{0}:{1}\t", p.Key, p.Value); + writer.WriteLine(); + } + }; + } + else + { + writeAction = + writer => + { + DvText slotName = default(DvText); + for (int i = 0; i < _ldaTrainer.NumTopic; i++) + { + KeyValuePair[] topicSummaryVector = _ldaTrainer.GetTopicSummary(i); + writer.Write("{0}\t{1}\t", i, topicSummaryVector.Length); + foreach (KeyValuePair p in topicSummaryVector) + { + mapping.GetItemOrDefault(p.Key, ref slotName); + writer.Write("{0}[{1}]:{2}\t", p.Key, slotName, p.Value); + } + writer.WriteLine(); + } + }; + } + + return writeAction; + } + + public void Save(ModelSaveContext ctx, bool saveText, VBuffer mapping) + { + Contracts.AssertValue(ctx); + long memBlockSize = 0; + long aliasMemBlockSize = 0; + _ldaTrainer.GetModelStat(out memBlockSize, out aliasMemBlockSize); + + // *** Binary format *** + // + // int: vocabnum + // long: memblocksize + // long: aliasMemBlockSize + // (serializing term by term, for one term) + // int: term_id, int: topic_num, KeyValuePair[]: termTopicVector + + InfoEx.Save(ctx); + ctx.Writer.Write(_ldaTrainer.NumVocab); + ctx.Writer.Write(memBlockSize); + ctx.Writer.Write(aliasMemBlockSize); + + //save model from this interface + for (int i = 0; i < _ldaTrainer.NumVocab; i++) + { + KeyValuePair[] termTopicVector = _ldaTrainer.GetModel(i); + + //write the topic to disk through ctx + ctx.Writer.Write(i); //term_id + ctx.Writer.Write(termTopicVector.Length); + + foreach (KeyValuePair p in termTopicVector) + { + ctx.Writer.Write(p.Key); + ctx.Writer.Write(p.Value); + } + } + + var writeAction = GetTopicSummaryWriter(mapping); + + // save word-topic summary in text + if (saveText) + ctx.SaveTextStream(WordTopicModelFilename, writeAction); + } + + public void AllocateDataMemory(int docNum, long corpusSize) + { + _ldaTrainer.AllocateDataMemory(docNum, corpusSize); + } + + public int FeedTrain(IExceptionContext ectx, ref VBuffer input) + { + Contracts.AssertValue(ectx); + + // REVIEW: Input the counts to your trainer here. This + // is called multiple times. + + int docSize = 0; + int termNum = 0; + + for (int i = 0; i < input.Count; i++) + { + int termFreq = GetFrequency(input.Values[i]); + if (termFreq < 0) + { + // Ignore this row. + return 0; + } + if (docSize >= InfoEx.NumMaxDocToken - termFreq) + break; + + // If legal then add the term. + docSize += termFreq; + termNum++; + } + + // Ignore empty doc. + if (docSize == 0) + return 0; + + int actualSize = 0; + if (input.IsDense) + actualSize = _ldaTrainer.LoadDocDense(input.Values, termNum, input.Length); + else + actualSize = _ldaTrainer.LoadDoc(input.Indices, input.Values, termNum, input.Length); + + ectx.Assert(actualSize == 2 * docSize + 1, string.Format("The doc size are distinct. Actual: {0}, Expected: {1}", actualSize, 2 * docSize + 1)); + return actualSize; + } + + public void CompleteTrain() + { + //allocate all kinds of in memory sample tables + _ldaTrainer.InitializeBeforeTrain(); + + //call native lda trainer to perform the multi-thread training + _ldaTrainer.Train(""); /* Need to pass in an empty string */ + } + + public void Output(ref VBuffer src, ref VBuffer dst, int numBurninIter, bool reset) + { + // Prediction for a single document. + // LdaSingleBox.InitializeBeforeTest() is NOT thread-safe. 
+ if (!_predictionPreparationDone) + { + lock (_preparationSyncRoot) + { + if (!_predictionPreparationDone) + { + //do some preparation for building tables in native c++ + _ldaTrainer.InitializeBeforeTest(); + _predictionPreparationDone = true; + } + } + } + + int len = InfoEx.NumTopic; + var values = dst.Values; + var indices = dst.Indices; + if (src.Count == 0) + { + dst = new VBuffer(len, 0, values, indices); + return; + } + + // Make sure all the frequencies are valid and truncate if the sum gets too large. + int docSize = 0; + int termNum = 0; + for (int i = 0; i < src.Count; i++) + { + int termFreq = GetFrequency(src.Values[i]); + if (termFreq < 0) + { + // REVIEW: Should this log a warning message? And what should it produce? + // It currently produces a vbuffer of all NA values. + // REVIEW: Need a utility method to do this... + if (Utils.Size(values) < len) + values = new Float[len]; + for (int k = 0; k < len; k++) + values[k] = Float.NaN; + dst = new VBuffer(len, values, indices); + return; + } + + if (docSize >= InfoEx.NumMaxDocToken - termFreq) + break; + + docSize += termFreq; + termNum++; + } + + // REVIEW: Too much memory allocation here on each prediction. + List> retTopics; + if (src.IsDense) + retTopics = _ldaTrainer.TestDocDense(src.Values, termNum, numBurninIter, reset); + else + retTopics = _ldaTrainer.TestDoc(src.Indices.Take(src.Count).ToArray(), src.Values.Take(src.Count).ToArray(), termNum, numBurninIter, reset); + + int count = retTopics.Count; + Contracts.Assert(count <= len); + if (Utils.Size(values) < count) + values = new Float[count]; + if (count < len && Utils.Size(indices) < count) + indices = new int[count]; + + double normalizer = 0; + for (int i = 0; i < count; i++) + { + int index = retTopics[i].Key; + Float value = retTopics[i].Value; + Contracts.Assert(value >= 0); + Contracts.Assert(0 <= index && index < len); + if (count < len) + { + Contracts.Assert(i == 0 || indices[i - 1] < index); + indices[i] = index; + } + else + Contracts.Assert(index == i); + + values[i] = value; + normalizer += value; + } + + if (normalizer > 0) + { + for (int i = 0; i < count; i++) + values[i] = (Float)(values[i] / normalizer); + } + dst = new VBuffer(len, count, values, indices); + } + + public void Dispose() + { + _ldaTrainer.Dispose(); + } + } + + private ColumnType[] InitColumnTypes(int numTopics) + { + Host.Assert(Utils.Size(Infos) > 0); + var types = new ColumnType[Infos.Length]; + for (int c = 0; c < Infos.Length; c++) + types[c] = new VectorType(NumberType.Float, numTopics); + return types; + } + + protected override ColumnType GetColumnTypeCore(int iinfo) + { + Host.Assert(0 <= iinfo & iinfo < Utils.Size(_types)); + return _types[iinfo]; + } + + protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) + { + Host.AssertValueOrNull(ch); + Host.AssertValue(input); + Host.Assert(0 <= iinfo && iinfo < Infos.Length); + disposer = null; + + return GetTopic(input, iinfo); + } + + private ValueGetter> GetTopic(IRow input, int iinfo) + { + var getSrc = RowCursorUtils.GetVecGetterAs(NumberType.R8, input, Infos[iinfo].Source); + var src = default(VBuffer); + var lda = _ldas[iinfo]; + int numBurninIter = lda.InfoEx.NumBurninIter; + bool reset = lda.InfoEx.ResetRandomGenerator; + return + (ref VBuffer dst) => + { + // REVIEW: This will work, but there are opportunities for caching + // based on input.Counter that are probably worthwhile given how long inference takes. 
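+                    // Pull the source term-frequency vector and run the configured burn-in iterations in the native sampler to produce this document's topic distribution.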
+ getSrc(ref src); + lda.Output(ref src, ref dst, numBurninIter, reset); + }; + } + } +} diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index fd365acfbe..0f8fefb267 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -958,6 +958,18 @@ public void Add(Microsoft.ML.Transforms.LabelToFloatConverter input, Microsoft.M _jsonNodes.Add(Serialize("Transforms.LabelToFloatConverter", input, output)); } + public Microsoft.ML.Transforms.LightLda.Output Add(Microsoft.ML.Transforms.LightLda input) + { + var output = new Microsoft.ML.Transforms.LightLda.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Transforms.LightLda input, Microsoft.ML.Transforms.LightLda.Output output) + { + _jsonNodes.Add(Serialize("Transforms.LightLda", input, output)); + } + public Microsoft.ML.Transforms.LogMeanVarianceNormalizer.Output Add(Microsoft.ML.Transforms.LogMeanVarianceNormalizer input) { var output = new Microsoft.ML.Transforms.LogMeanVarianceNormalizer.Output(); @@ -10506,6 +10518,246 @@ public LabelToFloatConverterPipelineStep(Output output) } } + namespace Transforms + { + + public sealed partial class LdaTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// The number of topics in the LDA + /// + public int? NumTopic { get; set; } + + /// + /// Dirichlet prior on document-topic vectors + /// + public float? AlphaSum { get; set; } + + /// + /// Dirichlet prior on vocab-topic vectors + /// + public float? Beta { get; set; } + + /// + /// Number of Metropolis Hasting step + /// + public int? Mhstep { get; set; } + + /// + /// Number of iterations + /// + public int? NumIterations { get; set; } + + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int? LikelihoodInterval { get; set; } + + /// + /// The number of training threads + /// + public int? NumThreads { get; set; } + + /// + /// The threshold of maximum count of tokens per doc + /// + public int? NumMaxDocToken { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int? NumSummaryTermPerTopic { get; set; } + + /// + /// The number of burn-in iterations + /// + public int? NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool? ResetRandomGenerator { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. + /// + public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public LightLda() + { + } + + public LightLda(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public LightLda(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:srcs) + /// + public LdaTransformColumn[] Column { get; set; } + + /// + /// The number of topics in the LDA + /// + [TlcModule.SweepableDiscreteParamAttribute("NumTopic", new object[]{20, 40, 100, 200})] + public int NumTopic { get; set; } = 100; + + /// + /// Dirichlet prior on document-topic vectors + /// + [TlcModule.SweepableDiscreteParamAttribute("AlphaSum", new object[]{1, 10, 100, 200})] + public float AlphaSum { get; set; } = 100f; + + /// + /// Dirichlet prior on vocab-topic vectors + /// + [TlcModule.SweepableDiscreteParamAttribute("Beta", new object[]{0.01f, 0.015f, 0.07f, 0.02f})] + public float Beta { get; set; } = 0.01f; + + /// + /// Number of Metropolis Hasting step + /// + [TlcModule.SweepableDiscreteParamAttribute("Mhstep", new object[]{2, 4, 8, 16})] + public int Mhstep { get; set; } = 4; + + /// + /// Number of iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{100, 200, 300, 400})] + public int NumIterations { get; set; } = 200; + + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int LikelihoodInterval { get; set; } = 5; + + /// + /// The threshold of maximum count of tokens per doc + /// + public int NumMaxDocToken { get; set; } = 512; + + /// + /// The number of training threads. Default value depends on number of logical processors. + /// + public int? 
NumThreads { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int NumSummaryTermPerTopic { get; set; } = 10; + + /// + /// The number of burn-in iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumBurninIterations", new object[]{10, 20, 30, 40})] + public int NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool ResetRandomGenerator { get; set; } = false; + + /// + /// Whether to output the topic-word summary in text format + /// + public bool OutputTopicWordSummary { get; set; } = false; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(LightLda)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new LightLdaPipelineStep(output); + } + + private class LightLdaPipelineStep : ILearningPipelineDataStep + { + public LightLdaPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + namespace Transforms { diff --git a/src/Native/CMakeLists.txt b/src/Native/CMakeLists.txt index 947a664ab6..d8f963e44e 100644 --- a/src/Native/CMakeLists.txt +++ b/src/Native/CMakeLists.txt @@ -179,4 +179,5 @@ function(install_library_and_symbols targetName) endfunction() add_subdirectory(CpuMathNative) -add_subdirectory(FastTreeNative) \ No newline at end of file +add_subdirectory(FastTreeNative) +add_subdirectory(LdaNative) \ No newline at end of file diff --git a/src/Native/LdaNative/CMakeLists.txt b/src/Native/LdaNative/CMakeLists.txt new file mode 100644 index 0000000000..f2e1d340de --- /dev/null +++ b/src/Native/LdaNative/CMakeLists.txt @@ -0,0 +1,19 @@ +project (LdaNative) + +set(SOURCES + alias_multinomial_rng_int.cpp + data_block.cpp + hybrid_alias_map.cpp + hybrid_map.cpp + lda_document.cpp + lda_engine.cpp + lda_engine_export.cpp + light_doc_sampler.cpp + light_hash_map.cpp + model_block.cpp + utils.cpp +) + +add_library(LdaNative SHARED ${SOURCES} ${RESOURCES}) + +install_library_and_symbols (LdaNative) diff --git a/src/Native/LdaNative/alias_multinomial_rng_int.cpp b/src/Native/LdaNative/alias_multinomial_rng_int.cpp new file mode 100644 index 0000000000..a945feb6be --- /dev/null +++ b/src/Native/LdaNative/alias_multinomial_rng_int.cpp @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
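+// Implements Next() for the integer alias table (Marsaglia's fast generation of discrete random variables, see the header):
+// a single uniform draw is mapped to a topic in O(1) via the precomputed k/v entries.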
+ +#include "alias_multinomial_rng_int.hpp" +#include "rand_int_rng.h" +#include +#include +#include +#include + +namespace wood +{ + AliasMultinomialRNGInt::AliasMultinomialRNGInt() + : n_(-1), internal_memory_(nullptr) + { + + } + AliasMultinomialRNGInt::~AliasMultinomialRNGInt() + { + if (internal_memory_ != nullptr) + { + delete[]internal_memory_; + } + } + + int32_t AliasMultinomialRNGInt::Next(xorshift_rng& rng, std::vector& alias_kv) + { + // NOTE: stl uniform_real_distribution generates the highest quality random numbers + // yet, the other two are much faster + auto sample = rng.rand(); + + // NOTE: use std::floor is too slow + // here we guarantee sample * n_ is nonnegative, this makes cast work + int idx = sample / a_int_; + + if (n_ <= idx) + { + idx = n_ - 1; + } + + // the following code is equivalent to + // return sample < V_[idx] ? idx : K_[idx]; + // but faster, see + // http://stackoverflow.com/questions/6754454/speed-difference-between-if-else-and-ternary-operator-in-c + int m = -(sample < alias_kv[idx].v_); + return (idx & m) | (alias_kv[idx].k_ & ~m); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/alias_multinomial_rng_int.hpp b/src/Native/LdaNative/alias_multinomial_rng_int.hpp new file mode 100644 index 0000000000..e25bc5bc17 --- /dev/null +++ b/src/Native/LdaNative/alias_multinomial_rng_int.hpp @@ -0,0 +1,454 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "rand_int_rng.h" +#include +#include +/* +Algorithm described in +http://www.jstatsoft.org/v11/i03/paper +George Marsaglia +Fast generation of discrete random variables +*/ +namespace wood +{ + struct alias_k_v + { + int32_t k_; + int32_t v_; + }; + + class AliasMultinomialRNGInt + { + public: + AliasMultinomialRNGInt(); + ~AliasMultinomialRNGInt(); + + void Init(int K) + { + L_.resize(K); + H_.resize(K); + proportion_int_.resize(K); + internal_memory_ = new int32_t[2 * K]; + } + + void SetProportionMass(std::vector &proportion, + float mass, + std::vector &alias_kv, + int32_t *height, + xorshift_rng &rng) + { + n_ = (int32_t)proportion.size(); //proportion number should be kept within 2Billion + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; //use int64_t to avoid overflowing + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + for (int i = 0; i < n_; ++i) + { + alias_kv[i].k_ = i; + alias_kv[i].v_ = (i + 1) * a_int_; + } + + int32_t L_head = 0; + int32_t L_tail = 0; + + int32_t H_head = 0; + int32_t H_tail = 0; + + for (auto i = 0; i < proportion_int_.size(); ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + alias_kv[i_pi.first].k_ = h_ph.first; + alias_kv[i_pi.first].v_ = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + alias_kv[first].k_ = first; + alias_kv[first].v_ = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + alias_kv[first].k_ = first; + alias_kv[first].v_ = first * a_int_ + second; + ++H_head; + } + + } + + inline void SetProportionMass(std::vector &proportion, + float mass, + int32_t* memory, + int32_t *height, + xorshift_rng &rng) + { + n_ = (int32_t)proportion.size(); + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + for (int i = 0; i < n_; ++i) + { + int32_t *p = internal_memory_ + 2 * i; + *p = i; p++; + *p = (i + 1) * a_int_; + } + + int32_t L_head = 0; + int32_t L_tail = 0; + + int32_t H_head = 0; + int32_t H_tail = 0; + + for (auto i = 0; i < n_; ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + int32_t *p = internal_memory_ + 2 * i_pi.first; + *p = h_ph.first; p++; + *p = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + + int32_t *p = internal_memory_ + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + + int32_t *p = internal_memory_ + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++H_head; + } + memcpy(memory, internal_memory_, sizeof(int32_t)* 2 * n_); + } + + inline void SetProportionMass(std::vector &proportion, + int32_t size, + float mass, + int32_t* memory, + int32_t *height, + xorshift_rng &rng, + int32_t word_id) + { + n_ = size; + + mass_int_ = 0x7fffffff; + a_int_ = mass_int_ / n_; + mass_int_ = a_int_ * n_; + *height = a_int_; + + int64_t mass_sum = 0; + for (int i = 0; i < n_; ++i) + { + proportion[i] /= mass; + proportion_int_[i] = (int32_t)(proportion[i] * mass_int_); + mass_sum += proportion_int_[i]; + } + + if (mass_sum > mass_int_) + { + //Todo: is this data type safe? more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_sum - mass_int_); + + int i = 0; + int id = 0; + int r = 0; + while (i < more) + { + if (proportion_int_[id] >= 1) + { + proportion_int_[id]--; + ++i; + } + id = (id + 1) % n_; + } + } + + if (mass_sum < mass_int_) + { + //Todo: is this data type safe? 
more is int and mass_sum is in64 + int32_t more = (int32_t)(mass_int_ - mass_sum); + + int i = 0; + int id = 0; + while (i < more) + { + proportion_int_[id]++; + id = (id + 1) % n_; + i++; + } + } + + int32_t L_head = 0; + int32_t L_tail = 0; + int32_t H_head = 0; + int32_t H_tail = 0; + + for (int i = 0; i < n_; ++i) + { + int32_t *p = memory + 2 * i; + *p = i; p++; + *p = (i + 1) * a_int_; + } + + for (auto i = 0; i < n_; ++i) + { + auto val = proportion_int_[i]; + if (val < a_int_) + { + L_[L_tail].first = i; + L_[L_tail].second = val; + ++L_tail; + } + else + { + H_[H_tail].first = i; + H_[H_tail].second = val; + ++H_tail; + } + } + + assert(L_tail + H_tail == n_); + + while (L_head != L_tail && H_head != H_tail) + { + auto &i_pi = L_[L_head++]; + auto &h_ph = H_[H_head++]; + + int32_t *p = memory + 2 * i_pi.first; + *p = h_ph.first; p++; + *p = i_pi.first * a_int_ + i_pi.second; + + auto sum = h_ph.second + i_pi.second; + if (sum > 2 * a_int_) + { + H_[H_tail].first = h_ph.first; + H_[H_tail].second = sum - a_int_; + ++H_tail; + } + else + { + L_[L_tail].first = h_ph.first; + L_[L_tail].second = sum - a_int_; + ++L_tail; + } + } + while (L_head != L_tail) + { + auto first = L_[L_head].first; + auto second = L_[L_head].second; + int32_t *p = memory + 2 * first; + *p = first; p++; + *p = first * a_int_ + second; + ++L_head; + } + while (H_head != H_tail) + { + auto first = H_[H_head].first; + auto second = H_[H_head].second; + int32_t *p = memory + 2 * first; + + *p = first; p++; + *p = first * a_int_ + second; + ++H_head; + } + } + + // Make sure to call SetProportion or SetProportionMass before calling Next + int32_t Next(xorshift_rng& rng, std::vector& alias_kv); + + private: + void GenerateAliasTable(std::vector& alias_kv); + + public: + AliasMultinomialRNGInt(const AliasMultinomialRNGInt &other) = delete; + AliasMultinomialRNGInt& operator=(const AliasMultinomialRNGInt &other) = delete; + + std::vector proportion_int_; + int32_t *internal_memory_; + + int32_t n_; + int32_t a_int_; + int32_t mass_int_; + + std::vector> L_; + std::vector> H_; + }; +} \ No newline at end of file diff --git a/src/Native/LdaNative/data_block.cpp b/src/Native/LdaNative/data_block.cpp new file mode 100644 index 0000000000..11b56b9ad7 --- /dev/null +++ b/src/Native/LdaNative/data_block.cpp @@ -0,0 +1,117 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
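The SetProportionMass overloads in alias_multinomial_rng_int.hpp above all follow the same recipe: quantize the proportions to integer masses that sum exactly to mass_int_, correct any rounding drift, then pair "low" bins (below the average a_int_) with "high" bins so that every slot ends up with an alias k_ and a keep-threshold v_, which is what makes Next an O(1) draw. For orientation only, here is a minimal floating-point sketch of that classic Walker/Vose construction; the names are hypothetical and it deliberately skips the integer bookkeeping used in the diff.

// Illustrative only: a minimal floating-point Walker/Vose alias table.
// The native code above does the same construction with int32 masses so
// that sampling needs only integer arithmetic; names here are hypothetical.
#include <cstdio>
#include <random>
#include <vector>

struct AliasTable {
    std::vector<double> prob;   // probability of keeping bucket i
    std::vector<int>    alias;  // bucket to fall through to otherwise

    explicit AliasTable(std::vector<double> w) {
        const int n = (int)w.size();
        prob.assign(n, 0.0);
        alias.assign(n, 0);
        double mass = 0.0;
        for (double x : w) mass += x;
        for (double& x : w) x = x * n / mass;      // scale so the average bin is 1
        std::vector<int> small, large;
        for (int i = 0; i < n; ++i) (w[i] < 1.0 ? small : large).push_back(i);
        while (!small.empty() && !large.empty()) {
            int s = small.back(); small.pop_back();
            int l = large.back(); large.pop_back();
            prob[s] = w[s];                        // keep s with probability w[s]
            alias[s] = l;                          // otherwise fall through to l
            w[l] = (w[l] + w[s]) - 1.0;            // move the excess onto l
            (w[l] < 1.0 ? small : large).push_back(l);
        }
        for (int i : small) prob[i] = 1.0;         // leftovers are exact
        for (int i : large) prob[i] = 1.0;
    }

    int Next(std::mt19937& rng) const {            // O(1) per draw
        std::uniform_int_distribution<int> pick(0, (int)prob.size() - 1);
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        int i = pick(rng);
        return coin(rng) < prob[i] ? i : alias[i];
    }
};

int main() {
    std::mt19937 rng(7);
    AliasTable t({0.1, 0.2, 0.3, 0.4});
    std::vector<int> hits(4, 0);
    for (int i = 0; i < 100000; ++i) ++hits[t.Next(rng)];
    for (int k = 0; k < 4; ++k) printf("topic %d: %d\n", k, hits[k]);
    return 0;
}

The integer variant in the diff folds both decisions into one draw: it stores v_ = slot * a_int_ + kept_mass per bin, so a single 31-bit random number selects the slot via sample / height and the keep-or-alias choice via sample < v_, with no floating-point work on the sampling path.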
+ +#include +#include "data_block.h" +#include "lda_document.h" + +namespace lda +{ + LDADataBlock::LDADataBlock(int32_t num_threads) : + num_threads_(num_threads), has_read_(false), index_document_(0), documents_buffer_(nullptr), offset_buffer_(nullptr) + { + } + + LDADataBlock::~LDADataBlock() + { + if (has_read_) + { + delete[] offset_buffer_; + delete[] documents_buffer_; + } + } + + void LDADataBlock::Clear() + { + has_read_ = false; + index_document_ = 0; + used_size_ = 0; + + num_documents_ = 0; + corpus_size_ = 0; + + if (offset_buffer_) + { + delete[]offset_buffer_; + offset_buffer_ = nullptr; + } + if (documents_buffer_) + { + delete[]documents_buffer_; + documents_buffer_ = nullptr; + } + } + + void LDADataBlock::Allocate(const int32_t num_document, const int64_t corpus_size) + { + num_documents_ = num_document; + corpus_size_ = corpus_size; + + offset_buffer_ = new int64_t[num_documents_ + 1]; // +1: one for the end of last document, + documents_buffer_ = new int32_t[corpus_size_]; + + index_document_ = 0; + used_size_ = 0; + + offset_buffer_[0] = 0; + } + + + //term_id, term_freq, term_num + int LDADataBlock::Add(int32_t* term_id, int32_t* term_freq, int32_t term_num) + { + int64_t data_length = 1; + + int64_t idx = offset_buffer_[index_document_] + 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + documents_buffer_[idx++] = term_id[i]; + documents_buffer_[idx++] = 0; + data_length += 2; + } + } + + index_document_++; + used_size_ += data_length; + + offset_buffer_[index_document_] = used_size_; + has_read_ = true; + + return (int)data_length; + } + + int LDADataBlock::AddDense(int32_t* term_freq, int32_t term_num) + { + int64_t data_length = 1; + + int64_t idx = offset_buffer_[index_document_] + 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + documents_buffer_[idx++] = i; + documents_buffer_[idx++] = 0; + data_length += 2; + } + } + + index_document_++; + used_size_ += data_length; + + offset_buffer_[index_document_] = used_size_; + has_read_ = true; + + return (int)data_length; + } + + std::shared_ptr LDADataBlock::GetOneDoc(int32_t index) const + { + std::shared_ptr returned_ptr( + new LDADocument(documents_buffer_ + offset_buffer_[index], + documents_buffer_ + offset_buffer_[index + 1])); + return returned_ptr; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/data_block.h b/src/Native/LdaNative/data_block.h new file mode 100644 index 0000000000..9f0894a858 --- /dev/null +++ b/src/Native/LdaNative/data_block.h @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include "light_hash_map.h" + +namespace lda +{ + class LDADocument; + class LDADataBlock + { + public: + explicit LDADataBlock(int32_t num_threads); + ~LDADataBlock(); + + void Clear(); + //in data feedin scenario + void Allocate(const int32_t num_document, const int64_t corpus_size); + //port the data from external process, e.g. 
c# + int AddDense(int32_t* term_freq, int32_t term_num); + int Add(int32_t* term_id, int32_t* term_freq, int32_t term_num); + std::shared_ptr GetOneDoc(int32_t index) const; + + inline int32_t num_documents() const; + // Return the first document for thread thread_id + inline int32_t Begin(int32_t thread_id) const; + // Return one past the last document for thread thread_id + inline int32_t End(int32_t thread_id) const; + + + private: + LDADataBlock(const LDADataBlock& other) = delete; + LDADataBlock& operator=(const LDADataBlock& other) = delete; + + int32_t num_threads_; + bool has_read_; // true if LDADataBlock holds allocated memory + + int32_t index_document_; + int64_t used_size_; + + int32_t num_documents_; + int64_t corpus_size_; + + int64_t* offset_buffer_; // offset_buffer_ size = num_document_ + 1 + int32_t* documents_buffer_; // documents_buffer_ size = corpus_size_; + }; + + inline int32_t LDADataBlock::num_documents() const + { + return num_documents_; + } + inline int32_t LDADataBlock::Begin(int32_t thread_id) const + { + int32_t num_of_one_doc = num_documents_ / num_threads_; + return thread_id * num_of_one_doc; + } + + inline int32_t LDADataBlock::End(int32_t thread_id) const + { + if (thread_id == num_threads_ - 1) // last thread + return num_documents_; + int32_t num_of_one_doc = num_documents_ / num_threads_; + return (thread_id + 1) * num_of_one_doc; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/hybrid_alias_map.cpp b/src/Native/LdaNative/hybrid_alias_map.cpp new file mode 100644 index 0000000000..fcbeee3806 --- /dev/null +++ b/src/Native/LdaNative/hybrid_alias_map.cpp @@ -0,0 +1,198 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information.
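LDADataBlock (data_block.h/.cpp above) keeps the whole corpus in one flat int32_t buffer: each document is written as a cursor slot followed by interleaved (word, topic) pairs, offset_buffer_ records where each document starts, and Begin/End hand out contiguous document ranges per thread, with the last thread absorbing the remainder. A small self-contained sketch of that layout and split, with hypothetical names and a toy corpus:

// Illustrative only: the flat document layout and thread split used by LDADataBlock.
// Each document is stored as [cursor, w0, t0, w1, t1, ...] inside one int32 buffer,
// and offsets[d] marks where document d starts.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // Two toy documents given as (term_id, term_freq) pairs, as FeedInData receives them.
    std::vector<std::vector<std::pair<int, int>>> docs = {
        {{3, 2}, {7, 1}},   // doc 0: word 3 twice, word 7 once
        {{1, 1}, {4, 3}}    // doc 1: word 1 once, word 4 three times
    };

    std::vector<int64_t> offsets{0};
    std::vector<int32_t> buffer;
    for (auto& d : docs) {
        buffer.push_back(0);                        // cursor slot for this document
        for (auto& tf : d)
            for (int j = 0; j < tf.second; ++j) {
                buffer.push_back(tf.first);         // word id
                buffer.push_back(0);                // topic, assigned during initialization
            }
        offsets.push_back((int64_t)buffer.size());  // end of this document
    }

    // Even document split across threads, mirroring Begin()/End().
    int num_threads = 2, num_docs = (int)docs.size();
    for (int t = 0; t < num_threads; ++t) {
        int per = num_docs / num_threads;
        int begin = t * per;
        int end = (t == num_threads - 1) ? num_docs : (t + 1) * per;
        printf("thread %d handles docs [%d, %d)\n", t, begin, end);
    }

    for (size_t d = 0; d + 1 < offsets.size(); ++d)
        printf("doc %zu spans buffer[%lld, %lld)\n", d,
               (long long)offsets[d], (long long)offsets[d + 1]);
    return 0;
}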
+ +#include +#include +#include +#include "utils.hpp" +#include +#include "hybrid_alias_map.h" + +namespace lda +{ + hybrid_alias_map::hybrid_alias_map() + :memory_(nullptr), + is_dense_(1), + kv_(nullptr), + idx_(nullptr), + capacity_(0), + size_(0), + mass_(0), + n_kw_mass_(0.0), + beta_mass_(0.0) + { + } + hybrid_alias_map::hybrid_alias_map(int32_t *memory, int32_t is_dense, int32_t capacity) + :memory_(memory), + is_dense_(is_dense), + capacity_(capacity), + kv_(nullptr), + idx_(nullptr), + size_(0), + mass_(0), + n_kw_mass_(0.0), + beta_mass_(0.0) + { + if (is_dense_) + { + kv_ = memory_; + idx_ = nullptr; + } + else + { + kv_ = memory_; + idx_ = memory_ + capacity_ * 2; + } + } + + hybrid_alias_map::hybrid_alias_map(const hybrid_alias_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + + this->kv_ = other.kv_; + this->idx_ = other.idx_; + this->height_ = other.height_; + this->size_ = other.size_; + + this->mass_ = other.mass_; + this->n_kw_mass_ = other.n_kw_mass_; + this->beta_mass_ = other.beta_mass_; + } + hybrid_alias_map& hybrid_alias_map::operator=(const hybrid_alias_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + + this->kv_ = other.kv_; + this->idx_ = other.idx_; + this->height_ = other.height_; + this->size_ = other.size_; + + this->mass_ = other.mass_; + this->n_kw_mass_ = other.n_kw_mass_; + this->beta_mass_ = other.beta_mass_; + + return *this; + } + + void hybrid_alias_map::clear() + { + size_ = 0; + } + + std::string hybrid_alias_map::DebugString() + { + std::string str = ""; + + if (size_ == 0) + { + return str; + } + + str += "is_dense:" + std::to_string(is_dense_) + " height:" + std::to_string(height_) + " mass:" + std::to_string(n_kw_mass_); + if (is_dense_) + { + for (int i = 0; i < capacity_; ++i) + { + str += " " + std::to_string(i) + ":" + std::to_string(*(memory_ + 2 * i)) + ":" + std::to_string(*(memory_ + 2 * i + 1)); + } + } + else + { + for (int i = 0; i < size_; ++i) + { + str += " " + std::to_string(idx_[i]) + ":" + std::to_string(*(memory_ + 2 * i)) + ":" + std::to_string(*(memory_ + 2 * i + 1)); + } + } + + return str; + } + + void hybrid_alias_map::build_table( + wood::AliasMultinomialRNGInt &alias_rng, + const hybrid_map &word_topic_row, + const std::vector &summary_row, + std::vector &q_w_proportion, + float beta, + float beta_sum, + int word_id, + wood::xorshift_rng &rng) + { + if (is_dense_) + { + size_ = capacity_; + mass_ = 0.0; + for (int k = 0; k < capacity_; ++k) + { + int32_t n_kw = word_topic_row[k]; + float prop = (n_kw + beta) / (summary_row[k] + beta_sum); + q_w_proportion[k] = prop; + mass_ += prop; + } + if (size_ == 0) + { + return; + } + alias_rng.SetProportionMass(q_w_proportion, mass_, memory_, &height_, rng); + + } + else + { + if (word_topic_row.is_dense()) + { + size_ = 0; + n_kw_mass_ = 0.0; + for (int k = 0; k < word_topic_row.capacity_; ++k) + { + if (word_topic_row.memory_[k] == 0) continue; + int32_t n_tw = word_topic_row.memory_[k]; + int64_t n_t = summary_row[k]; + q_w_proportion[size_] = n_tw / (n_t + beta_sum); + idx_[size_] = k; + n_kw_mass_ += q_w_proportion[size_]; + ++size_; + } + + if (size_ == 0) + { + // it is possible that, the local tf of a word is zero + return; + } + alias_rng.SetProportionMass(q_w_proportion, size_, n_kw_mass_, memory_, &height_, rng, word_id); + } + else + { + size_ = 0; + n_kw_mass_ = 0.0; + int32_t row_capacity = word_topic_row.capacity_; + 
for (int k = 0; k < row_capacity; ++k) + { + int32_t key = word_topic_row.key_[k]; + if (key > 0) + { + int32_t n_kw = word_topic_row.value_[k]; + float prop = n_kw / (summary_row[key - 1] + beta_sum); + + + + q_w_proportion[size_] = prop; + idx_[size_] = word_topic_row.key_[k] - 1; // minus one from the the internal key + n_kw_mass_ += prop; + + ++size_; + } + } + if (size_ == 0) + { + // it is possible that, the local tf of a word is zero + return; + } + alias_rng.SetProportionMass(q_w_proportion, size_, n_kw_mass_, memory_, &height_, rng, word_id); + } + } + } +} diff --git a/src/Native/LdaNative/hybrid_alias_map.h b/src/Native/LdaNative/hybrid_alias_map.h new file mode 100644 index 0000000000..f62b1e33af --- /dev/null +++ b/src/Native/LdaNative/hybrid_alias_map.h @@ -0,0 +1,128 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include "alias_multinomial_rng_int.hpp" +#include "hybrid_map.h" + +namespace lda +{ + class hybrid_alias_map + { + public: + + hybrid_alias_map(); + hybrid_alias_map(int32_t *memory, int32_t is_dense, int32_t capacity); + hybrid_alias_map(const hybrid_alias_map &other); + hybrid_alias_map& operator=(const hybrid_alias_map &other); + + void clear(); + inline int32_t size() const; + + std::string DebugString(); + void build_table( + wood::AliasMultinomialRNGInt &alias_rng, + const hybrid_map &word_topic_row, + const std::vector &summary_row, + std::vector &q_w_proportion, + float beta, + float beta_sum, + int word_id, + wood::xorshift_rng &rng); + + inline int32_t next(wood::xorshift_rng &rng, int32_t beta_height, float beta_mass, std::vector &beta_k_v, bool debug); + + private: + int32_t *memory_; + int32_t is_dense_; + int32_t *kv_; + int32_t *idx_; + int32_t height_; + int32_t capacity_; + int32_t size_; + + float mass_; + float n_kw_mass_; + float beta_mass_; + }; + + inline int32_t hybrid_alias_map::size() const + { + return size_; + } + + inline int32_t hybrid_alias_map::next(wood::xorshift_rng &rng, int32_t beta_height, float beta_mass, std::vector &beta_k_v, bool debug) + { + //NOTE: here we will set those unseen words' topic to 0. logicall we could set it to random as well. 
+ if (capacity_ == 0) + { + return 0; + } + + if (is_dense_) + { + auto sample = rng.rand(); + int idx = sample / height_; + if (idx >= size_) + { + idx = size_ - 1; + } + + int32_t *p = memory_ + 2 * idx; + int32_t k = *p; + p++; + int32_t v = *p; + int32_t m = -(sample < v); + return (idx & m) | (k & ~m); + } + else + { + float sample = rng.rand_real() * (n_kw_mass_ + beta_mass); + if (sample < n_kw_mass_) + { + auto n_kw_sample = rng.rand(); + int32_t idx = n_kw_sample / height_; + + if (idx >= size_) + { + idx = size_ - 1; + } + + + int32_t *p = memory_ + 2 * idx; + int32_t k = *p; p++; + int32_t v = *p; + int32_t id = idx_[idx]; + int32_t k_id = idx_[k]; + + int32_t m = -(n_kw_sample < v); + return (id & m) | (k_id & ~m); + + } + else + { + auto sampleLocal = rng.rand(); + int idx = sampleLocal / beta_height; + int beta_size = (int)beta_k_v.size(); + + if (idx >= beta_size) + { + idx = beta_size - 1; + } + + int32_t k = beta_k_v[idx].k_; + int32_t v = beta_k_v[idx].v_; + int32_t m = -(sampleLocal < v); + return (idx & m) | (k & ~m); + } + } + } + +} \ No newline at end of file diff --git a/src/Native/LdaNative/hybrid_map.cpp b/src/Native/LdaNative/hybrid_map.cpp new file mode 100644 index 0000000000..e5c8252702 --- /dev/null +++ b/src/Native/LdaNative/hybrid_map.cpp @@ -0,0 +1,142 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include +#include +#include "utils.hpp" +#include +#include "hybrid_map.h" + +namespace lda +{ + hybrid_map::hybrid_map() + :memory_(nullptr), + is_dense_(1), + capacity_(0), + empty_key_(0), + deleted_key_(-1), + key_(nullptr), + value_(nullptr), + num_deleted_key_(0), + external_rehash_buf_(nullptr) + { + } + hybrid_map::hybrid_map(int32_t *memory, int32_t is_dense, int32_t capacity, int32_t num_deleted_key + , int32_t *external_rehash_buf_) + : memory_(memory), + is_dense_(is_dense), + capacity_(capacity), + empty_key_(0), + deleted_key_(-1), + key_(nullptr), + value_(nullptr), + num_deleted_key_(num_deleted_key), + external_rehash_buf_(external_rehash_buf_) + { + if (is_dense_ == 0) { + key_ = memory_; + value_ = memory_ + capacity_; + } + } + + hybrid_map::hybrid_map(const hybrid_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + empty_key_ = other.empty_key_; + deleted_key_ = other.deleted_key_; + num_deleted_key_ = other.num_deleted_key_; + external_rehash_buf_ = other.external_rehash_buf_; + if (this->is_dense_) + { + this->key_ = nullptr; + this->value_ = nullptr; + } + else + { + this->key_ = this->memory_; + this->value_ = this->memory_ + capacity_; + } + + } + hybrid_map& hybrid_map::operator=(const hybrid_map &other) + { + this->memory_ = other.memory_; + this->is_dense_ = other.is_dense_; + this->capacity_ = other.capacity_; + empty_key_ = other.empty_key_; + deleted_key_ = other.deleted_key_; + num_deleted_key_ = other.num_deleted_key_; + external_rehash_buf_ = other.external_rehash_buf_; + if (this->is_dense_) + { + this->key_ = nullptr; + this->value_ = nullptr; + } + else + { + this->key_ = this->memory_; + this->value_ = this->memory_ + capacity_; + } + return *this; + } + + void hybrid_map::clear() + { + int32_t memory_size = is_dense_ ? 
capacity_ : 2 * capacity_; + memset(memory_, 0, memory_size * sizeof(int32_t)); + } + + std::string hybrid_map::DumpString() const + { + if (is_dense_) + { + std::string result; + for (int i = 0; i < capacity_; ++i) + { + if (memory_[i] != 0) + { + result += std::to_string(i) + ":" + std::to_string(memory_[i]) + " "; + } + } + return result; + } + else + { + std::string result; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + result += std::to_string(key_[i] - 1) + ":" + std::to_string(value_[i]) + " "; + } + } + return result; + } + } + + void hybrid_map::sorted_rehashing() + { + if (!is_dense_) + { + std::map rehash_buffer; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + rehash_buffer[key_[i] - 1] = value_[i]; + } + } + memset(memory_, 0, 2 * capacity_ * sizeof(int32_t)); + for (auto it = rehash_buffer.begin(); + it != rehash_buffer.end(); ++it) + { + inc(it->first, it->second); + } + } + } + +} diff --git a/src/Native/LdaNative/hybrid_map.h b/src/Native/LdaNative/hybrid_map.h new file mode 100644 index 0000000000..88bbc82d5b --- /dev/null +++ b/src/Native/LdaNative/hybrid_map.h @@ -0,0 +1,238 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// The probing method: +// Linear probing +// #define JUMP_(key, num_probes) ( 1 ) + +// Quadratic probing +#define JUMP_(key, num_probes) ( num_probes ) +#define ILLEGAL_BUCKET -1 + +namespace lda +{ + class hybrid_alias_map; + + class hybrid_map + { + friend class hybrid_alias_map; + public: + hybrid_map(); + hybrid_map(int32_t *memory, int32_t is_dense, int32_t capacity, int32_t num_deleted_key + , int32_t *external_rehash_buf_); + hybrid_map(const hybrid_map &other); + hybrid_map& operator=(const hybrid_map &other); + + + void clear(); + std::string DumpString() const; + void sorted_rehashing(); + + inline int32_t nonzero_num() const; + inline bool is_dense() const; + inline int32_t capacity() const; + inline int32_t *memory() const; + inline int32_t* key() const; + inline int32_t* value() const; + inline void rehashing(); + inline void inc(int32_t key, int32_t delta); + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t operator[](int32_t key) const; + + private: + inline std::pair find_position(const int32_t key) const; + + int32_t *memory_; + int32_t is_dense_; + int32_t *key_; + int32_t *value_; + + // if |is_dense_| == true, capactiy_ is the length of an array + // if |is dense_| == false, capacity_ is the size of a light hash table + int32_t capacity_; + int32_t empty_key_; + int32_t deleted_key_; + + int32_t num_deleted_key_; + int32_t* external_rehash_buf_; + }; + + inline int32_t hybrid_map::nonzero_num() const + { + if (is_dense_) + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (memory_[i] > 0) + { + ++size; + } + } + return size; + } + else + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + ++size; + } + } + return size; + } + } + + inline bool hybrid_map::is_dense() const + { + return is_dense_ != 0; + } + + inline int32_t hybrid_map::capacity() const + { + return capacity_; + } + + inline int32_t* hybrid_map::memory() const + { + return memory_; + } + inline int32_t* hybrid_map::key() const + { + 
return key_; + } + inline int32_t* hybrid_map::value() const + { + return value_; + } + inline void hybrid_map::rehashing() + { + if (!is_dense_) + { + memcpy(external_rehash_buf_, memory_, 2 * capacity_ * sizeof(int32_t)); + int32_t *key = external_rehash_buf_; + int32_t *value = external_rehash_buf_ + capacity_; + memset(memory_, 0, 2 * capacity_ * sizeof(int32_t)); + for (int i = 0; i < capacity_; ++i) + { + if (key[i] > 0) + { + inc(key[i] - 1, value[i]); + } + } + num_deleted_key_ = 0; + } + } + inline void hybrid_map::inc(int32_t key, int32_t delta) + { + if (is_dense_) + { + memory_[key] += delta; + } + else + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + value_[pos.first] += delta; + if (value_[pos.first] == 0) // the value becomes zero, delete the key + { + key_[pos.first] = deleted_key_; + + ++num_deleted_key_; // num_deleted_key ++ + if (num_deleted_key_ * 20 > capacity_) + { + rehashing(); + } + } + } + else // not found the key, insert it with delta as value + { + key_[pos.second] = internal_key; + value_[pos.second] = delta; + } + } + } + + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t hybrid_map::operator[](int32_t key) const + { + if (is_dense_) + { + //return memory_[key]; + if (capacity_ > 0) + { + return memory_[key]; + } + else + { + return 0; + } + } + else + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + return value_[pos.first]; + } + else + { + return 0; + } + } + } + inline std::pair hybrid_map::find_position(const int32_t key) const + { + int num_probes = 0; + int32_t capacity_minus_one = capacity_ - 1; + int32_t idx = key % capacity_; + int32_t insert_pos = ILLEGAL_BUCKET; + while (1) // probe until something happens + { + if (key_[idx] == empty_key_) // bucket is empty + { + if (insert_pos == ILLEGAL_BUCKET) // found no prior place to insert + { + return std::pair(ILLEGAL_BUCKET, idx); + } + else // previously, there is a position to insert + { + return std::pair(ILLEGAL_BUCKET, insert_pos); + } + } + else if (key_[idx] == deleted_key_) // keep searching, but makr to insert + { + if (insert_pos == ILLEGAL_BUCKET) + { + insert_pos = idx; + } + } + else if (key_[idx] == key) + { + return std::pair(idx, ILLEGAL_BUCKET); + } + ++num_probes; // we are doing another probe + idx = (idx + JUMP_(key, num_probes) & capacity_minus_one); + assert(num_probes < capacity_); // && "Hashtable is full: an error in key_equal<> or hash<>"); + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_document.cpp b/src/Native/LdaNative/lda_document.cpp new file mode 100644 index 0000000000..c2a3371020 --- /dev/null +++ b/src/Native/LdaNative/lda_document.cpp @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
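The sparse side of hybrid_map (hybrid_map.h above) is a small open-addressing table: keys are stored shifted by one so 0 can serve as the empty marker and -1 as the deleted marker, probing is quadratic (the i-th probe advances by i, as JUMP_ defines), and the probe index wraps with & (capacity_ - 1), which appears to assume a power-of-two capacity. Below is a stripped-down sketch of find_position/inc under those assumptions; deleted-key bookkeeping and rehashing are omitted and the names are hypothetical.

// Illustrative only: the quadratic-probing scheme hybrid_map uses for sparse rows.
// Keys are shifted by +1 so 0 can mean "empty" and -1 "deleted"; capacity is
// assumed to be a power of two so `& (cap - 1)` wraps the probe index.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct TinyHash {
    static const int32_t kEmpty = 0, kDeleted = -1;
    std::vector<int32_t> key, value;
    int32_t cap;

    explicit TinyHash(int32_t capacity) : key(capacity, 0), value(capacity, 0), cap(capacity) {
        assert((cap & (cap - 1)) == 0);             // power-of-two capacity
    }

    // Returns (slot holding the key, slot to insert at); exactly one is valid (-1 means "none").
    std::pair<int32_t, int32_t> Find(int32_t k) const {
        int32_t idx = k % cap, probes = 0, insert_pos = -1;
        while (true) {
            if (key[idx] == kEmpty)
                return {-1, insert_pos == -1 ? idx : insert_pos};
            if (key[idx] == kDeleted && insert_pos == -1)
                insert_pos = idx;                   // remember the hole, keep probing
            else if (key[idx] == k)
                return {idx, -1};
            ++probes;
            idx = (idx + probes) & (cap - 1);       // quadratic probing
            assert(probes < cap);                   // table must not be full
        }
    }

    void Inc(int32_t external_key, int32_t delta) {
        int32_t k = external_key + 1;               // shift so 0 stays "empty"
        auto pos = Find(k);
        if (pos.first != -1) value[pos.first] += delta;
        else { key[pos.second] = k; value[pos.second] = delta; }
    }

    int32_t Get(int32_t external_key) const {
        auto pos = Find(external_key + 1);
        return pos.first != -1 ? value[pos.first] : 0;
    }
};

int main() {
    TinyHash h(8);
    h.Inc(5, 2); h.Inc(13, 1); h.Inc(5, 3);         // 5 and 13 collide modulo 8
    printf("count[5]=%d count[13]=%d count[2]=%d\n", h.Get(5), h.Get(13), h.Get(2));
    return 0;
}

The real inc additionally marks a slot deleted when its count reaches zero and triggers rehashing once deleted keys exceed roughly 5% of capacity, which keeps probe chains short during training.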
+ +#include "lda_document.h" + +namespace lda +{ + LDADocument::LDADocument(int32_t* memory_begin, int32_t* memory_end) : + memory_begin_(memory_begin), memory_end_(memory_end), cursor_(*memory_begin) {} + + // should be called when sweeped over all the tokens in a document + void LDADocument::ResetCursor() + { + cursor_ = 0; + } + void LDADocument::GetDocTopicCounter(lda::light_hash_map& doc_topic_counter) + { + int32_t* p = memory_begin_ + 2; + int32_t num = 0; + while (p < memory_end_) + { + doc_topic_counter.inc(*p, 1); + ++p; ++p; + if (++num == 512) + return; + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_document.h b/src/Native/LdaNative/lda_document.h new file mode 100644 index 0000000000..45df42f06c --- /dev/null +++ b/src/Native/LdaNative/lda_document.h @@ -0,0 +1,60 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include "light_hash_map.h" + +namespace lda +{ + class LDADocument + { + public: + const int32_t kMaxSizeLightHash = 512; // This is for the easy use of LightHashMap + + LDADocument(int32_t* memory_begin, int32_t* memory_end); + + inline int32_t size() const; + inline int32_t& get_cursor(); + inline int32_t Word(int32_t index) const; + inline int32_t Topic(int32_t index) const; + inline void SetTopic(int32_t index, int32_t topic); + + // should be called when sweeped over all the tokens in a document + void ResetCursor(); + void GetDocTopicCounter(lda::light_hash_map& doc_topic_counter); + + private: + LDADocument(const LDADocument &other) = delete; + LDADocument& operator=(const LDADocument &other) = delete; + + int32_t* memory_begin_; + int32_t* memory_end_; + int32_t& cursor_; // cursor_ is reference of *memory_begin_ + }; + + inline int32_t LDADocument::size() const + { + return (std::min)(static_cast((memory_end_ - memory_begin_) / 2), kMaxSizeLightHash); + } + inline int32_t& LDADocument::get_cursor() + { + return cursor_; + } + inline int32_t LDADocument::Word(int32_t index) const + { + return *(memory_begin_ + 1 + index * 2); + } + inline int32_t LDADocument::Topic(int32_t index) const + { + return *(memory_begin_ + 2 + index * 2); + } + inline void LDADocument::SetTopic(int32_t index, int32_t topic) + { + *(memory_begin_ + 2 + index * 2) = topic; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/lda_engine.cpp b/src/Native/LdaNative/lda_engine.cpp new file mode 100644 index 0000000000..5650ce73b5 --- /dev/null +++ b/src/Native/LdaNative/lda_engine.cpp @@ -0,0 +1,1066 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "timer.h" +#include "rand_int_rng.h" +#include "lda_document.h" +#include "data_block.h" +#include "model_block.h" +#include "lda_engine.hpp" +#include "utils.hpp" +#include "simple_barrier.h" +#include "light_doc_sampler.hpp" + +#ifdef _MSC_VER +#include "windows.h" +#elif defined(__APPLE__) +#include +#include +#include +#else +#include "sched.h" +#endif + +namespace lda { + LdaEngine::LdaEngine(int numTopic, + int numVocab, + float alphaSum, + float beta, + int numIter, + int likelihoodInterval, + int numThread, + int mhstep, + int maxDocToken) + : K_(numTopic), + V_(numVocab), + compute_ll_interval_(likelihoodInterval), + beta_(beta), + num_iterations_(numIter), + mh_step_(mhstep), + alpha_sum_(alphaSum), + maxDocToken_(maxDocToken), + samplers_(nullptr), + document_buffer_(nullptr) + { + if (numThread > 0) + { + num_threads_ = numThread; + } + else + { + unsigned int uNumCPU = std::thread::hardware_concurrency(); + num_threads_ = std::max(1, (int)(uNumCPU - 2)); + } + printf("using %d thread(s) to do train/test\n", num_threads_); + + bAlphaSumMultiplied = false; + atomic_stats_ = new LDAEngineAtomics(); + model_block_ = new LDAModelBlock(); + data_block_ = new LDADataBlock(num_threads_); + process_barrier_ = new SimpleBarrier(num_threads_); + samplerQueue_ = new CBlockedIntQueue(); + + document_buffer_ = new int32_t*[num_threads_]; + for (int i = 0; i < num_threads_; i++) + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + likelihood_in_iter_ = nullptr; + + beta_sum_ = beta_ * V_; + } + + LdaEngine::LdaEngine(int32_t K, int32_t V, int32_t num_threads, int32_t compute_ll_interval, float beta, int32_t num_iterations, int32_t mh_step, float alpha_sum, int maxDocToken) + : K_(K), + V_(V), + compute_ll_interval_(compute_ll_interval), + beta_(beta), + num_iterations_(num_iterations), + mh_step_(mh_step), + alpha_sum_(alpha_sum), + maxDocToken_(maxDocToken), + samplers_(nullptr), + document_buffer_(nullptr) + { + if (num_threads > 0) + { + num_threads_ = num_threads; + } + else + { + unsigned int uNumCPU = std::thread::hardware_concurrency(); + num_threads_ = std::max(1, (int)(uNumCPU - 2)); + } + bAlphaSumMultiplied = false; + process_barrier_ = new SimpleBarrier(num_threads_); + atomic_stats_ = new LDAEngineAtomics(); + data_block_ = new LDADataBlock(num_threads_); + model_block_ = new LDAModelBlock(); + samplerQueue_ = new CBlockedIntQueue(); + + document_buffer_ = new int32_t*[num_threads_]; + for (int i = 0; i < num_threads_; i++) + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + likelihood_in_iter_ = nullptr; + beta_sum_ = beta_ * V_; + } + + + LdaEngine::~LdaEngine() + { + //delete memory space + delete process_barrier_; + process_barrier_ = nullptr; + + delete data_block_; + data_block_ = nullptr; + + delete atomic_stats_; + atomic_stats_ = nullptr; + + delete model_block_; + model_block_ = nullptr; + + delete samplerQueue_; + samplerQueue_ = nullptr; + + for (int i = 0; i < num_threads_; ++i) + { + delete samplers_[i]; + } + delete[] samplers_; + + if (document_buffer_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete[]document_buffer_[i]; + document_buffer_[i] = nullptr; + } + delete[]document_buffer_; + document_buffer_ = nullptr; + } + + if (likelihood_in_iter_) + { + delete[] likelihood_in_iter_; + likelihood_in_iter_ = nullptr; + } + } + + bool LdaEngine::InitializeBeforeTrain() + { + CTimer tmDebug(true); + 
CheckFunction(0, tmDebug, "enter initializeBeforeTrain", false); + //allocate model memory from the data preloaded + AllocateModelMemory(data_block_); + CheckFunction(0, tmDebug, "allocate model memory", false); + + double alloc_start = lda::get_time(); + global_word_topic_table_.resize(V_); + alias_rng_int_.Init(K_); + beta_k_v_.resize(K_); + global_alias_k_v_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_alias_k_v_[i] = model_block_->get_alias_row(i); + } + global_summary_row_.resize(K_); + CheckFunction(0, tmDebug, "initlaizing global tables used in sampling", false); + + word_range_for_each_thread_.resize(num_threads_ + 1); + int32_t word_num_each_thread = V_ / num_threads_; + word_range_for_each_thread_[0] = 0; + for (int32_t i = 0; i < num_threads_ - 1; ++i) + { + word_range_for_each_thread_[i + 1] = word_range_for_each_thread_[i] + word_num_each_thread; + } + word_range_for_each_thread_[num_threads_] = V_; + + //setup sampler + samplers_ = new LightDocSampler*[num_threads_]; + samplerQueue_->clear(); + + for (int i = 0; i < num_threads_; ++i) + { + samplers_[i] = new LightDocSampler( + K_, + V_, + num_threads_, + mh_step_, + beta_, + alpha_sum_, + global_word_topic_table_, + global_summary_row_, + global_alias_k_v_, + beta_height_, + beta_mass_, + beta_k_v_); + + samplerQueue_->push(i); + } + CheckFunction(0, tmDebug, "create samplers", false); + return true; + } + + void LdaEngine::InitializeBeforeTest() + { + // TODO: + // Allocating space for word-topic-table and alias table according to the input data of SetModel interface (done) + // Create multiple thread-specific sampler + // set word_range_for_each_thread_ + // Adjust the alpha_sum_ parameter for each thread-specific sampler + CTimer tmDebug(true); + CheckFunction(0, tmDebug, "enter initializeBeforeTest", false); + + global_word_topic_table_.resize(V_); + alias_rng_int_.Init(K_); + beta_k_v_.resize(K_); + global_alias_k_v_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_alias_k_v_[i] = model_block_->get_alias_row(i); + } + CheckFunction(0, tmDebug, "initlaizing global tables used in sampling", false); + + // Set the word range for each thread + word_range_for_each_thread_.resize(num_threads_ + 1); + int32_t word_num_each_thread = V_ / num_threads_; + word_range_for_each_thread_[0] = 0; + for (int32_t i = 0; i < num_threads_ - 1; ++i) + { + word_range_for_each_thread_[i + 1] = word_range_for_each_thread_[i] + word_num_each_thread; + } + word_range_for_each_thread_[num_threads_] = V_; + + //setup sampler + if (samplers_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete samplers_[i]; + } + delete[] samplers_; + } + if (document_buffer_) + { + for (int i = 0; i < num_threads_; ++i) + { + delete[]document_buffer_[i]; + document_buffer_[i] = nullptr; + } + delete[]document_buffer_; + document_buffer_ = nullptr; + } + + samplers_ = new LightDocSampler*[num_threads_]; + document_buffer_ = new int32_t*[num_threads_]; + samplerQueue_->clear(); + + for (int i = 0; i < num_threads_; ++i) + { + samplers_[i] = new LightDocSampler( + K_, + V_, + num_threads_, + mh_step_, + beta_, + alpha_sum_, + global_word_topic_table_, + global_summary_row_, + global_alias_k_v_, + beta_height_, + beta_mass_, + beta_k_v_); + + samplers_[i]->AdaptAlphaSum(false); + document_buffer_[i] = new int32_t[maxDocToken_ * 2 + 1]; + + samplerQueue_->push(i); + } + CheckFunction(0, tmDebug, "create samplers", false); + + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + 
beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, samplers_[0]->rng()); + + // build alias table for the sparse term + for (int thread_id = 0; thread_id < num_threads_; ++thread_id) + { + LightDocSampler &sampler = *(samplers_[thread_id]); + sampler.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + } + CheckFunction(0, tmDebug, "build alisa table", false); + } + + void LdaEngine::Train(const char* pTrainOutput) + { + std::vector threads(num_threads_); + atomic_stats_->thread_counter_ = 0; + + for (auto& thr : threads) { + thr = std::thread(&LdaEngine::Training_Thread, this); + } + + printf("started training with %d threads\n", num_threads_); + for (auto& thr : threads) { + thr.join(); + } + + if (pTrainOutput) + { + DumpDocTopicTable(pTrainOutput); + } + } + + void LdaEngine::Test(int32_t burnin_iter, float* pLoglikelihood) + { + std::vector threads(num_threads_); + atomic_stats_->thread_counter_ = 0; + burnin_iterations_ = burnin_iter; + + likelihood_in_iter_ = new float[burnin_iterations_]; + for (int i = 0; i < burnin_iterations_; i++) + { + likelihood_in_iter_[i] = 0.0; + } + + for (auto& thr : threads) { + thr = std::thread(&LdaEngine::Testing_Thread, this); + } + + printf("started testing with %d threads\n", num_threads_); + + for (auto& thr : threads) { + thr.join(); + } + + //get the loglikelihood of each burn in iteration + for (int i = 0; i < burnin_iterations_; i++) + { + pLoglikelihood[i] = likelihood_in_iter_[i]; //just set an arbitary value here for later update + } + } + + void LdaEngine::CheckFunction(int thread_id, CTimer &tmDebug, const char* msg, bool waitBarrier) + { + } + + void LdaEngine::Training_Thread() + { + CTimer tmDebug(true); + + int thread_id = atomic_stats_->thread_counter_++; + std::vector> llcontainer; + // Set core affinity which helps performance improvement +#ifdef _MSC_VER + long long maskLL = 0; + maskLL |= (1LL << (thread_id)); + DWORD_PTR mask = maskLL; + SetThreadAffinityMask(GetCurrentThread(), mask); +#elif defined(__APPLE__) + thread_port_t thread = pthread_mach_thread_np(pthread_self()); + thread_affinity_policy_data_t policy = { thread_id }; + thread_policy_set(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +#else + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(thread_id, &set); + sched_setaffinity(0, sizeof(cpu_set_t), &set); +#endif + + // Each thread builds a portion of word-topic table. 
We do this way because each word-topic row + // has a thread-specific buffer for rehashing + process_barrier_->wait(); + LightDocSampler &sampler_ = *(samplers_[thread_id]); + sampler_.AdaptAlphaSum(true); + + sampler_.build_word_topic_table(thread_id, num_threads_, *model_block_); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize word_topic_table for sampler - in function train_thread"); + + int32_t token_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + int doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int topic = sampler_.rand_k(); + doc->SetTopic(i, topic); + } + int cursor = doc->get_cursor(); + token_num += sampler_.GlobalInit(doc.get()); + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize token topic before iterations - in function train_thread"); + + for (int i = 0; i < num_threads_; ++i) + { + std::vector& wtd_vec = samplers_[i]->get_word_topic_delta(thread_id); + for (auto& wtd : wtd_vec) + { + global_word_topic_table_[wtd.word].inc(wtd.topic, wtd.delta); + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "intialize word topic model before iterations - in function train_thread"); + + // use thread-private delta table to get global table + { + std::lock_guard lock(atomic_stats_->global_mutex_); + + std::vector &summary = sampler_.get_delta_summary_row(); + for (int i = 0; i < K_; ++i) + { + global_summary_row_[i] += summary[i]; + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "global summary & Complete setup train before iterations - in function train_thread"); + + for (int iter = 0; iter < num_iterations_; ++iter) + { + CheckFunction(thread_id, tmDebug, "----------------------iteration start - in function train_thread---------------------"); + int32_t token_sweeped = 0; + atomic_stats_->num_tokens_clock_ = 0; + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + if (thread_id == 0) + { + beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, sampler_.rng()); + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "built alias table dense - in function train_thread"); + + // build alias table for the sparse term + sampler_.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "built alias table sparse - in function train_thread"); + + sampler_.EpocInit(); + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "EpochInit - in function train_thread"); + + // main part of the training - sampling over documents in this iteration + double iter_start = lda::get_time(); + int32_t doc_start_local = data_block_->Begin(thread_id); + int32_t doc_end_local = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start_local; doc_index != doc_end_local; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + token_sweeped += sampler_.SampleOneDoc(doc.get()); + } + atomic_stats_->num_tokens_clock_ += token_sweeped; + + process_barrier_->wait(); + 
double iter_end = lda::get_time(); + + if (thread_id == 0) + { + double seconds_this_iter = iter_end - iter_start; + + printf("Iter: %04d", iter); + std::cout + << "\tThread = " << thread_id + << "\tTokens: " << atomic_stats_->num_tokens_clock_ + << "\tTook: " << seconds_this_iter << " sec" + << "\tThroughput: " + << static_cast(atomic_stats_->num_tokens_clock_) / (seconds_this_iter) << " token/(thread*sec)" + << std::endl; + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "train(gibbs sampling) - in function train_thread"); + + // syncup global table + double sync_start = lda::get_time(); + for (int i = 0; i < num_threads_; ++i) + { + std::vector & wtd_vec = samplers_[i]->get_word_topic_delta(thread_id); + for (auto& wtd : wtd_vec) + { + global_word_topic_table_[wtd.word].inc(wtd.topic, wtd.delta); + } + } + + // use thread-private delta table to update global table + { + std::lock_guard lock(atomic_stats_->global_mutex_); + std::vector &summary = sampler_.get_delta_summary_row(); + for (int i = 0; i < K_; ++i) + { + global_summary_row_[i] += summary[i]; + } + } + process_barrier_->wait(); + CheckFunction(thread_id, tmDebug, "syncup global word_topic table - in function train_thread"); + + if (compute_ll_interval_ != -1 && (iter % compute_ll_interval_ == 0 || iter == num_iterations_ - 1)) + { + double ll = EvalLogLikelihood(true, thread_id, iter, sampler_); + llcontainer.push_back(std::pair(iter, ll)); + } + + CheckFunction(thread_id, tmDebug, "----------------------iteration end - in function train_thread---------------------"); + } + + if (thread_id == 0) + { + //output the ll once + for (int i = 0; i < llcontainer.size(); i++) + { + printf("loglikelihood @iter%04d = %f\n", llcontainer[i].first, llcontainer[i].second); + } + } + + process_barrier_->wait(); + + snprintf(tmDebug.m_szMessage, 200, "thread_id = %d, training iterations", thread_id); + tmDebug.InnerTag(); + } + + void LdaEngine::Testing_Thread() + { + int thread_id = atomic_stats_->thread_counter_++; + + // Set core affinity which helps performance improvement +#ifdef _MSC_VER + long long maskLL = 0; + maskLL |= (1LL << (thread_id)); + DWORD_PTR mask = maskLL; + SetThreadAffinityMask(GetCurrentThread(), mask); +#elif defined(__APPLE__) + thread_port_t thread = pthread_mach_thread_np(pthread_self()); + thread_affinity_policy_data_t policy = { thread_id }; + thread_policy_set(thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +#else + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(thread_id, &set); + sched_setaffinity(0, sizeof(cpu_set_t), &set); +#endif + process_barrier_->wait(); + + //// Each thread builds a portion of word-topic table. 
We do this way because each word-topic row + //// has a thread-specific buffer for rehashing + LightDocSampler &sampler_ = *(samplers_[thread_id]); + sampler_.AdaptAlphaSum(false); + + double init_start = lda::get_time(); + int32_t token_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + int doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int topic = sampler_.rand_k(); + doc->SetTopic(i, topic); + } + int cursor = doc->get_cursor(); + token_num += sampler_.GlobalInit(doc.get()); + } + + process_barrier_->wait(); + + // build alias table + // build alias table for the dense term, beta_k_v_, which is shared by all the words + if (thread_id == 0) + { + beta_mass_ = 0; + std::vector proportion(K_); + for (int k = 0; k < K_; ++k) + { + proportion[k] = beta_ / (global_summary_row_[k] + beta_sum_); + beta_mass_ += proportion[k]; + } + + alias_rng_int_.SetProportionMass(proportion, beta_mass_, beta_k_v_, &beta_height_, sampler_.rng()); + } + + // build alias table for the sparse term + double alias_start = lda::get_time(); + process_barrier_->wait(); + sampler_.build_alias_table(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1], thread_id); + process_barrier_->wait(); + + // print the log-likelihood before inference + EvalLogLikelihood(true, thread_id, 0, sampler_); + + double total_start = lda::get_time(); + for (int iter = 0; iter < burnin_iterations_; ++iter) + { + double iter_start = lda::get_time(); + int32_t token_sweeped = 0; + atomic_stats_->num_tokens_clock_ = 0; + int32_t doc_start_local = data_block_->Begin(thread_id); + int32_t doc_end_local = data_block_->End(thread_id); + + for (int32_t doc_index = doc_start_local; doc_index != doc_end_local; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + token_sweeped += sampler_.InferOneDoc(doc.get()); + } + atomic_stats_->num_tokens_clock_ += token_sweeped; + + process_barrier_->wait(); + double iter_end = lda::get_time(); + + if (thread_id == 0) + { + double seconds_this_iter = iter_end - iter_start; + + printf("Iter: %04d", iter); + std::cout + << "\tThread = " << thread_id + << "\tTokens: " << atomic_stats_->num_tokens_clock_ + << "\tTook: " << seconds_this_iter << " sec" + << "\tThroughput: " + << static_cast(atomic_stats_->num_tokens_clock_) / (seconds_this_iter) << " token/(thread*sec)" + << std::endl; + + } + + process_barrier_->wait(); + + if (compute_ll_interval_ != -1 && (iter % compute_ll_interval_ == 0 || iter == burnin_iterations_ - 1)) + { + EvalLogLikelihood(false, thread_id, iter, sampler_); + } + } + + double total_end = lda::get_time(); + printf("thread_id = %d, Total time for burnin iterations : %f sec.\n", thread_id, total_end - total_start); + } + + void LdaEngine::AllocateDataMemory(int num_document, int64_t corpus_size) + { + data_block_->Allocate(num_document, corpus_size); + } + + void LdaEngine::AllocateModelMemory(const LDADataBlock* data_block) + { + model_block_->InitFromDataBlock(data_block, V_, K_); + + global_word_topic_table_.resize(V_); + + for (int i = 0; i < V_; ++i) + { + global_word_topic_table_[i] = model_block_->get_row(i, nullptr); + } + } + + void LdaEngine::AllocateModelMemory(int num_vocabs, int num_topics, int64_t nonzero_num) + { + model_block_->Init(num_vocabs, num_topics, nonzero_num); + + 
global_word_topic_table_.resize(num_vocabs); + + for (int i = 0; i < num_vocabs; ++i) + { + global_word_topic_table_[i] = model_block_->get_row(i, nullptr); + } + } + + void LdaEngine::AllocateModelMemory(int num_vocabs, int num_topics, int64_t mem_block_size, int64_t alias_mem_block_size) + { + model_block_->Init(num_vocabs, num_topics, mem_block_size, alias_mem_block_size); //memory allocated here + + global_word_topic_table_.resize(num_vocabs); + global_summary_row_.resize(K_, 0); + + //each value inside the global_word_topic_table_ will be set while call SetWordTopic() + } + + int LdaEngine::FeedInData(int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size) + { + if (V_ == 0) //number vocab could be set in allocating model memory function + V_ = vocab_size; + + //data_block represent for one doc + return data_block_->Add(term_id, term_freq, term_num); + } + + int LdaEngine::FeedInDataDense(int* term_freq, int32_t term_num, int32_t vocab_size) + { + if (V_ == 0) //number vocab could be set in allocating model memory function + V_ = vocab_size; + + //data_block represent for one doc + return data_block_->AddDense(term_freq, term_num); + } + + void LdaEngine::TestOneDoc(int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + //numTopicsMax initialy holds the max returned topic number in order to hold the pTopic/pProbs memory in outside function + //when data return, numTopicsMax should contains the real topic number returned. + int sampler_id = 0; + sampler_id = samplerQueue_->pop(); + + LightDocSampler &sampler = *(samplers_[sampler_id]); + int64_t data_length = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + data_length += 2; + } + } + + assert(data_length <= maxDocToken_ * 2 + 1); + + if (reset) + { + // restart the rng seeds, so that we always get consistent result for the same input + rng_.restart(); + sampler.rng_restart(); + } + + // NOTE: in multi-threaded implementation, the dynamic memory allocation + // may cause contention at OS heap lock + int64_t idx = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + document_buffer_[sampler_id][idx++] = term_id[i]; + document_buffer_[sampler_id][idx++] = rng_.rand_k(K_); + } + } + + std::shared_ptr doc(new LDADocument(document_buffer_[sampler_id], document_buffer_[sampler_id] + data_length)); + + for (int iter = 0; iter < numBurnIter; ++iter) + { + sampler.InferOneDoc(doc.get()); + } + sampler.GetDocTopic(doc.get(), pTopics, pProbs, numTopicsMax); + + samplerQueue_->push(sampler_id); + } + + void LdaEngine::TestOneDocDense(int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + //numTopicsMax initialy holds the max returned topic number in order to hold the pTopic/pProbs memory in outside function + //when data return, numTopicsMax should contains the real topic number returned. 
+ int sampler_id = 0; + sampler_id = samplerQueue_->pop(); + + LightDocSampler &sampler = *(samplers_[sampler_id]); + int64_t data_length = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + data_length += 2; + } + } + + assert(data_length <= maxDocToken_ * 2 + 1); + + if (reset) + { + // restart the rng seeds, so that we always get consistent result for the same input + rng_.restart(); + sampler.rng_restart(); + } + + // NOTE: in multi-threaded implementation, the dynamic memory allocation + // may cause contention at OS heap lock + int64_t idx = 1; + for (int i = 0; i < term_num; ++i) + { + for (int j = 0; j < term_freq[i]; ++j) + { + document_buffer_[sampler_id][idx++] = i; + document_buffer_[sampler_id][idx++] = rng_.rand_k(K_); + } + } + + std::shared_ptr doc(new LDADocument(document_buffer_[sampler_id], document_buffer_[sampler_id] + data_length)); + + for (int iter = 0; iter < numBurnIter; ++iter) + { + sampler.InferOneDoc(doc.get()); + } + sampler.GetDocTopic(doc.get(), pTopics, pProbs, numTopicsMax); + + samplerQueue_->push(sampler_id); + } + + void LdaEngine::GetDocTopic(int docID, int* pTopic, int* pProb, int32_t& numTopicReturn) + { + //get the current topic vector of the document + int thread_id = 0; + LightDocSampler &sampler = *(samplers_[thread_id]); + + sampler.GetDocTopic(data_block_->GetOneDoc(docID).get(), pTopic, pProb, numTopicReturn); + } + + void LdaEngine::SetAlphaSum(float avgDocLength) + { + if (!bAlphaSumMultiplied) + { + alpha_sum_ = alpha_sum_ * avgDocLength; + bAlphaSumMultiplied = true; + } + printf("alpha_sum was set to %f", alpha_sum_); + } + + bool LdaEngine::ClearData() + { + data_block_->Clear(); + return true; + } + + bool LdaEngine::ClearModel() + { + model_block_->Clear(); + return true; + } + + //function to support dumping the topic_model model file + void LdaEngine::GetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length) + { + //cap the topic number here according to inpassed value of length + int lengthCap = length; + + // NOTE: we MUST check whether the word-topic row is empty before get its value + if (global_word_topic_table_[wordId].capacity() == 0) + { + length = 0; + return; + } + + length = 0; + for (int i = 0; i < K_; ++i) + { + if (global_word_topic_table_[wordId][i] > 0) + { + pTopic[length] = i; + pProb[length] = global_word_topic_table_[wordId][i]; + length++; + + if (length >= lengthCap) + break; + } + } + } + + // Compare by frequencies in descending order. + bool CompareTerms(const std::pair &term1, const std::pair &term2) + { + // REVIEW: consider changing this to impose a total order, since quicksort is not stable. 
+ return term2.second < term1.second; + } + + void LdaEngine::GetTopicSummary(int32_t topicId, int32_t* pWords, float* pProb, int32_t& length) + { + std::vector> allTermsVec; + int sumFreq = 0; + for (int i = 0; i < V_; i++) //for all the terms check the topic distribution + { + if (global_word_topic_table_[i][topicId] > 0) + { + std::pair p; + p.first = i; + p.second = global_word_topic_table_[i][topicId]; + allTermsVec.push_back(p); + sumFreq += global_word_topic_table_[i][topicId]; + } + } + + std::sort(allTermsVec.begin(), allTermsVec.end(), CompareTerms); + + int usedTerm = (int)allTermsVec.size(); + length = std::min(usedTerm, length); + for (int i = 0; i < length; i++) + { + pWords[i] = allTermsVec[i].first; + pProb[i] = (((float)(allTermsVec[i].second)) + beta_) / (sumFreq + beta_ * V_); + } + } + + //function to support loading the topic_model model file + void LdaEngine::SetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length) + { + //NOTE: whether we should really use the "true" here + model_block_->SetWordInfo(wordId, length, true); + global_word_topic_table_[wordId] = model_block_->get_row(wordId, nullptr); + + for (int i = 0; i < length; ++i) + { + global_word_topic_table_[wordId].inc(pTopic[i], pProb[i]); + global_summary_row_[pTopic[i]] += pProb[i]; + } + } + + void LdaEngine::GetModelStat(int64_t &memBlockSize, int64_t &aliasMemBlockSize) + { + //NOTE: get the model's value at the end of training stage. try to save these two numbers to disk file + model_block_->GetModelStat(memBlockSize, aliasMemBlockSize); + } + + double LdaEngine::EvalLogLikelihood(bool is_train, int thread_id, int iter, LightDocSampler &sampler) + { + double doc_ll = 0; + double word_ll = 0; + + if (thread_id == 0) + { + atomic_stats_->doc_ll_ = 0; + atomic_stats_->word_ll_ = 0; + } + process_barrier_->wait(); + + int doc_num = 0; + int32_t doc_start = data_block_->Begin(thread_id); + int32_t doc_end = data_block_->End(thread_id); + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = data_block_->GetOneDoc(doc_index); + doc_ll += sampler.ComputeOneDocLLH(doc.get()); + doc_num++; + } + atomic_stats_->doc_ll_ = atomic_stats_->doc_ll_ + doc_ll; + process_barrier_->wait(); + + word_ll = sampler.ComputeWordLLH(word_range_for_each_thread_[thread_id], word_range_for_each_thread_[thread_id + 1]); + atomic_stats_->word_ll_ = atomic_stats_->word_ll_ + word_ll; + process_barrier_->wait(); + + double total_ll = 0; + if (thread_id == 0) + { + double normalized_ll = sampler.NormalizeWordLLH(); + + total_ll = 0; + total_ll += atomic_stats_->doc_ll_; + total_ll += atomic_stats_->word_ll_; + total_ll += normalized_ll; + + if (!is_train) + { + likelihood_in_iter_[iter] = (float)total_ll; + } + + std::cout << "Total likelihood: " << total_ll << "\t"; + std::cout << "..........[Nomralized word ll: " << normalized_ll << "\t" + << "Word likelihood: " << atomic_stats_->word_ll_ << "\t" + << "Doc likelihood: " << atomic_stats_->doc_ll_ << "]" << std::endl; + } + process_barrier_->wait(); + + return total_ll; + } + + void LdaEngine::DumpDocTopicTable(const std::string& doc_topic_file) + { + std::ofstream dt_stream; + dt_stream.open(doc_topic_file, std::ios::out); + assert(dt_stream.good()); + + int32_t num_documents = data_block_->num_documents(); + int32_t doc_start = 0; + int32_t doc_end = num_documents; + + lda::light_hash_map doc_topic_counter_(1024); + + for (int32_t doc_index = doc_start; doc_index != doc_end; ++doc_index) + { + std::shared_ptr doc = 
data_block_->GetOneDoc(doc_index); + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + dt_stream << doc_index; + if (doc->size()) + { + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + int32_t nonzero_num = 0; + + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + dt_stream << " " << key[i] - 1 << ":" << value[i]; + } + } + } + dt_stream << std::endl; + } + dt_stream.close(); + } + + void LdaEngine::DumpFullModel(const std::string& word_topic_dump) + { + std::ofstream wt_stream; + wt_stream.open(word_topic_dump, std::ios::out); + assert(wt_stream.good()); + + for (int w = 0; w < V_; ++w) + { + int nonzero_num = global_word_topic_table_[w].nonzero_num(); + if (nonzero_num) + { + wt_stream << w; + for (int t = 0; t < K_; ++t) + { + if (global_word_topic_table_[w][t] > 0) + { + wt_stream << " " << t << ":" << global_word_topic_table_[w][t]; + } + } + wt_stream << std::endl; + } + } + wt_stream.close(); + + std::ofstream summary_stream; + summary_stream.open("summary_row.txt", std::ios::out); + for (int i = 0; i < K_; ++i) + { + summary_stream << global_summary_row_[i] << std::endl; + } + summary_stream.close(); + } +} // namespace lda diff --git a/src/Native/LdaNative/lda_engine.hpp b/src/Native/LdaNative/lda_engine.hpp new file mode 100644 index 0000000000..95a107f355 --- /dev/null +++ b/src/Native/LdaNative/lda_engine.hpp @@ -0,0 +1,144 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "lda_document.h" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" + +#include "alias_multinomial_rng_int.hpp" + +#ifdef _MSC_VER +#define EXPORT_API(ret) extern "C" __declspec(dllexport) ret __stdcall +#else +#define EXPORT_API(ret) extern "C" __attribute__((visibility("default"))) ret +#endif + +//ignore all such warnings since our stl class will not used internally in the class as private member +#pragma warning(disable : 4251) +class CTimer; +namespace lda { + + class LDADataBlock; + class LDAModelBlock; + class SimpleBarrier; + struct LDAEngineAtomics; + class LightDocSampler; + class CBlockedIntQueue; + + // Engine takes care of the entire pipeline of LDA, from reading data to + // spawning threads, to recording execution time and loglikelihood. 
+ class LdaEngine { + public: + LdaEngine(); + LdaEngine(int numTopic, + int numVocab, + float alphaSum, + float beta, + int numIter, + int likelihoodInterval, + int numThread, + int mhstep, + int maxDocToken); + + LdaEngine(int32_t K, int32_t V, int32_t num_threads, int32_t compute_ll_interval, float beta, int32_t num_iterations, int32_t mh_step, float alpha_sum, int maxDocToken); + + ~LdaEngine(); + + + void InitializeBeforeTest(); + bool InitializeBeforeTrain(); + void AllocateDataMemory(int num_document, int64_t corpus_size); + void AllocateModelMemory(const LDADataBlock* data_block); //in this case, model memory is allocated according to the datablock; + void AllocateModelMemory(int num_vocabs, int num_topics, int64_t nonzero_num); + void AllocateModelMemory(int num_vocabs, int num_topics, int64_t mem_block_size, int64_t alias_mem_block_size); + void SetAlphaSum(float avgDocLength); //alphasum parameter is set by avgdoclength * alpha + + //IO, data + bool ClearData(); //for clean up training data + bool ClearModel(); //for testing purpose, before calling SetWordTopic, please clear the old model + + int FeedInData(int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size); + int FeedInDataDense(int* term_freq, int32_t term_num, int32_t vocab_size); + + //IO, model + // NOTE: assume pTopic and pProb are allocated outside the function + // the length returned will be capped by the pass-in initial value of length(usually it's the size of preallocated memory for pTopic&pProb + void GetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length); + void SetWordTopic(int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length); + void GetModelStat(int64_t &memBlockSize, int64_t &aliasMemBlockSize); + void GetTopicSummary(int32_t topicId, int32_t* pWords, float* pProb, int32_t& length); + + //mutlithread train/test with the data inside the engine + void Train(const char* pTrainOutput = nullptr); + void Test(int32_t burnin_iter, float* pLoglikelihood); + + //testing on single doc + void TestOneDoc(int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset); + void TestOneDocDense(int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset); + void GetDocTopic(int docID, int* pTopic, int* pProb, int32_t& numTopicReturn); // use this function to get the doc's topic output in batch testing scenario + + //output model(word topic) and doc topic + void DumpFullModel(const std::string& word_topic_dump); + void DumpDocTopicTable(const std::string& doc_topic_file); + + private: + double EvalLogLikelihood(bool is_train, int thread_id, int iter, LightDocSampler &sampler); + + private: // private data + void Training_Thread(); + void Testing_Thread(); + void CheckFunction(int thread_id, CTimer& tmDebug, const char* msg, bool waitBarrier = true); + + // Number of topics + int32_t K_; + // Number of vocabs. + int32_t V_; + + int32_t compute_ll_interval_; + int32_t num_threads_; + int32_t num_iterations_; + int32_t burnin_iterations_; + int32_t mh_step_; + float beta_; + float alpha_sum_; + float beta_sum_; + int maxDocToken_; + bool bAlphaSumMultiplied; //used to check whether alpha_sum_ is real alpha sum but not alpha + std::vector word_range_for_each_thread_; + + LDAEngineAtomics* atomic_stats_; + SimpleBarrier* process_barrier_; // Local barrier across threads. 
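// Coordination pattern (see EvalLogLikelihood in lda_engine.cpp): thread 0 zeroes the
// shared accumulators in atomic_stats_, every worker calls process_barrier_->wait(),
// each worker then adds its own document-range and word-range contributions to
// atomic_stats_, and a final wait() lets thread 0 read the combined totals safely.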
+ + LDADataBlock* data_block_; + LDAModelBlock* model_block_; + + std::vector global_word_topic_table_; + std::vector global_alias_k_v_; + std::vector global_summary_row_; + + // for generating alias table of beta term + wood::AliasMultinomialRNGInt alias_rng_int_; + int32_t beta_height_; + float beta_mass_; + std::vector beta_k_v_; + + LightDocSampler **samplers_; + float* likelihood_in_iter_; + + // For TestDocSafe purpose + int32_t **document_buffer_; + + wood::xorshift_rng rng_; + CBlockedIntQueue *samplerQueue_; + }; +} // namespace lda diff --git a/src/Native/LdaNative/lda_engine_export.cpp b/src/Native/LdaNative/lda_engine_export.cpp new file mode 100644 index 0000000000..7f6bc62b70 --- /dev/null +++ b/src/Native/LdaNative/lda_engine_export.cpp @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include "lda_engine.hpp" + +/// This file use to expose public API to be consumed by ML.NET. +namespace lda { + + EXPORT_API(LdaEngine*) CreateEngine(int numTopic, int numVocab, float alphaSum, float beta, int numIter, int likelihoodInterval, int numThread, int mhstep, int maxDocToken) + { + return new LdaEngine(numTopic, numVocab, alphaSum, beta, numIter, likelihoodInterval, numThread, mhstep, maxDocToken); + } + + EXPORT_API(void) DestroyEngine(LdaEngine* engine) + { + delete engine; + } + + EXPORT_API(void) AllocateModelMemory(LdaEngine* engine, int numTopic, int numVocab, int64_t tableSize, int64_t aliasTableSize) + { + engine->AllocateModelMemory(numVocab, numTopic, tableSize, aliasTableSize); + } + + EXPORT_API(void) AllocateDataMemory(LdaEngine* engine, int num_document, int64_t corpus_size) + { + engine->AllocateDataMemory(num_document, corpus_size); + } + + EXPORT_API(void) Train(LdaEngine* engine, const char* trainOutput) + { + engine->Train(trainOutput); + } + + EXPORT_API(void) Test(LdaEngine* engine, int32_t burnin_iter, float* pLoglikelihood) + { + engine->Test(burnin_iter, pLoglikelihood); + } + + EXPORT_API(void) CleanData(LdaEngine* engine) + { + engine->ClearData(); + } + + EXPORT_API(void) CleanModel(LdaEngine* engine) + { + engine->ClearModel(); + } + + EXPORT_API(void) GetModelStat(LdaEngine* engine, int64_t &memBlockSize, int64_t &aliasMemBlockSize) + { + engine->GetModelStat(memBlockSize, aliasMemBlockSize); + } + + EXPORT_API(void) GetWordTopic(LdaEngine* engine, int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t& length) + { + engine->GetWordTopic(wordId, pTopic, pProb, length); + } + + EXPORT_API(void) SetWordTopic(LdaEngine* engine, int32_t wordId, int32_t* pTopic, int32_t* pProb, int32_t length) + { + engine->SetWordTopic(wordId, pTopic, pProb, length); + } + + EXPORT_API(void) GetTopicSummary(LdaEngine* engine, int32_t topicId, int32_t* pWords, float* pProb, int32_t& length) + { + engine->GetTopicSummary(topicId, pWords, pProb, length); + } + + EXPORT_API(void) SetAlphaSum(LdaEngine* engine, float avgDocLength) + { + engine->SetAlphaSum(avgDocLength); + } + + EXPORT_API(int) FeedInData(LdaEngine* engine, int* term_id, int* term_freq, int32_t term_num, int32_t vocab_size) + { + return engine->FeedInData(term_id, term_freq, term_num, vocab_size); + } + + EXPORT_API(int) FeedInDataDense(LdaEngine* engine, int* term_freq, int32_t term_num, int32_t vocab_size) + { + return engine->FeedInDataDense(term_freq, term_num, vocab_size); + } + + EXPORT_API(void) GetDocTopic(LdaEngine* engine, int 
docID, int* pTopic, int* pProb, int32_t& numTopicReturn) + { + engine->GetDocTopic(docID, pTopic, pProb, numTopicReturn); + } + + EXPORT_API(void) TestOneDoc(LdaEngine* engine, int* term_id, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + engine->TestOneDoc(term_id, term_freq, term_num, pTopics, pProbs, numTopicsMax, numBurnIter, reset); + } + + EXPORT_API(void) TestOneDocDense(LdaEngine* engine, int* term_freq, int32_t term_num, int* pTopics, int* pProbs, int32_t& numTopicsMax, int32_t numBurnIter, bool reset) + { + engine->TestOneDocDense(term_freq, term_num, pTopics, pProbs, numTopicsMax, numBurnIter, reset); + } + + EXPORT_API(void) InitializeBeforeTrain(LdaEngine* engine) + { + engine->InitializeBeforeTrain(); + } + + EXPORT_API(void) InitializeBeforeTest(LdaEngine* engine) + { + engine->InitializeBeforeTest(); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_doc_sampler.cpp b/src/Native/LdaNative/light_doc_sampler.cpp new file mode 100644 index 0000000000..ea628d3891 --- /dev/null +++ b/src/Native/LdaNative/light_doc_sampler.cpp @@ -0,0 +1,667 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include +#include +#include +#include +#include + +#include "lda_document.h" +#include "light_doc_sampler.hpp" + + +namespace lda +{ + LightDocSampler::LightDocSampler( + int32_t K, + int32_t V, + int32_t num_threads, + int32_t mh_step, + float beta, + float alpha_sum, + std::vector &word_topic_table, + std::vector &summary_row, + std::vector &alias_kv, + int32_t &beta_height, + float& beta_mass, + std::vector &beta_k_v) + : doc_topic_counter_(1024), + word_topic_table_(word_topic_table), summary_row_(summary_row), + alias_k_v_(alias_kv), + beta_height_(beta_height), + beta_mass_(beta_mass), + beta_k_v_(beta_k_v), + K_(K), + V_(V), + num_threads_(num_threads), + mh_step_for_gs_(mh_step), + beta_(beta), + alpha_sum_(alpha_sum) + { + beta_sum_ = beta_ * V_; + alpha_ = alpha_sum_ / K_; + + ll_alpha_ = (lda::real_t)0.01; + ll_alpha_sum_ = ll_alpha_ * K_; + + // Precompute LLH parameters + log_doc_normalizer_ = LogGamma(ll_alpha_ * K_) - K_ * LogGamma(ll_alpha_); + log_topic_normalizer_ = LogGamma(beta_sum_) - V_ * LogGamma(beta_); + + alias_rng_.Init(K_); + + q_w_proportion_.resize(K_); + delta_summary_row_.resize(K_); + word_topic_delta_.resize(num_threads_); + + rehashing_buf_ = new int32_t[K_ * 2]; + } + + LightDocSampler::~LightDocSampler() + { + delete[] rehashing_buf_; + } + + // Initialize word_topic_table and doc_topic_counter for each doc + int32_t LightDocSampler::GlobalInit(LDADocument *doc) + { + int32_t token_num = 0; + int32_t doc_size = doc->size(); + for (int i = 0; i < doc_size; ++i) + { + int32_t w = doc->Word(i); + int32_t t = doc->Topic(i); + + word_topic_delta wtd; + int32_t shard_id = w % num_threads_; + wtd.word = w; + wtd.topic = t; + wtd.delta = 1; + word_topic_delta_[shard_id].push_back(wtd); + + ++delta_summary_row_[t]; + + ++token_num; + } + return token_num; + } + + int32_t LightDocSampler::DocInit(LDADocument *doc) + { + int num_words = doc->size(); + + // compute the doc_topic_counter on the fly + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + doc_size_ = num_words; + n_td_sum_ = (lda::real_t)num_words; + + return 0; + } + + bool CompareFirstElement(const std::pair 
&p1, const std::pair &p2) + { + return p1.first < p2.first; + } + + void LightDocSampler::GetDocTopic(LDADocument *doc, int* pTopics, int* pProbs, int32_t& numTopicsMax) + { + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + // NOTE: do we have to assume this? + // probably first sort the topic vector according to the probs and keep the first numTopicsMax topics + // We assume the numTopicsMax is not less than the length of current document?? or it should be maxiumly the toipc number + // assert(numTopicsMax >= doc->size()); + + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + + std::vector> vec; + int32_t idx = 0; + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + std::pair pair; + pair.first = key[i] - 1; + pair.second = value[i]; + vec.push_back(pair); + idx++; + + if (idx == numTopicsMax) + break; + } + } + numTopicsMax = idx; + std::sort(vec.begin(), vec.end(), CompareFirstElement); + for (int i = 0; i < idx; i++) + { + pTopics[i] = vec[i].first; + pProbs[i] = vec[i].second; + } + } + + void LightDocSampler::EpocInit() + { + std::fill(delta_summary_row_.begin(), delta_summary_row_.end(), 0); + for (auto &shard : word_topic_delta_) + { + shard.clear(); + } + } + + void LightDocSampler::AdaptAlphaSum(bool is_train) + { + rng_.restart(); //reset the sampler so that we will get deterministic result by different runs, train-test, train-save-test, etc. + + if (is_train) + { + if (alpha_sum_ < 10) + { + alpha_sum_ = 100; + } + } + else + { + if (alpha_sum_ > 10) + { + alpha_sum_ = 1; + } + } + alpha_ = alpha_sum_ / K_; + } + + void LightDocSampler::build_alias_table(int32_t lower, int32_t upper, int thread_id) + { + for (int w = lower; w < upper; ++w) + { + GenerateAliasTableforWord(w); + } + } + void LightDocSampler::build_word_topic_table(int32_t thread_id, int32_t num_threads, lda::LDAModelBlock &model_block) + { + for (int i = 0; i < V_; ++i) + { + if (i % num_threads == thread_id) + { + word_topic_table_[i] = model_block.get_row(i, rehashing_buf_); + } + } + } + + int32_t LightDocSampler::SampleOneDoc(LDADocument *doc) + { + return OldProposalFreshSample(doc); + } + + int32_t LightDocSampler::InferOneDoc(LDADocument *doc) + { + return OldProposalFreshSampleInfer(doc); + } + int32_t LightDocSampler::Sample2WordFirst(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic) + { + int32_t w_t_cnt; + int32_t w_s_cnt; + + real_t n_td_alpha; + real_t n_sd_alpha; + real_t n_tw_beta; + real_t n_sw_beta; + real_t n_s_beta_sum; + real_t n_t_beta_sum; + + real_t proposal_s; + real_t proposal_t; + + real_t nominator; + real_t denominator; + + real_t rejection; + real_t pi; + int m; + + for (int i = 0; i < mh_step_for_gs_; ++i) + { + int32_t t; + + t = alias_k_v_[w].next(rng_, beta_height_, beta_mass_, beta_k_v_, false); + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + + w_s_cnt = get_word_topic(w, s); + w_t_cnt = get_word_topic(w, t); + + if (s != old_topic && t != old_topic) + { + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s != old_topic && t == old_topic) + { + n_td_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_ - 1; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if 
(s == old_topic && t != old_topic) + { + n_sd_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + n_sw_beta = w_s_cnt + beta_ - 1; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + else + { + n_td_alpha -= 1; + n_sd_alpha -= 1; + + n_tw_beta = w_t_cnt + beta_ - 1; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + n_sw_beta = w_s_cnt + beta_ - 1; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + + proposal_s = (w_s_cnt + beta_) / (summary_row_[s] + beta_sum_); + proposal_t = (w_t_cnt + beta_) / (summary_row_[t] + beta_sum_); + + nominator = n_td_alpha + * n_tw_beta + * n_s_beta_sum + * proposal_s; + + denominator = n_sd_alpha + * n_sw_beta + * n_t_beta_sum + * proposal_t; + + + pi = std::min((real_t)1.0, nominator / denominator); + + // s = rejection < pi ? t : s; + m = -(rejection < pi); + s = (t & m) | (s & ~m); + + real_t n_td_or_alpha = rng_.rand_real() * (n_td_sum_ + alpha_sum_); + if (n_td_or_alpha < n_td_sum_) + { + int32_t t_idx = rng_.rand_k(doc_size_); + t = doc->Topic(t_idx); + } + else + { + t = rng_.rand_k(K_); + } + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + + if (s != old_topic && t != old_topic) + { + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s != old_topic && t == old_topic) + { + n_td_alpha -= 1; + + w_t_cnt = get_word_topic(w, t) - 1; + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + } + else if (s == old_topic && t != old_topic) + { + n_sd_alpha -= 1; + + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s) - 1; + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + else + { + n_td_alpha -= 1; + n_sd_alpha -= 1; + + w_t_cnt = get_word_topic(w, t) - 1; + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_ - 1; + + w_s_cnt = get_word_topic(w, s) - 1; + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_ - 1; + } + + proposal_t = doc_topic_counter_[t] + alpha_; + proposal_s = doc_topic_counter_[s] + alpha_; + + nominator = n_td_alpha + * n_tw_beta + * n_s_beta_sum + * proposal_s; + + denominator = n_sd_alpha + * n_sw_beta + * n_t_beta_sum + * proposal_t; + + + pi = std::min((real_t)1.0, nominator / denominator); + + // s = rejection < pi ? 
t : s; + m = -(rejection < pi); + s = (t & m) | (s & ~m); + } + int32_t src = s; + return src; + } + + int32_t LightDocSampler::Sample2WordFirstInfer(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic) + { + int32_t w_t_cnt; + int32_t w_s_cnt; + + float n_td_alpha; + float n_sd_alpha; + float n_tw_beta; + float n_sw_beta; + float n_s_beta_sum; + float n_t_beta_sum; + + float nominator; + float denominator; + + float rejection; + float pi; + int m; + + for (int i = 0; i < mh_step_for_gs_; ++i) + { + int32_t t; + t = alias_k_v_[w].next(rng_, beta_height_, beta_mass_, beta_k_v_, false); + + rejection = rng_.rand_real(); + + n_td_alpha = doc_topic_counter_[t] + alpha_; + n_sd_alpha = doc_topic_counter_[s] + alpha_; + + nominator = n_td_alpha; + denominator = n_sd_alpha; + + pi = std::min((float)1.0, nominator / denominator); + + m = -(rejection < pi); + s = (t & m) | (s & ~m); + + float n_td_or_alpha = rng_.rand_real() * (n_td_sum_ + alpha_sum_); + if (n_td_or_alpha < n_td_sum_) + { + int32_t t_idx = rng_.rand_k(doc_size_); + t = doc->Topic(t_idx); + } + else + { + t = rng_.rand_k(K_); + } + + rejection = rng_.rand_real(); + + + w_t_cnt = get_word_topic(w, t); + n_tw_beta = w_t_cnt + beta_; + n_t_beta_sum = summary_row_[t] + beta_sum_; + + w_s_cnt = get_word_topic(w, s); + n_sw_beta = w_s_cnt + beta_; + n_s_beta_sum = summary_row_[s] + beta_sum_; + + nominator = n_tw_beta + * n_s_beta_sum; + + + denominator = n_sw_beta + * n_t_beta_sum; + + pi = std::min((float)1.0, nominator / denominator); + + m = -(rejection < pi); + s = (t & m) | (s & ~m); + } + int32_t src = s; + return src; + } + + int32_t LightDocSampler::OldProposalFreshSample(LDADocument *doc) + { + DocInit(doc); + int num_token = doc->size(); + int32_t &cursor = doc->get_cursor(); + + int32_t token_sweeped = 0; + cursor = 0; + + while (cursor < num_token) + { + ++token_sweeped; + + int32_t w = doc->Word(cursor); + int32_t s = doc->Topic(cursor); // old topic + + int t = Sample2WordFirst(doc, w, s, s); // new topic + + if (s != t) + { + word_topic_delta wtd; + int32_t shard_id = w % num_threads_; + wtd.word = w; + wtd.topic = s; + wtd.delta = -1; + word_topic_delta_[shard_id].push_back(wtd); + + wtd.topic = t; + wtd.delta = +1; + word_topic_delta_[shard_id].push_back(wtd); + + --delta_summary_row_[s]; + ++delta_summary_row_[t]; + + doc->SetTopic(cursor, t); + doc_topic_counter_.inc(s, -1); + doc_topic_counter_.inc(t, 1); + } + cursor++; + } + return token_sweeped; + } + + int32_t LightDocSampler::OldProposalFreshSampleInfer(LDADocument *doc) + { + + DocInit(doc); + int num_token = doc->size(); + int32_t &cursor = doc->get_cursor(); + + int32_t token_sweeped = 0; + cursor = 0; + + while (cursor < num_token) + { + ++token_sweeped; + + int32_t w = doc->Word(cursor); + int32_t s = doc->Topic(cursor); // old topic + + int t = Sample2WordFirstInfer(doc, w, s, s); // new topic + + if (s != t) + { + doc->SetTopic(cursor, t); + doc_topic_counter_.inc(s, -1); + doc_topic_counter_.inc(t, 1); + } + cursor++; + } + return token_sweeped; + } + + double LightDocSampler::NormalizeWordLLH() + { + double word_llh = K_ * log_topic_normalizer_; + for (int k = 0; k < K_; ++k) + { + word_llh -= LogGamma(summary_row_[k] + beta_sum_); + } + return word_llh; + } + + + double LightDocSampler::ComputeOneDocLLH(LDADocument* doc) + { + double doc_ll = 0; + double one_doc_llh = log_doc_normalizer_; + + // Compute doc-topic vector on the fly. 
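// In formula form, the per-document term accumulated below is
//   log Gamma(K * alpha) - K * log Gamma(alpha)      (log_doc_normalizer_)
//   + sum_k log Gamma(n_kd + alpha)                  (nonzero topics in the loop, plus
//                                                     (K - nonzero_num) * log Gamma(alpha))
//   - log Gamma(n_d + K * alpha)
// where alpha is ll_alpha_, n_kd is the number of tokens in this document assigned to
// topic k, and n_d is the document length.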
+ int num_tokens = doc->size(); + + if (num_tokens == 0) + { + return doc_ll; + } + + doc_topic_counter_.clear(); + doc->GetDocTopicCounter(doc_topic_counter_); + + int32_t capacity = doc_topic_counter_.capacity(); + int32_t *key = doc_topic_counter_.key(); + int32_t *value = doc_topic_counter_.value(); + int32_t nonzero_num = 0; + + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + one_doc_llh += LogGamma(value[i] + ll_alpha_); + ++nonzero_num; + } + } + one_doc_llh += (K_ - nonzero_num) * LogGamma(ll_alpha_); + one_doc_llh -= LogGamma(num_tokens + ll_alpha_ * K_); + + doc_ll += one_doc_llh; + return doc_ll; + } + + double LightDocSampler::ComputeWordLLH(int32_t lower, int32_t upper) + { + // word_llh is P(w|z). + double word_llh = 0; + double zero_entry_llh = LogGamma(beta_); + + // Since some vocabs are not present in the corpus, use num_words_seen to + // count # of words in corpus. + int num_words_seen = 0; + for (int w = lower; w < upper; ++w) + { + auto word_topic_row = get_word_row(w); + int32_t total_count = 0; + double delta = 0; + if (word_topic_row.is_dense()) + { + int32_t* memory = word_topic_row.memory(); + int32_t capacity = word_topic_row.capacity(); + int32_t count; + for (int i = 0; i < capacity; ++i) + { + count = memory[i]; + total_count += count; + delta += LogGamma(count + beta_); + } + } + else + { + int32_t* key = word_topic_row.key(); + int32_t* value = word_topic_row.value(); + int32_t capacity = word_topic_row.capacity(); + int32_t count; + int32_t nonzero_num = 0; + for (int i = 0; i < capacity; ++i) + { + if (key[i] > 0) + { + count = value[i]; + total_count += count; + delta += LogGamma(count + beta_); + ++nonzero_num; + } + } + delta += (K_ - nonzero_num) * zero_entry_llh; + } + + if (total_count) + { + word_llh += delta; + } + } + + return word_llh; + } + + void LightDocSampler::Dump(const std::string &dump_name, int32_t lower, int32_t upper) + { + std::ofstream wt_stream; + wt_stream.open(dump_name, std::ios::out); + + for (int w = lower; w < upper; ++w) + { + //why not just a serialization of current hybrid_map? do we need to do a search? + int nonzero_num = word_topic_table_[w].nonzero_num(); + if (nonzero_num) + { + wt_stream << w; + for (int t = 0; t < K_; ++t) + { + if (word_topic_table_[w][t] > 0) + { + wt_stream << " " << t << ":" << word_topic_table_[w][t]; + } + } + wt_stream << std::endl; + } + } + wt_stream.close(); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_doc_sampler.hpp b/src/Native/LdaNative/light_doc_sampler.hpp new file mode 100644 index 0000000000..82e37b3bc5 --- /dev/null +++ b/src/Native/LdaNative/light_doc_sampler.hpp @@ -0,0 +1,187 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
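// Design note: in the training sweep (OldProposalFreshSample) topic reassignments are not
// applied to the shared word_topic_table_ immediately; they are buffered as word_topic_delta
// entries, sharded by word % num_threads_, alongside delta_summary_row_. The accessors
// get_word_topic_delta / get_delta_summary_row expose these buffers so the engine can fold
// them back into the global tables, and EpocInit() clears them for the next pass.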
+ +#pragma once + +#include "type_common.h" +#include "lda_document.h" +#include "rand_int_rng.h" +#include +#include +#include +#include +#include +#include +#include "alias_multinomial_rng_int.hpp" +#include "light_hash_map.h" +#include "utils.hpp" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" +#include "model_block.h" + +namespace lda +{ + struct word_topic_delta + { + int32_t word; + int32_t topic; + int32_t delta; + }; + + class LightDocSampler + { + public: + LightDocSampler( + int32_t K, + int32_t V, + int32_t num_threads, + int32_t mh_step, + float beta, + float alpha_sum, + std::vector &word_topic_table, + std::vector &summary_row, + std::vector &alias_kv, + int32_t &beta_height, + float &beta_mass, + std::vector &beta_k_v + ); + + ~LightDocSampler(); + + int32_t GlobalInit(LDADocument *doc); + int32_t DocInit(LDADocument *doc); + void EpocInit(); + void AdaptAlphaSum(bool is_train); + void GetDocTopic(LDADocument *doc, int* pTopics, int* pProbs, int32_t& numTopicsMax); + + + int32_t SampleOneDoc(LDADocument *doc); + int32_t InferOneDoc(LDADocument *doc); + + // The i-th complete-llh calculation will use row i in llh_able_. This is + // part of log P(z) in eq.[3]. + double ComputeOneDocLLH(LDADocument* doc); + double ComputeWordLLH(int32_t lower, int32_t upper); + double NormalizeWordLLH(); + + inline void rng_restart() + { + rng_.restart(); + } + + + void Dump(const std::string &dump_name, int32_t lower, int32_t upper); + + void build_alias_table(int32_t lower, int32_t upper, int thread_id); + void build_word_topic_table(int32_t thread_id, int32_t num_threads, lda::LDAModelBlock &model_block); + + inline int32_t rand_k(); + inline wood::xorshift_rng& rng(); + inline lda::hybrid_map& get_word_row(int32_t word); + inline std::vector &get_summary_row(); + inline std::vector& get_word_topic_delta(int32_t thread_id); + inline std::vector& get_delta_summary_row(); + + private: + int32_t Sample2WordFirst(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic); + int32_t Sample2WordFirstInfer(LDADocument *doc, int32_t w, int32_t s, int32_t old_topic); + inline void GenerateAliasTableforWord(int32_t word); + inline int32_t get_word_topic(int32_t word, int32_t topic); + inline void word_topic_dec(int32_t word, int32_t topic); + inline void word_topic_inc(int32_t word, int32_t topic); + int32_t OldProposalFreshSample(LDADocument *doc); + int32_t OldProposalFreshSampleInfer(LDADocument *doc); + + private: + int32_t num_tokens_; + int32_t num_unique_words_; + + int32_t K_; + int32_t V_; + real_t beta_; + real_t beta_sum_; + real_t alpha_; + real_t alpha_sum_; + + real_t ll_alpha_; + real_t ll_alpha_sum_; + + real_t delta_alpha_sum_; + + std::vector q_w_proportion_; + wood::AliasMultinomialRNGInt alias_rng_; + wood::xorshift_rng rng_; + std::vector &alias_k_v_; + + int32_t doc_size_; + + // the number of Metropolis Hastings step + int32_t mh_step_for_gs_; + real_t n_td_sum_; + + // model + std::vector &summary_row_; + std::vector &word_topic_table_; + int32_t *rehashing_buf_; + + int32_t &beta_height_; + float &beta_mass_; + std::vector &beta_k_v_; + + // delta + std::vector delta_summary_row_; + + int32_t num_threads_; + std::vector> word_topic_delta_; + + // ================ Precompute LLH Parameters ================= + // Log of normalization constant (per docoument) from eq.[3]. + double log_doc_normalizer_; + + // Log of normalization constant (per topic) from eq.[2]. 
+ double log_topic_normalizer_; + lda::light_hash_map doc_topic_counter_; + }; + + inline int32_t LightDocSampler::rand_k() + { + return rng_.rand_k(K_); + } + inline wood::xorshift_rng& LightDocSampler::rng() + { + return rng_; + } + inline lda::hybrid_map& LightDocSampler::get_word_row(int32_t word) + { + return word_topic_table_[word]; + } + inline std::vector& LightDocSampler::get_summary_row() + { + return summary_row_; + } + inline std::vector& LightDocSampler::get_word_topic_delta(int32_t thread_id) + { + return word_topic_delta_[thread_id]; + } + inline std::vector& LightDocSampler::get_delta_summary_row() + { + return delta_summary_row_; + } + inline int32_t LightDocSampler::get_word_topic(int32_t word, int32_t topic) + { + return word_topic_table_[word][topic]; + } + inline void LightDocSampler::word_topic_dec(int32_t word, int32_t topic) + { + word_topic_table_[word].inc(topic, -1); + } + inline void LightDocSampler::word_topic_inc(int32_t word, int32_t topic) + { + word_topic_table_[word].inc(topic, 1); + } + inline void LightDocSampler::GenerateAliasTableforWord(int32_t word) + { + alias_k_v_[word].build_table(alias_rng_, word_topic_table_[word], summary_row_, q_w_proportion_, beta_, beta_sum_, word, rng_); + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_hash_map.cpp b/src/Native/LdaNative/light_hash_map.cpp new file mode 100644 index 0000000000..ae070c5e1c --- /dev/null +++ b/src/Native/LdaNative/light_hash_map.cpp @@ -0,0 +1,76 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#include +#include "light_hash_map.h" + +namespace lda +{ + light_hash_map::light_hash_map(int32_t *mem_block, int32_t capacity) : + own_memory_(false), + capacity_(capacity), + mem_block_(mem_block), + empty_key_(0), + deleted_key_(-2) + { + key_ = mem_block_; + value_ = mem_block_ + capacity_; + clear(); + } + + light_hash_map::light_hash_map(int32_t capacity) : + own_memory_(true), + capacity_(capacity), + empty_key_(0), + deleted_key_(-2) + { + mem_block_ = new int32_t[capacity_ * 2]; + key_ = mem_block_; + value_ = mem_block_ + capacity_; + clear(); + } + + // must call set_memory after construction before use + light_hash_map::light_hash_map() : + capacity_(1024), + own_memory_(false), + empty_key_(0), + deleted_key_(-2), + mem_block_(nullptr), + key_(nullptr), + value_(nullptr) + { + } + + light_hash_map::~light_hash_map() + { + capacity_ = 0; + if (own_memory_ && mem_block_ != nullptr) + { + delete[]mem_block_; + } + + mem_block_ = nullptr; + key_ = nullptr; + value_ = nullptr; + } + + void light_hash_map::clear() + { + memset(mem_block_, 0, capacity_ * 2 * sizeof(int32_t)); + } + + void light_hash_map::sort() + { + //key is probablly empty in key_, sort by value_ + //this is just for the output process like getting the topic of document or a topic of term + } + + void light_hash_map::set_memory(int32_t *mem_block) + { + mem_block_ = mem_block; + key_ = mem_block_; + value_ = mem_block_ + capacity_; + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/light_hash_map.h b/src/Native/LdaNative/light_hash_map.h new file mode 100644 index 0000000000..6e07c4ce58 --- /dev/null +++ b/src/Native/LdaNative/light_hash_map.h @@ -0,0 +1,189 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include + +/* +A light-weight hash table, borrowing the idea from google::dense_hash_map +0, pair must be +1, It can or can not own memory, +2, It has a fixed capacity, needless to resize or shrink, +3, capacity_ should at lease be twice of the maximum number of inserted items, guaranteeing a low load factor, +4, capacity_ should be an integer power of 2 +5, emptry_key_ is fixed to 0 +6, deleted_key_ is fixed to -2 +*/ + +namespace lda +{ +// The probing method: +// Linear probing +// #define JUMP_(key, num_probes) ( 1 ) + +// Quadratic probing +#define JUMP_(key, num_probes) ( num_probes ) + +#define ILLEGAL_BUCKET -1 + + class light_hash_map + { + public: + + // must call set_memory after construction before use + light_hash_map(); + // NOTE: the size of mem_block_ = 2 * capacity_ + light_hash_map(int32_t *mem_block, int32_t capacity); + light_hash_map(int32_t capacity); + + ~light_hash_map(); + + void clear(); + void set_memory(int32_t *mem_block); + void sort(); + + inline int32_t capacity() const; + inline int32_t size() const; + inline int32_t* key() const; + inline int32_t* value() const; + // whether we can find the |key| in this hash table + inline bool has(int32_t key) const; + + // if |key| is already in table, increase its coresponding |value| with |delta| + // if not, insert |key| into the table and set |delta| as the |value| of |key| + inline void inc(int32_t key, int32_t delta); + + // query the value of |key| + // if |key| is in the table, return the |value| corresonding to |key| + // if not, just return 0 + inline int32_t operator[](int32_t key); + + private: + + light_hash_map(const light_hash_map &other) = delete; + light_hash_map& operator=(const light_hash_map &other) = delete; + + // Returns a pair of positions: 1st where the object is, 2nd where + // it would go if you wanted to insert it. 1st is ILLEGAL_BUCKET + // if object is not found; 2nd is ILLEGAL_BUCKET if it is. 
+ // NOTE: because of deletions where-to-insert is not trivial: it's the + // first deleted bucket we see, as long as we don't find the key later + inline std::pair find_position(const int32_t key) const; + + bool own_memory_; + int32_t capacity_; + int32_t *mem_block_; + int32_t *key_; + int32_t *value_; + + int32_t empty_key_; + int32_t deleted_key_; + }; + + inline int32_t light_hash_map::capacity() const + { + return capacity_; + } + inline int32_t light_hash_map::size() const + { + int32_t size = 0; + for (int i = 0; i < capacity_; ++i) + { + if (key_[i] > 0) + { + ++size; + } + } + return size; + } + + inline int32_t* light_hash_map::key() const + { + return key_; + } + inline int32_t* light_hash_map::value() const + { + return value_; + } + + inline bool light_hash_map::has(int32_t key) const + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + return pos.first != ILLEGAL_BUCKET; + } + + inline void light_hash_map::inc(int32_t key, int32_t delta) + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + value_[pos.first] += delta; + if (value_[pos.first] == 0) // the value becomes zero, delete the key + { + key_[pos.first] = deleted_key_; + } + } + else // not found the key, insert it with delta as value + { + key_[pos.second] = internal_key; + value_[pos.second] = delta; + } + } + + inline int32_t light_hash_map::operator[](int32_t key) + { + int32_t internal_key = key + 1; + std::pair pos = find_position(internal_key); + if (pos.first != ILLEGAL_BUCKET) + { + return value_[pos.first]; + } + else + { + return 0; + } + } + + inline std::pair light_hash_map::find_position(const int32_t key) const + { + int num_probes = 0; + int32_t capacity_minus_one = capacity_ - 1; + int32_t idx = key % capacity_; + int32_t insert_pos = ILLEGAL_BUCKET; + while (1) // probe until something happens + { + if (key_[idx] == empty_key_) // bucket is empty + { + if (insert_pos == ILLEGAL_BUCKET) // found no prior place to insert + { + return std::pair(ILLEGAL_BUCKET, idx); + } + else // previously, there is a position to insert + { + return std::pair(ILLEGAL_BUCKET, insert_pos); + } + } + else if (key_[idx] == deleted_key_) // keep searching, but makr to insert + { + if (insert_pos == ILLEGAL_BUCKET) + { + insert_pos = idx; + } + } + else if (key_[idx] == key) + { + return std::pair(idx, ILLEGAL_BUCKET); + } + ++num_probes; // we are doing another probe + idx = (idx + JUMP_(key, num_probes) & capacity_minus_one); + assert(num_probes < capacity_ + && "Hashtable is full: an error in key_equal<> or hash<>"); + } + } +} \ No newline at end of file diff --git a/src/Native/LdaNative/model_block.cpp b/src/Native/LdaNative/model_block.cpp new file mode 100644 index 0000000000..ec15834aca --- /dev/null +++ b/src/Native/LdaNative/model_block.cpp @@ -0,0 +1,463 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ + +#include +#include +#include +#include "utils.hpp" +#include +#include "model_block.h" +#include "lda_document.h" + +namespace lda +{ + int64_t upper_bound(int64_t x) + { + if (x == 0) + { + return 0; + } + int64_t shift = 0; + int64_t y = 1; + x--; + while (x) + { + x = x >> 1; + y = y << 1; + ++shift; + } + return y; + } + + int32_t align64(int32_t size) + { + if (size % 64 == 0) + { + return size; + } + else + { + size = 64 * (size / 64) + 64; + return size; + } + } + + + LDAModelBlock::LDAModelBlock() + : dict_(nullptr), + num_vocabs_(0), + mem_block_size_(0), + mem_block_(nullptr), + alias_mem_block_size_(0), + alias_mem_block_(nullptr) + { + } + LDAModelBlock::~LDAModelBlock() + { + Clear(); + } + + void LDAModelBlock::Clear() + { + if (dict_) + { + delete[]dict_; + dict_ = nullptr; + } + if (mem_block_) + { + delete[]mem_block_; + mem_block_ = nullptr; + } + if (alias_mem_block_) + { + delete[]alias_mem_block_; + alias_mem_block_ = nullptr; + } + + num_vocabs_ = -1; + num_topics_ = -1; + + mem_block_size_ = 0; + alias_mem_block_size_ = 0; + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics, int64_t nonzero_num) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive. Supressing it similar to the existing one on Line 140 below. +#pragma warning(suppress: 6386) + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + + mem_block_size_ = 2 * upper_bound(load_factor_ * nonzero_num); + alias_mem_block_size_ = nonzero_num * 3; + + mem_block_ = new int32_t[mem_block_size_](); // NOTE: force to initialize the values to be zero + alias_mem_block_ = new int32_t[alias_mem_block_size_](); // NOTE: force to initialize the values to be zero + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics, int64_t mem_block_size, int64_t alias_mem_block_size) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive. Supressing it similar to the existing one on Line 140 below. +#pragma warning(suppress: 6386) + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + + mem_block_size_ = mem_block_size; + mem_block_ = new int32_t[mem_block_size_](); // NOTE : force to initialize the values to be zero + + alias_mem_block_size_ = alias_mem_block_size; + alias_mem_block_ = new int32_t[alias_mem_block_size_](); //NOTE: force to initialize the values to be zero + + std::cout << "mem_block_size = " << mem_block_size_ * 4 << std::endl; + std::cout << "alias_mem_block_size = " << alias_mem_block_size_ * 4 << std::endl; + + offset_ = 0; + alias_offset_ = 0; + } + + void LDAModelBlock::Init(int32_t num_vocabs, int32_t num_topics) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + // This warning is a false positive caused by an old bug in PREfast. It is fixed in VS 2015. 
+#pragma warning(suppress: 6386) + dict_[i].tf = 0; + dict_[i].is_dense_ = 0; + dict_[i].is_alias_dense_ = 0; + } + } + + void LDAModelBlock::SetWordInfo(int word_id, int32_t nonzero_num, bool fullSparse) + { + dict_[word_id].word_id_ = word_id; + dict_[word_id].tf = nonzero_num; + + int32_t hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + hot_thresh = num_topics_ / (2 * load_factor_); //hybrid + } + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t capacity = 0; + int32_t row_size = 0; + int32_t alias_capacity = 0; + int32_t alias_row_size = 0; + + if (dict_[word_id].tf >= hot_thresh) + { + dict_[word_id].is_dense_ = 1; + capacity = num_topics_; + row_size = capacity; + } + else if (dict_[word_id].tf > 0) + { + dict_[word_id].is_dense_ = 0; + int capacity_lower_bound = load_factor_ * dict_[word_id].tf; + capacity = (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + { + dict_[word_id].is_dense_ = 1; + row_size = 0; + capacity = 0; + } + + dict_[word_id].offset_ = offset_; + dict_[word_id].end_offset_ = offset_ + row_size; + dict_[word_id].capacity_ = capacity; + + offset_ += row_size; + + if (dict_[word_id].tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + dict_[word_id].is_alias_dense_ = 1; + } + else if (dict_[word_id].tf > 0) + { + alias_capacity = dict_[word_id].tf; + alias_row_size = 3 * dict_[word_id].tf; + dict_[word_id].is_alias_dense_ = 0; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + dict_[word_id].is_alias_dense_ = 1; + } + dict_[word_id].alias_capacity_ = alias_capacity; + dict_[word_id].alias_offset_ = alias_offset_; + dict_[word_id].alias_end_offset_ = alias_offset_ + alias_row_size; + + alias_offset_ += alias_row_size; + } + + // NOTE: sometimes, we use totally sparse representation (in testing phase), fullSparse == true + // in other times, we use hybrid structure (in training phase), fullSparse == false + void LDAModelBlock::InitModelBlockByTFS(bool fullSparse) + { + const int32_t max_tf_thresh = std::numeric_limits::max(); + int32_t hot_thresh; + if (fullSparse) + { + // totally sparse + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + // hybrid + hot_thresh = num_topics_ / (2 * load_factor_); + } + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t word_id; + int32_t capacity = 0; + int32_t row_size = 0; + int32_t alias_capacity = 0; + int32_t alias_row_size = 0; + + int64_t offset = 0; + int64_t alias_offset = 0; + + for (word_id = 0; word_id < num_vocabs_; ++word_id) + { + int32_t tf = dict_[word_id].tf; + + dict_[word_id].word_id_ = word_id; + dict_[word_id].tf = tf; + + if (tf >= hot_thresh) + { + dict_[word_id].is_dense_ = 1; + capacity = num_topics_; + row_size = capacity; + } + else if (tf > 0) + { + dict_[word_id].is_dense_ = 0; + int capacity_lower_bound = load_factor_ * tf; + capacity 
= (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + { + dict_[word_id].is_dense_ = 1; + capacity = 0; + row_size = 0; + } + + dict_[word_id].offset_ = offset; + dict_[word_id].end_offset_ = offset + row_size; + dict_[word_id].capacity_ = capacity; + + offset += row_size; + + if (tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + dict_[word_id].is_alias_dense_ = 1; + } + else if (tf > 0) + { + alias_capacity = tf; + alias_row_size = 3 * tf; + dict_[word_id].is_alias_dense_ = 0; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + dict_[word_id].is_alias_dense_ = 1; + } + dict_[word_id].alias_capacity_ = alias_capacity; + dict_[word_id].alias_offset_ = alias_offset; + dict_[word_id].alias_end_offset_ = alias_offset + alias_row_size; + alias_offset += alias_row_size; + } + + mem_block_size_ = dict_[num_vocabs_ - 1].end_offset_; + mem_block_ = new int32_t[mem_block_size_](); // NOTE: force to initialize the values to be zero + + alias_mem_block_size_ = dict_[num_vocabs_ - 1].alias_end_offset_; + alias_mem_block_ = new int32_t[alias_mem_block_size_](); //NOTE: force to initialize the values to be zero + + std::cout << "mem_block_size = " << mem_block_size_ * 4 << std::endl; + std::cout << "alias_mem_block_size = " << alias_mem_block_size_ * 4 << std::endl; + } + + void LDAModelBlock::InitFromDataBlock(const LDADataBlock *data_block, int32_t num_vocabs, int32_t num_topics) + { + num_vocabs_ = num_vocabs; + num_topics_ = num_topics; + + int32_t doc_num = data_block->num_documents(); + dict_ = new WordEntry[num_vocabs_]; + for (int i = 0; i < num_vocabs_; ++i) + { + dict_[i].tf = 0; + } + + for (int i = 0; i < doc_num; ++i) + { + std::shared_ptr doc = data_block->GetOneDoc(i); + int32_t doc_size = doc->size(); + for (int j = 0; j < doc_size; ++j) + { + int32_t w = doc->Word(j); + dict_[w].tf++; + } + } + + InitModelBlockByTFS(false); + } + // Count the number of nonzero values in each row + void LDAModelBlock::CountNonZero(std::vector &tfs) + { + for (int i = 0; i < num_vocabs_; ++i) + { + hybrid_map row(mem_block_ + dict_[i].offset_, + dict_[i].is_dense_, + dict_[i].capacity_, + 0, + nullptr); + tfs[i] = row.nonzero_num(); + } + } + + void LDAModelBlock::GetModelSizeByTFS(bool fullSparse, std::vector &tfs, int64_t &mem_block_size, int64_t &alias_mem_block_size) + { + const int32_t max_tf_thresh = std::numeric_limits::max(); + int32_t hot_thresh; + if (fullSparse) + { + // totally sparse + // use a very large threshold to ensure every row of word-topic-table using a sparse representation + hot_thresh = std::numeric_limits::max(); + } + else + { + // hybrid + hot_thresh = num_topics_ / (2 * load_factor_); + } + // hot_thresh = 0; // totally dense + int32_t alias_hot_thresh; + if (fullSparse) + { + // use a very large threshold to ensure every row of alias table using a sparse representation + alias_hot_thresh = std::numeric_limits::max(); + } + else + { + alias_hot_thresh = (num_topics_ * 2) / 3; + } + + int32_t word_id; + int32_t capacity = 0; + int32_t alias_capacity = 0; + int32_t row_size = 0; + int32_t alias_row_size = 0; + + mem_block_size = 0; + alias_mem_block_size = 0; + + for (word_id = 0; word_id < num_vocabs_; ++word_id) + { + int32_t tf = tfs[word_id]; + + if (tf >= hot_thresh) + { + capacity = num_topics_; + row_size = capacity; + } + else if (tf > 0) + { + int capacity_lower_bound = load_factor_ * tf; + capacity = (int32_t)upper_bound(capacity_lower_bound); + row_size = capacity * 2; + } + else + 
{ + capacity = 0; + row_size = 0; + } + mem_block_size += row_size; + + if (tf >= alias_hot_thresh) + { + alias_capacity = num_topics_; + alias_row_size = 2 * num_topics_; + } + else if (tf > 0) + { + alias_capacity = tf; + alias_row_size = 3 * tf; + } + else + { + alias_capacity = 0; + alias_row_size = 0; + } + alias_mem_block_size += alias_row_size; + } + } + + // NOTE: we can re-use the dict_ variable here, but we deliberately not use it. + // This function should not change the internal state of model_block_ + void LDAModelBlock::GetModelStat(int64_t &mem_block_size, int64_t &alias_mem_block_size) + { + std::vector tfs(num_vocabs_, 0); + CountNonZero(tfs); + + // calculate the mem_block_size, alias_mem_block_size + GetModelSizeByTFS(true, tfs, mem_block_size, alias_mem_block_size); + } +} diff --git a/src/Native/LdaNative/model_block.h b/src/Native/LdaNative/model_block.h new file mode 100644 index 0000000000..2160be1d7f --- /dev/null +++ b/src/Native/LdaNative/model_block.h @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include +#include +#include +#include +#include +#include "data_block.h" +#include "hybrid_map.h" +#include "hybrid_alias_map.h" + +namespace lda +{ + struct WordEntry + { + int32_t word_id_; + int64_t offset_; + int64_t end_offset_; + int32_t capacity_; + int32_t is_dense_; + + int32_t tf; + int64_t alias_offset_; + int64_t alias_end_offset_; + int32_t alias_capacity_; + int32_t is_alias_dense_; + }; + + class LDAModelBlock + { + public: + LDAModelBlock(); + ~LDAModelBlock(); + + inline hybrid_map get_row(int word_id, int32_t *external_buf); + inline hybrid_alias_map get_alias_row(int word_id); + void SetWordInfo(int word_id, int32_t nonzero_num, bool fullSparse); + + void Clear(); + void Init(int32_t num_vocabs, int32_t num_topics); + void Init(int32_t num_vocabs, int32_t num_topics, int64_t nonzero_num); + void Init(int32_t num_vocabs, int32_t num_topics, int64_t mem_block_size, int64_t alias_mem_block_size); + + void InitFromDataBlock(const LDADataBlock *data_block, int32_t num_vocabs, int32_t num_topics); + + void GetModelStat(int64_t &mem_block_size, int64_t &alias_mem_block_size); + + private: + + LDAModelBlock(const LDAModelBlock &other) = delete; + LDAModelBlock& operator=(const LDAModelBlock &other) = delete; + + void CountNonZero(std::vector &tfs); + void InitModelBlockByTFS(bool fullSparse); + void GetModelSizeByTFS(bool fullSparse, std::vector &tfs, int64_t &mem_block_size, int64_t &alias_mem_block_size); + + int32_t num_vocabs_; + int32_t num_topics_; + WordEntry *dict_; + int32_t *mem_block_; + int64_t mem_block_size_; + + int32_t *alias_mem_block_; + int64_t alias_mem_block_size_; + + int64_t offset_; + int64_t alias_offset_; + + const int32_t load_factor_ = 2; + const int32_t sparse_factor_ = 5; + }; + inline hybrid_map LDAModelBlock::get_row(int word_id, int32_t *external_buf) + { + hybrid_map row(mem_block_ + dict_[word_id].offset_, + dict_[word_id].is_dense_, + dict_[word_id].capacity_, + 0, + external_buf); + return row; + } + inline hybrid_alias_map LDAModelBlock::get_alias_row(int word_id) + { + hybrid_alias_map row(alias_mem_block_ + dict_[word_id].alias_offset_, + dict_[word_id].is_alias_dense_, + dict_[word_id].alias_capacity_); + return row; + } + +} \ No newline at end of file diff --git a/src/Native/LdaNative/rand_int_rng.h 
b/src/Native/LdaNative/rand_int_rng.h new file mode 100644 index 0000000000..c51943e11f --- /dev/null +++ b/src/Native/LdaNative/rand_int_rng.h @@ -0,0 +1,45 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#include +#include + +namespace wood +{ + class xorshift_rng + { + public: + xorshift_rng() + { + jxr = 1234567; + } + ~xorshift_rng() {} + + inline void restart() + { + jxr = 1234567; + } + + inline int32_t rand() + { + jxr ^= (jxr << 13); jxr ^= (jxr >> 17); jxr ^= (jxr << 5); //get random (xorshift) 32-bit integer + return jxr & 0x7fffffff; + } + inline int32_t rand_k(int K) + { + return (int32_t)(rand() * 4.6566125e-10 * K); + } + inline float rand_real() + { + return (float)(rand() * 4.6566125e-10); + } + private: + + xorshift_rng(const xorshift_rng &other) = delete; + xorshift_rng& operator=(const xorshift_rng &other) = delete; + + unsigned int jxr; + }; +} \ No newline at end of file diff --git a/src/Native/LdaNative/simple_barrier.h b/src/Native/LdaNative/simple_barrier.h new file mode 100644 index 0000000000..55f8d601b9 --- /dev/null +++ b/src/Native/LdaNative/simple_barrier.h @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#ifndef _SIMPLE_BARRIER_H_ +#define _SIMPLE_BARRIER_H_ + +#include +#include +#include +namespace lda +{ + class SimpleBarrier + { + public: + SimpleBarrier(unsigned int n) :barrier_size_(n), num_of_waiting_(0), rounds_(0) + {}; + + void reset() + { + throw "not implemented yet."; + } + + bool wait() + { + std::unique_lock lock(mutex_); + if (num_of_waiting_.fetch_add(1) >= barrier_size_ - 1) + { + cond_.notify_all(); + num_of_waiting_.store(0); + rounds_.fetch_add(1); + return true; + } + else + { + + unsigned int i = rounds_.load(); + cond_.wait(lock, [&]{return i != rounds_.load(); }); + return false; + } + } + + ~SimpleBarrier() + { + num_of_waiting_ = 0; + rounds_ = 0; + } + + + + protected: + const unsigned int barrier_size_; + + std::atomic num_of_waiting_; + std::atomic rounds_; + std::condition_variable cond_; + std::mutex mutex_; + }; +} + + + + + +#endif // _SIMPLE_BARRIER_H_ + diff --git a/src/Native/LdaNative/timer.h b/src/Native/LdaNative/timer.h new file mode 100644 index 0000000000..ac9aff94b2 --- /dev/null +++ b/src/Native/LdaNative/timer.h @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#pragma once +#include +using namespace std::chrono; +class CTimer +{ + +private: + + steady_clock::time_point startPerfomanceCount; + steady_clock::time_point endPerfomanceCount; + duration totalElapsed; +public: + char m_szMessage[1024]; + +public: + CTimer() + { + Initialize(); + } + + CTimer(bool bStartOnCreate) + { + Initialize(); + + if (bStartOnCreate) + { + Start(); + } + } + + void Initialize() + { + totalElapsed = duration(); + } + + void Start() + { + startPerfomanceCount = std::chrono::steady_clock::now(); + } + + // time unit: seconds + void Tag(const char* pszMsg = NULL) + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + OutputStatistics(pszMsg); + //start next round + Start(); + } + + // time unit: seconds + void InnerTag() + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + + OutputStatistics(m_szMessage); + + //start next round + Start(); + } + + float GetTotalElaps() + { + return totalElapsed.count(); + } + float GetTimeSpan() + { + endPerfomanceCount = std::chrono::steady_clock::now(); + totalElapsed += duration_cast> (endPerfomanceCount - startPerfomanceCount); + float timespent = totalElapsed.count(); + + //start next round + Start(); + + return timespent; + } + + float GetTaggedTimeSpan() + { + return duration_cast> (endPerfomanceCount - startPerfomanceCount).count(); + } + + void OutputStatistics(const char* pszMsg = NULL) + { + printf("Time Cost totally: %f, last time span(%s): %f seconds.\n", GetTotalElaps(), pszMsg, GetTaggedTimeSpan()); + } + +private: + CTimer(const CTimer& obj); +}; \ No newline at end of file diff --git a/src/Native/LdaNative/type_common.h b/src/Native/LdaNative/type_common.h new file mode 100644 index 0000000000..a7043b28b1 --- /dev/null +++ b/src/Native/LdaNative/type_common.h @@ -0,0 +1,9 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once + +namespace lda { + typedef float real_t; +} \ No newline at end of file diff --git a/src/Native/LdaNative/utils.cpp b/src/Native/LdaNative/utils.cpp new file mode 100644 index 0000000000..c1c5cee076 --- /dev/null +++ b/src/Native/LdaNative/utils.cpp @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +#include "utils.hpp" + +#include "math.h" +#include + +namespace { + const double cof[6] = { 76.18009172947146, -86.50532032941677, + 24.01409824083091, -1.231739572450155, + 0.1208650973866179e-2, -0.5395239384953e-5 + }; +} + +namespace lda { + + double LogGamma(double xx) + { + int j; + double x, y, tmp1, ser; + y = xx; + x = xx; + tmp1 = x + 5.5; + tmp1 -= (x + 0.5)*log(tmp1); + ser = 1.000000000190015; + for (j = 0; j < 6; j++) ser += cof[j] / ++y; + return -tmp1 + log(2.5066282746310005*ser / x); + } + + + double get_time() { + auto start = std::chrono::high_resolution_clock::now(); + auto since_epoch = start.time_since_epoch(); + return std::chrono::duration_cast>>(since_epoch).count(); + } + + void CBlockedIntQueue::clear() + { + std::lock_guard lock(_mutex); + _queue.clear(); + } + + int CBlockedIntQueue::pop() + { + std::unique_lock lock(_mutex); + _condition.wait(lock, [this] { return !_queue.empty(); }); + auto val = _queue.front(); + _queue.pop_front(); + return val; + } + + void CBlockedIntQueue::push(int value) + { + { + std::lock_guard lock(_mutex); + _queue.push_back(value); + } + _condition.notify_one(); + } +} diff --git a/src/Native/LdaNative/utils.hpp b/src/Native/LdaNative/utils.hpp new file mode 100644 index 0000000000..7b71ec67da --- /dev/null +++ b/src/Native/LdaNative/utils.hpp @@ -0,0 +1,49 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +#pragma once +#define NOMINMAX + +#include +#include +#include +#include +#include + + +namespace lda { + + double LogGamma(double xx); + double get_time(); + + struct LDAEngineAtomics + { + LDAEngineAtomics() :doc_ll_(0), word_ll_(0), num_tokens_clock_(0), thread_counter_(0){} + ~LDAEngineAtomics() {} + + std::atomic doc_ll_; + std::atomic word_ll_; + + // # of tokens processed in a Clock() call. + std::atomic num_tokens_clock_; + std::atomic thread_counter_; + + std::mutex global_mutex_; + }; + + class CBlockedIntQueue + { + public: + void clear(); + int pop(); + void push(int value); + + private: + std::mutex _mutex; + std::condition_variable _condition; + std::deque _queue; + }; + + +} diff --git a/src/Native/build.proj b/src/Native/build.proj index 5074517fda..c091a78c43 100644 --- a/src/Native/build.proj +++ b/src/Native/build.proj @@ -74,6 +74,9 @@ + + diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index a6d1f50668..a5b66052f7 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -76,6 +76,7 @@ Transforms.KeyToTextConverter KeyToValueTransform utilizes KeyValues metadata to Transforms.LabelColumnKeyBooleanConverter Transforms the label to either key or bool (if needed) to make it suitable for classification. Microsoft.ML.Runtime.EntryPoints.FeatureCombiner PrepareClassificationLabel Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+ClassificationLabelInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelIndicator Label remapper used by OVA Microsoft.ML.Runtime.Data.LabelIndicatorTransform LabelIndicator Microsoft.ML.Runtime.Data.LabelIndicatorTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelToFloatConverter Transforms the label to float to make it suitable for regression. 
Microsoft.ML.Runtime.EntryPoints.FeatureCombiner PrepareRegressionLabel Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+RegressionLabelInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput +Transforms.LightLda The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. Microsoft.ML.Runtime.Transforms.TextAnalytics LightLda Microsoft.ML.Runtime.TextAnalytics.LdaTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LogMeanVarianceNormalizer Normalizes the data based on the computed mean and variance of the logarithm of the data. Microsoft.ML.Runtime.Data.Normalize LogMeanVar Microsoft.ML.Runtime.Data.NormalizeTransform+LogMeanVarArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.LpNormalizer Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. Microsoft.ML.Runtime.Data.LpNormalization Normalize Microsoft.ML.Runtime.Data.LpNormNormalizerTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.ManyHeterogeneousModelCombiner Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel. Microsoft.ML.Runtime.EntryPoints.ModelOperations CombineModels Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index a5cb656da9..b15d04c860 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -14831,6 +14831,388 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.LightLda", + "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.", + "FriendlyName": "Latent Dirichlet Allocation Transform", + "ShortName": "LightLda", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NumTopic", + "Type": "Int", + "Desc": "The number of topics in the LDA", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", + "Aliases": [ + "llInterval" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": 
true, + "Default": null + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of training threads", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", + "Aliases": [ + "maxNumToken" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": 10 + }, + { + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", + "Aliases": [ + "reset" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:srcs)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 49.0, + "IsNullable": false + }, + { + "Name": "NumTopic", + "Type": "Int", + "Desc": "The number of topics in the LDA", + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 40, + 100, + 200 + ] + } + }, + { + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", + "Aliases": [ + "maxNumToken" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 512 + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of training threads. 
Default value depends on number of logical processors.", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 100.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 100, + 200 + ] + } + }, + { + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.015, + 0.07, + 0.02 + ] + } + }, + { + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 2, + 4, + 8, + 16 + ] + } + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 200, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 100, + 200, + 300, + 400 + ] + } + }, + { + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", + "Aliases": [ + "llInterval" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5 + }, + { + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10 + }, + { + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 10, + 20, + 30, + 40 + ] + } + }, + { + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", + "Aliases": [ + "reset" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "OutputTopicWordSummary", + "Type": "Bool", + "Desc": "Whether to output the topic-word summary in text format", + "Aliases": [ + "summary" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.LogMeanVarianceNormalizer", "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", diff --git a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj index 9f38858721..5a66e5bbcd 100644 --- a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj +++ b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj @@ -20,6 +20,7 @@ + \ No newline at end of file diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 60e79a943d..b76c4ba24c 100644 --- 
a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
+++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -1116,6 +1116,39 @@ public void EntryPointPcaTransform()
             });
         }
 
+        [Fact]
+        public void EntryPointLightLdaTransform()
+        {
+            string dataFile = DeleteOutputPath("SavePipe", "SavePipeTextLightLda-SampleText.txt");
+            File.WriteAllLines(dataFile, new[] {
+                "The quick brown fox jumps over the lazy dog.",
+                "The five boxing wizards jump quickly."
+            });
+
+            TestEntryPointPipelineRoutine(dataFile, "sep={ } col=T:TX:0-**",
+                new[]
+                {
+                    "Transforms.TextFeaturizer",
+                    "Transforms.LightLda"
+                },
+                new[]
+                {
+                    @"'Column': {
+                        'Name': 'T',
+                        'Source': [
+                            'T'
+                        ]
+
+                    },
+                    'VectorNormalizer': 'None'",
+                    @"'Column': [
+                        {
+                            'Name': 'T',
+                            'Source': 'T'
+                        }]"
+                });
+        }
+
         [Fact]
         public void EntryPointAveragePerceptron()
         {
diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
new file mode 100644
index 0000000000..c598879795
--- /dev/null
+++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
@@ -0,0 +1,148 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Float = System.Single;
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Microsoft.ML.Runtime.CommandLine;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Data.IO;
+using Microsoft.ML.Runtime.Internal.Utilities;
+using Microsoft.ML.Runtime.Model;
+using Microsoft.ML.Runtime.TextAnalytics;
+using Xunit;
+
+namespace Microsoft.ML.Runtime.RunTests
+{
+    /// <summary>
+    /// A class for non-baseline data pipe tests.
+    /// </summary>
+    public sealed partial class TestDataPipeNoBaseline : TestDataViewBase
+    {
+        [Fact]
+        public void TestLDATransform()
+        {
+            var builder = new ArrayDataViewBuilder(Env);
+            var data = new[]
+            {
+                new[] { (Float)1.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)1.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)1.0 },
+            };
+
+            builder.AddColumn("F1V", NumberType.Float, data);
+
+            var srcView = builder.GetDataView();
+
+            LdaTransform.Column col = new LdaTransform.Column();
+            col.Source = "F1V";
+            col.NumTopic = 20;
+            col.NumTopic = 3;
+            col.NumSummaryTermPerTopic = 3;
+            col.AlphaSum = 3;
+            col.NumThreads = 1;
+            col.ResetRandomGenerator = true;
+            LdaTransform.Arguments args = new LdaTransform.Arguments();
+            args.Column = new LdaTransform.Column[] { col };
+
+            LdaTransform ldaTransform = new LdaTransform(Env, args, srcView);
+
+            using (var cursor = ldaTransform.GetRowCursor(c => true))
+            {
+                var resultGetter = cursor.GetGetter<VBuffer<Float>>(1);
+                VBuffer<Float> resultFirstRow = new VBuffer<Float>();
+                VBuffer<Float> resultSecondRow = new VBuffer<Float>();
+                VBuffer<Float> resultThirdRow = new VBuffer<Float>();
+
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultFirstRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultSecondRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultThirdRow);
+                Assert.False(cursor.MoveNext());
+
+                Assert.True(resultFirstRow.Length == 3);
+                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultSecondRow.Length == 3);
+                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultThirdRow.Length == 3);
+                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
+            }
+
+            using (var cursor = ldaTransform.GetRowCursor(c => true))
+            {
+                var resultGetter = cursor.GetGetter<VBuffer<Float>>(1);
+                VBuffer<Float> resultFirstRow = new VBuffer<Float>();
+                VBuffer<Float> resultSecondRow = new VBuffer<Float>();
+                VBuffer<Float> resultThirdRow = new VBuffer<Float>();
+
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultFirstRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultSecondRow);
+                Assert.True(cursor.MoveNext());
+                resultGetter(ref resultThirdRow);
+                Assert.False(cursor.MoveNext());
+
+                Assert.True(resultFirstRow.Length == 3);
+                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultSecondRow.Length == 3);
+                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
+                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
+                Assert.True(resultThirdRow.Length == 3);
+                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
+                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
+            }
+        }
+
+        [Fact]
+        public void TestLdaTransformEmptyDocumentException()
+        {
+            var builder = new ArrayDataViewBuilder(Env);
+            var data = new[]
+            {
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+                new[] { (Float)0.0, (Float)0.0, (Float)0.0 },
+            };
+
+            builder.AddColumn("Zeros", NumberType.Float, data);
+
+            var srcView = builder.GetDataView();
+            var col = new LdaTransform.Column()
+            {
+                Source = "Zeros"
+            };
+            var args = new LdaTransform.Arguments()
+            {
+                Column = new[] { col }
+            };
+
+            try
+            {
+                var lda = new LdaTransform(Env, args, srcView);
+            }
+            catch (InvalidOperationException ex)
+            {
+                Assert.Equal(ex.Message, string.Format("The specified documents are all empty in column '{0}'.", col.Source));
+                return;
+            }
+
+            Assert.True(false, "The LDA transform does not throw expected error on empty documents.");
+        }
+    }
+}
diff --git a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
index d9cf8a2f29..6b8c67b6ff 100644
--- a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
+++ b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj
@@ -13,4 +13,9 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
index c4ccb76ae9..d9b1e9e18b 100644
--- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
+++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
@@ -18,5 +18,6 @@
+
\ No newline at end of file
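
For reference, the column-level surface that the tests above drive through the test harness can also be wired up directly. The snippet below is a minimal sketch, not part of this change: it assumes an existing IHostEnvironment (env) and an IDataView (docs) that already carries a numeric bag-of-words vector column; the column names "Tokens" and "Topics" are hypothetical, while every member it touches (Column.Source, Column.Name, NumTopic, AlphaSum, NumThreads, ResetRandomGenerator, Arguments.Column, and the LdaTransform constructor) appears in the hunks or manifest above.

    // Sketch only: configure one LDA column the way TestLDATransform does,
    // assuming `env` (IHostEnvironment) and `docs` (IDataView) already exist.
    var col = new LdaTransform.Column()
    {
        Source = "Tokens",            // input bag-of-words column (hypothetical name)
        Name = "Topics",              // output topic-distribution column (hypothetical name)
        NumTopic = 3,                 // same settings the TestLDATransform test uses
        AlphaSum = 3,
        NumThreads = 1,
        ResetRandomGenerator = true   // re-seed per document, as in the tests
    };
    var args = new LdaTransform.Arguments() { Column = new[] { col } };

    // The resulting transform can be cursored like any data view (the tests call
    // GetRowCursor on it); each row of "Topics" is a VBuffer<float> of length NumTopic.
    var transformed = new LdaTransform(env, args, docs);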