diff --git a/Project.toml b/Project.toml index 3e434c8..1b00b98 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +GoogleDrive = "91feb7a0-3508-11ea-1e8e-afea2c1c9a19" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" @@ -20,6 +21,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" +Transformers = "21ca0261-441d-5938-ace7-c90938fde4d4" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" @@ -34,6 +36,7 @@ Languages = "0.4.3" NNlib = "0.7" StatsBase = "0.33.6" TextAnalysis = "0.7.3" +Transformers = "0.1" WordTokenizers = "0.5.6" Zygote = "0.6.10" julia = "1.6" diff --git a/docs/make.jl b/docs/make.jl index 64de323..4a3f9fb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -12,6 +12,9 @@ makedocs( "Named Entity Recognition" => "ner.md", "Tagging Schemes" => "tagging.md", "Sentiment Analyzer" => "sentiment.md", + "ALBERT" => "ALBERT.md" + "Pretraining Tutorial (ALBERT)" => "Pretraining_Tutorial(ALBERT).md", + "Finetuning Tutorial (ALBERT)" => "Training_tutorial.md" "API References" => "APIReference.md" ], ) diff --git a/docs/src/ALBERT.md b/docs/src/ALBERT.md new file mode 100644 index 0000000..7d87023 --- /dev/null +++ b/docs/src/ALBERT.md @@ -0,0 +1,186 @@ +# ALBERT + +An upgrade to BERT that advances the state-of-the-art performance on 12 NLP tasks + +The success of ALBERT demonstrates the importance of identifying the aspects of a model that give rise to powerful contextual representations. By focusing improvement efforts on these aspects of the model architecture, it is possible to greatly improve both the model efficiency and performance on a wide range of NLP tasks + +The package can be used by NLP researchers and educators , Practitioners and engineers + +## Usage + +The package can be used with the help of other packages: + +- WordTokenizers for Tokenization (Statistical Tokenizer) + +- DataSets and other basic functionality + + Bert uses Sentencepiece unigram model for Tokenization + +## Preprocessing + +ALBERT just like any BERT families takes specific formate of input embeddings + +The model uses 2 types of indices or ids to generate or load token type embedding, segment embedding and position embeddings and also optional attention masks ( to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: `1` for tokens that are NOT MASKED, `0` for MASKED tokens) + +```julia +julia> sample1 = "God is Great! I won a lottery." +julia> sample2 = "If all their conversations in the three months he had been coming to the diner were put together, it was doubtful that they would make a respectable paragraph." +julia> sample3 = "She had the job she had planned for the last three years." 
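+# collect the three sample sentences into a batch; each is tokenized below with the
+# SentencePiece unigram model loaded via WordTokenizers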
+julia> sample = [sample1,sample2,sample3] +julia> using WordTokenizers +julia> spm = load(ALBERT_v1) +WordTokenizers.SentencePieceModel(Dict("▁shots" => (-11.2373, 7281),"▁ordered" => (-9.84973, 1906),"▁doubtful" => (-12.7799, 22569),"▁glancing" => (-11.6676, 10426),"▁disrespect" => (-13.13, 26682),"▁without" => (-8.34227, 367),"▁pol" => (-10.7694, 4828),"chem" => (-12.3713, 17661),"▁1947," => (-11.7544, 11199),"▁kw" => (-10.4402, 3511)…), 2) + +julia> s1 = ids_from_tokens(spm, tokenizer(spm,sample[1])) +julia> s2 = ids_from_tokens(spm, tokenizer(spm,sample[2])) +julia> s3 = ids_from_tokens(spm, tokenizer(spm,sample[3])) +julia> E = Flux.batchseq([s1,s2,s3],1) +julia> E = Flux.stack(E,1) + 32×3 Array{Int64,2}: + 14 14 14 + 2 2 2 + 5649 411 439 + ⋮ + 1 22740 1 + 1 20600 1 + 1 10 1 + +julia> seg_indices = ones(Int, size(E)...) + 32×3 Array{Int64,2}: + 1 1 1 + 1 1 1 + 1 1 1 + ⋮ + 1 1 1 + 1 1 1 + 1 1 1 +``` +**NOTE:** +Special tokens are: +```julia +ids tokens +1 = +2 = +3 = [CLS] +4 = [SEP] +5 = [MASK] +``` + +## TextAnalysis.ALBERT.albert_transformer (ALBERT layer) + +It is just another flux layer implemented on top of Transformers.jl + +```julia + albert_transformer(emb::Int,size::Int, head::Int, ps::Int, layer::Int, inner_group::Int, no_hidden_group::Int; +act = gelu, pdrop = 0.1, attn_pdrop = 0.1) +``` +The A lite Bidirectional Encoder Representations from Transformer(ALBERT) model. +​ +```Julia +(altrans::albert_transformer)(x::T, mask=nothing; all::Bool=false) where T +``` + +eval the albert layer on input `x`. If length `mask` is given (in shape (1, seq_len, batch_size)), mask the attention with `getmask(mask, mask)`. Moreover, set `all` to `true` to get all +outputs of each transformer layer. +```julia +Arguments: + +emb : Dimensionality of vocabulary embeddings +size : Dimensionality of the encoder layers and the pooler layer +head : Number of attention heads for each attention layer in the Transformer encoder +ps : The dimensionality of the “intermediate” (i.e., feed-forward) layer in the +Transformer encoder. +layer : Number of hidden layers in the Transformer encoder +inner_group : The number of inner repetition of attention and ffn. +no_hidden_group : Number of groups for the hidden layers, parameters in the same group are shared +act : The non-linear activation function (function or string) in the encoder and pooler. If string, “gelu”, “relu”, “swish” and “gelu_new” are supported +pdrop : The dropout probability for all fully connected layers in the embeddings, encoder, and pooler +attn_pdrop : The dropout ratio for the attention probabilities. +``` + +## Converted Tensorflow Checkpoints + +Pre-trained tensorflow checkpoint file by [google-research](https://github.com/google-research/ALBERT) to the Julia desired pre-trained model format(i.e. 
BSON) : + +**Version-1 of ALBERT models** +- [Base](https://drive.google.com/drive/u/1/folders/1HHTlS_jBYRE4cG0elITEH7fAkiNmrEgz) from [[link](https://storage.googleapis.com/albert_models/albert_base_v1.tar.gz)] +- [Large](https://drive.google.com/drive/u/1/folders/1HHTlS_jBYRE4cG0elITEH7fAkiNmrEgz) from [[link](https://storage.googleapis.com/albert_models/albert_large_v1.tar.gz)] +- [Xlarge](https://drive.google.com/drive/u/1/folders/1HHTlS_jBYRE4cG0elITEH7fAkiNmrEgz) from [[link](https://storage.googleapis.com/albert_models/albert_xlarge_v1.tar.gz)] +- [Xxlarge](https://drive.google.com/drive/u/1/folders/1HHTlS_jBYRE4cG0elITEH7fAkiNmrEgz) from [[link](https://storage.googleapis.com/albert_models/albert_xxlarge_v1.tar.gz)] + +**Version-2 of ALBERT models** +- [Base](https://drive.google.com/drive/u/1/folders/1DlX_WZacsjt6O8EDaawKJ-x4RWP46Xj-) +- [Large](https://drive.google.com/drive/u/1/folders/1DlX_WZacsjt6O8EDaawKJ-x4RWP46Xj-) +- [Xlarge](https://drive.google.com/drive/u/1/folders/1DlX_WZacsjt6O8EDaawKJ-x4RWP46Xj-) +- [Xxlarge](https://drive.google.com/drive/u/1/folders/1DlX_WZacsjt6O8EDaawKJ-x4RWP46Xj-) + +conversion code can be found [here](https://gist.github.com/tejasvaidhyadev/6c10bdda1f60c3e42472d356ecf3721a) + +## Pretrained models + +The following model version of albert are available : + +```julia +julia> model_version(TextAnalysis.ALBERT.ALBERT_V1) +4-element Array{String,1}: + "albert_base_v1" + "albert_large_v1" + "albert_xlarge_v1" + "albert_xxlarge_v1" + +julia> model_version(TextAnalysis.ALBERT.ALBERT_V2) +4-element Array{String,1}: + "albert_base_v2" + "albert_large_v2" + "albert_xlarge_v2" + "albert_xxlarge_v2" +``` + +To load any of the above models + +```julia +julia> ALBERT.from_pretrained("albert_base_v1") +TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(128), segment = Embed(128), pe = PositionEmbedding(128, max_len=512), postprocessor = Positionwise(LayerNorm(128), Dropout(0.1))), + transformers = albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768), + classifier = + ( + pooler => Dense(768, 768, tanh) + masklm => ( + transform => Chain(Dense(768, 128, gelu), LayerNorm(128)) + output_bias => Array{Float32,1} + ) + nextsentence => Chain(Dense(768, 2), logsoftmax) + ) +) +``` + +## Fine-tuning + +To fine-tune albert on any of the downstream task , we need to replace classifier head from TransformerModel structure + +```julia +julia> using Flux +julia> using Transformers.Basic +# lets say we are finetuing on sentence classification +julia> clf = Flux.Chain( + Flux.Dropout(0.1), + Flux.Dense(768, 2), Flux.logsoftmax,) +Chain(Dropout(0.1), Dense(768, 2), logsoftmax) + +julia>Basic.set_classifier(model, + ( pooler = transformers.classifier.pooler, + clf = clf )) + +Basic.set_classifier(model, (pooler = transformers.classifier.pooler,clf = clf)) +TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(128), segment = Embed(128), pe = PositionEmbedding(128, max_len=512), postprocessor = Positionwise(LayerNorm(128), Dropout(0.1))), + transformers = albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768), + classifier = + ( + pooler => Dense(768, 768, tanh) + clf => Chain(Dropout(0.1), Dense(768, 2), logsoftmax) + ) +) +``` + diff --git a/docs/src/Pretraining_Tutorial(ALBERT).md b/docs/src/Pretraining_Tutorial(ALBERT).md new file mode 100644 index 0000000..d2e6ec0 --- /dev/null +++ b/docs/src/Pretraining_Tutorial(ALBERT).md @@ -0,0 +1,598 @@ + +## ALBERT 
+ The success of ALBERT demonstrates the importance of identifying the aspects of a model that give rise to powerful contextual representations. By focusing improvement efforts on these aspects of the model architecture, it is possible to greatly improve both the model efficiency and performance on a wide range of NLP tasks + +Get the IPYNB - [here](https://github.com/tejasvaidhyadev/ALBERT.jl/blob/master/docs/Pretraining_Tutorial(ALBERT).ipynb) + +## Pretraining +In this tutorial we are going to pre-train our albert model + +Inspired by https://nextjournal.com/chengchingwen/jsoc-2019-blog3end-of-phase-two-bert-model-in-julia + +## Julia- Flux ALBERT +It very easy and similar to any of the other Flux layer for training + + +```julia +using TextAnalysis +``` + +~ *ignore all the warning as TextAnalysis is checked out for developement* + + +```julia +using TextAnalysis.ALBERT # it is where our model reside +``` + +#### we are going to use DataDeps for handling download of pretrained model of ALBERT +- For now we are directly laoding +- other pretrained Weights can be found [here](https://drive.google.com/drive/u/1/folders/1HHTlS_jBYRE4cG0elITEH7fAkiNmrEgz) + + +```julia +using WordTokenizers +using Random +``` + +loading spm tokenizer for albert + + +```julia +spm = load(ALBERT_V1) +``` + +**Output** + + + WordTokenizers.SentencePieceModel(Dict("▁shots" => (-11.2373, 7281),"▁ordered" => (-9.84973, 1906),"▁doubtful" => (-12.7799, 22569),"▁glancing" => (-11.6676, 10426),"▁disrespect" => (-13.13, 26682),"▁without" => (-8.34227, 367),"▁pol" => (-10.7694, 4828),"chem" => (-12.3713, 17661),"▁1947," => (-11.7544, 11199),"▁kw" => (-10.4402, 3511)…), 2) + + + +`masksentence` - API to preprocess input text by appling mask for MLM task + + +```julia +function masksentence(words, + spm; + mask_token = "[MASK]", + mask_ratio = 0.15, + real_token_ratio = 0.1, + random_token_ratio = 0.1) + +tokens = spm(words) +masked_idx = randsubseq(1:length(tokens), mask_ratio) + +masked_tokens = copy(tokens) + + for idx ∈ masked_idx + r = rand() + if r <= random_token_ratio + masked_tokens[idx] = rand(keys(spm.vocab_map)) + elseif r > real_token_ratio + random_token_ratio + masked_tokens[idx] = mask_token + end + end + + return masked_tokens, tokens, masked_idx +end +``` + +**Output** + + + masksentence (generic function with 1 method) +Lets check the example + +```julia +masksentence("i love julia language",spm; + mask_token = "[MASK]", + mask_ratio = 0.15, + real_token_ratio = 0.1, + random_token_ratio = 0.1) +``` + +**Output** + + + (["▁i", "▁love", "▁julia", "▁language"], ["▁i", "▁love", "▁julia", "▁language"], Int64[]) + + +We will be using Tokenizer from WordTokenizers + +```julia +using Random +using WordTokenizers + +albert_pretrain_task(sentences, + spm, + sentences_pool = sentences; + channel_size = 100, + kwargs... + ) +API for pretraining + +function albert_pretrain_task(sentences, + spm, + sentences_pool = sentences; + channel_size = 100, + kwargs... + ) + chn = Channel(channel_size) + task = @async albert_pretrain_task(chn, sentences, wordpiece, sentences_pool; kwargs...) 
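+    # NOTE (assumption): `wordpiece` above looks carried over from the BERT version of this
+    # helper; the channel method defined below takes the SentencePiece model, so `spm` is
+    # presumably the intended third argument.
+    # `bind` ties the channel to the producer task, closing the channel once the task finishes.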
+ bind(chn, task) + chn +end +``` + +**Output** + + + albert_pretrain_task (generic function with 6 methods) + + +```julia +function albert_pretrain_task(chn::Channel, + sentences, + spm, + sentences_pool = sentences; + start_token = "[CLS]", + sep_token = "[SEP]", + mask_token = "[MASK]", + mask_ratio = 0.15, + real_token_ratio = 0.1, + random_token_ratio = 0.1, + whole_word_mask = false, + next_sentence_ratio = 0.5, + next_sentence = true, + return_real_sentence = false) + + foreach(enumerate(sentences)) do (i, sentence) + sentenceA = masksentence( + sentence, + spm; + mask_token = mask_token, + mask_ratio = mask_ratio, + real_token_ratio = real_token_ratio, + random_token_ratio = random_token_ratio) + sentenceB = masksentence( + sentences[i+1], + spm; + mask_token = mask_token, + mask_ratio = mask_ratio, + real_token_ratio = real_token_ratio, + random_token_ratio = random_token_ratio) + + if next_sentence + if rand() <= next_sentence_ratio && i != length(sentences) + isnext = true + else + temp = sentenceB + sentenceB = sentenceA + sentenceA = temp + isnext = false + end + + masked_sentence = _wrap_sentence(sentenceA[1], + sentenceB[1]; + start_token = start_token, + sep_token = sep_token) + + sentence = _wrap_sentence(sentenceA[2], + sentenceB[2]; + start_token = start_token, + sep_token = sep_token) #implemented below + + mask_idx = _wrap_idx(sentenceA[3], + sentenceB[3], + length(sentenceA[1])) #implemented below + else + masked_sentence = _wrap_sentence(sentenceA[1]; + start_token = start_token, + sep_token = sep_token) + + sentence = _wrap_sentence(sentenceA[2]; + start_token = start_token, + sep_token = sep_token) + + mask_idx = _wrap_idx(sentenceA[3]) + end + + masked_token = sentence[mask_idx] + + if return_real_sentence + if next_sentence + put!(chn, (masked_sentence, mask_idx, masked_token, isnext, sentence)) + else + put!(chn, (masked_sentence, mask_idx, masked_token, sentence)) + end + else + if next_sentence + put!(chn, (masked_sentence, mask_idx, masked_token, isnext)) + else + put!(chn, (masked_sentence, mask_idx, masked_token)) + end + end + end +end + +``` + +**Output** + + + albert_pretrain_task (generic function with 6 methods) +Some helper function + +```julia +function _wrap_sentence(sentence1, sentence2...; start_token = "[CLS]", sep_token = "[SEP]") + pushfirst!(sentence1, start_token) + push!(sentence1, sep_token) + map(s->push!(s, sep_token), sentence2) + vcat(sentence1, sentence2...) +end + +_wrap_idx(sentence1_idx, pre_len = 1) = sentence1_idx .+= pre_len +function _wrap_idx(sentence1_idx, sentence2_idx, len1) + _wrap_idx(sentence1_idx) + _wrap_idx(sentence2_idx, len1) + vcat(sentence1_idx, sentence2_idx) +end +``` + +**Output** + + + _wrap_idx (generic function with 3 methods) + + +```julia +function albert_pretrain_task(outchn::Channel, + datachn::Channel, + spm; + buffer_size = 100, + kwargs... + ) + task = @async begin + buffer = Vector(undef, buffer_size) + while isopen(datachn) + i = 1 + eod = false + while i <= buffer_size + try + sentence = take!(datachn) + if isempty(sentence) + continue + else + buffer[i] = sentence + i+=1 + end + catch e + if isa(e, InvalidStateException) && e.state==:closed + eod = true + break + else + rethrow() + end + end + end + + i -= 1 + + if eod || i == buffer_size + albert_pretrain_task(outchn, @view(buffer[1:(eod ? i - 1 : i)]), spm; kwargs...) 
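+                # hand the buffered sentences to the sentence-level method defined above once
+                # the buffer is full or the data channel has been drained; note that when `eod`
+                # is true the view excludes the last filled slot, which may be an off-by-one
+                # worth double-checking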
+ end + end + end + bind(outchn, task) +end + +``` + +**Output** + + + albert_pretrain_task (generic function with 6 methods + + + +```julia +function albert_pretrain_task(datachn::Channel, + spm; + buffer_size = 100, + channel_size = 100, + kwargs... + ) + outchn = Channel(channel_size) + bert_pretrain_task(outchn, datachn, spm; buffer_size = buffer_size, kwargs...) + outchn +end +``` + +**Output** + + + albert_pretrain_task (generic function with 6 methods) + + +### Test Corpus + +```julia +# one document from wiki dump, just for illustration +docs = """ +Guy Fawkes (; 13 April 1570�罱�� 31 January 1606), also known as Guido Fawkes while fighting for the Spanish, was a member of a group of provincial English Catholics who planned the failed Gunpowder Plot of 1605. He was born and educated in York, England; his father died when Fawkes was eight years old, after which his mother married a recusant Catholic. + +Fawkes converted to Catholicism and left for mainland Europe, where he fought for Catholic Spain in the Eighty Years' War against Protestant Dutch reformers in the Low Countries. He travelled to Spain to seek support for a Catholic rebellion in England without success. He later met Thomas Wintour, with whom he returned to England, and Wintour introduced him to Robert Catesby, who planned to assassinate and restore a Catholic monarch to the throne. The plotters leased an undercroft beneath the House of Lords, and Fawkes was placed in charge of the gunpowder which they stockpiled there. The authorities were prompted by an anonymous letter to search Westminster Palace during the early hours of 5 November, and they found Fawkes guarding the explosives. He was questioned and tortured over the next few days, and he finally confessed. + +Immediately before his execution on 31 January, Fawkes fell from the scaffold where he was to be hanged and broke his neck, thus avoiding the agony of being hanged, drawn and quartered. He became synonymous with the Gunpowder Plot, the failure of which has been commemorated in Britain as Guy Fawkes Night since 5 November 1605, when his effigy is traditionally burned on a bonfire, commonly accompanied by fireworks. + +Guy Fawkes was born in 1570 in Stonegate, York. He was the second of four children born to Edward Fawkes, a proctor and an advocate of the consistory court at York, and his wife, Edith. Guy's parents were regular communicants of the Church of England, as were his paternal grandparents; his grandmother, born Ellen Harrington, was the daughter of a prominent merchant, who served as Lord Mayor of York in 1536. Guy's mother's family were recusant Catholics, and his cousin, Richard Cowling, became a Jesuit priest. "Guy" was an uncommon name in England, but may have been popular in York on account of a local notable, Sir Guy Fairfax of Steeton. + +The date of Fawkes's birth is unknown, but he was baptised in the church of St Michael le Belfrey on 16 April. As the customary gap between birth and baptism was three days, he was probably born about 13 April. In 1568, Edith had given birth to a daughter named Anne, but the child died aged about seven weeks, in November that year. She bore two more children after Guy: Anne (b. 1572), and Elizabeth (b. 1575). Both were married, in 1599 and 1594 respectively. + +In 1579, when Guy was eight years old, his father died. His mother remarried several years later, to the Catholic Dionis Baynbrigge (or Denis Bainbridge) of Scotton, Harrogate. 
Fawkes may have become a Catholic through the Baynbrigge family's recusant tendencies, and also the Catholic branches of the Pulleyn and Percy families of Scotton, but also from his time at St. Peter's School in York. A governor of the school had spent about 20�懢ears in prison for recusancy, and its headmaster, John Pulleyn, came from a family of noted Yorkshire recusants, the Pulleyns of Blubberhouses. In her 1915 work "The Pulleynes of Yorkshire", author Catharine Pullein suggested that Fawkes's Catholic education came from his Harrington relatives, who were known for harbouring priests, one of whom later accompanied Fawkes to Flanders in 1592��1593. Fawkes's fellow students included John Wright and his brother Christopher (both later involved with Fawkes in the Gunpowder Plot) and Oswald Tesimond, Edward Oldcorne and Robert Middleton, who became priests (the latter executed in 1601). +""" +``` + +**Output** + + + "Guy Fawkes (; 13 April 1570�罱�� 31 January 1606), also known as Guido Fawkes while fighting for the Spanish, was a member of a group of provincial English Catholics who planned the failed Gunpowder Plot of 1605. He was born and educated in York, England; his father died when Fawkes was eight years old, after which his mother married a recusant Catholic.\n\nFawkes converted to Catholicism and left for mainland Europe, where he fought for Catholic Spain in the Eighty Years' War against Protestant Dutch reformers in the Low Countries. He travelled to Spain to seek support for a Catholic rebellion in England without success. He later met Thomas Wintour, with whom he returned to England, and Wintour introduced him to Robert Catesby, who planned to assassinate and restore a Catholic monarch to the throne. The plotters leased an undercroft beneath the House of Lords, and Fawkes was placed in charge of the gunpowder which they stockpiled there. The authorities were prompted by an anonymous letter to search Westminster Palace during the early hours of 5 November, and they found Fawkes guarding the explosives. He was questioned and tortured over the next few days, and he finally confessed.\n\nImmediately before his execution on 31 January, Fawkes fell from the scaffold where he was to be hanged and broke his neck, thus avoiding the agony of being hanged, drawn and quartered. He became synonymous with the Gunpowder Plot, the failure of which has been commemorated in Britain as Guy Fawkes Night since 5 November 1605, when his effigy is traditionally burned on a bonfire, commonly accompanied by fireworks.\n\nGuy Fawkes was born in 1570 in Stonegate, York. He was the second of four children born to Edward Fawkes, a proctor and an advocate of the consistory court at York, and his wife, Edith. Guy's parents were regular communicants of the Church of England, as were his paternal grandparents; his grandmother, born Ellen Harrington, was the daughter of a prominent merchant, who served as Lord Mayor of York in 1536. Guy's mother's family were recusant Catholics, and his cousin, Richard Cowling, became a Jesuit priest. \"Guy\" was an uncommon name in England, but may have been popular in York on account of a local notable, Sir Guy Fairfax of Steeton.\n\nThe date of Fawkes's birth is unknown, but he was baptised in the church of St Michael le Belfrey on 16 April. As the customary gap between birth and baptism was three days, he was probably born about 13 April. In 1568, Edith had given birth to a daughter named Anne, but the child died aged about seven weeks, in November that year. 
She bore two more children after Guy: Anne (b. 1572), and Elizabeth (b. 1575). Both were married, in 1599 and 1594 respectively.\n\nIn 1579, when Guy was eight years old, his father died. His mother remarried several years later, to the Catholic Dionis Baynbrigge (or Denis Bainbridge) of Scotton, Harrogate. Fawkes may have become a Catholic through the Baynbrigge family's recusant tendencies, and also the Catholic branches of the Pulleyn and Percy families of Scotton, but also from his time at St. Peter's School in York. A governor of the school had spent about 20�懢ears in prison for recusancy, and its headmaster, John Pulleyn, came from a family of noted Yorkshire recusants, the Pulleyns of Blubberhouses. In her 1915 work \"The Pulleynes of Yorkshire\", author Catharine Pullein suggested that Fawkes's Catholic education came from his Harrington relatives, who were known for harbouring priests, one of whom later accompanied Fawkes to Flanders in 1592��1593. Fawkes's fellow students included John Wright and his brother Christopher (both later involved with Fawkes in the Gunpowder Plot) and Oswald Tesimond, Edward Oldcorne and Robert Middleton, who became priests (the latter executed in 1601).\n" + + +**Lets Pretrain the model** + +```julia +using WordTokenizers + +chn = Channel(3) + +sentences = split_sentences(docs) +task = @async foreach(sentences) do sentence + if !isempty(sentence) + put!(chn, sentence) + end +end +bind(chn, task) +``` + +**Output** + + + Channel{Any}(sz_max:3,sz_curr:3) + + +Lets check our `albert_pretrain_task` + + +```julia +using Transformers.Basic +using Transformers +``` + + +```julia +datas = albert_pretrain_task(chn, spm) +batch = get_batch(datas ,1) +``` + +**Output** + + + 4-element Array{Array{T,1} where T,1}: + [["[CLS]", "▁", "H", "[MASK]", "▁was", "[MASK]", "▁and", "▁tortured", "▁over", "▁the" … "▁found", "▁", "F", "aw", "kes", "▁guarding", "▁the", "[MASK]", ".", "[SEP]"]] + [[4, 6, 14, 23, 24, 30, 41, 58, 61]] + [["e", "▁questioned", ",", "he", "▁authorities", "▁letter", "▁during", "kes", "▁explosives"]] + Bool[0] + + + +Seems like it is working fine + + +```julia +masked_sentence, mask_idx, masked_token, isnext = get_batch(datas, 1) +``` + +**Output** + + + 4-element Array{Array{T,1} where T,1}: + [["[CLS]", "▁", "H", "e", "▁was", "▁questioned", "▁and", "▁tortured", "▁over", "▁the" … "▁being", "▁hanged", ",", "▁drawn", "▁and", "▁qu", "arte", "red", ".", "[SEP]"]] + [[9, 19, 25, 31, 38, 46, 55, 58]] + [["▁over", ".", "ate", "▁31", "F", "▁he", ",", "▁the"]] + Bool[1] + + + +We will be using following libary as shown below + + +```julia +using TextAnalysis.ALBERT +using Transformers.Basic +vocab = keys(spm.vocab_map) +``` + +**Output** + + + Base.KeySet for a Dict{String,Tuple{Float64,Int64}} with 30000 entries. 
Keys: + "▁shots" + "▁ordered" + "▁doubtful" + "▁glancing" + "▁disrespect" + "▁without" + "▁pol" + "chem" + "▁1947," + "▁kw" + "▁calcutta" + "mh" + "▁rumors" + "▁maharaja" + "▁125" + "▁xanth" + "rha" + "▁pound" + "lunk" + "▁spaniards" + "▁ulcer" + "henry" + "228" + "izes" + "▁assist" + ⋮ + + + +### lets define embedding layers +The Embed is similar to nn.model in pytorch and is already implemented in Transformers + + +```julia +emb = CompositeEmbedding( + tok = Embed(300, length(vocab)), + pe = PositionEmbedding(300, 512; trainable=false), + seg = Embed(300, 2) +) + +``` + +**Output** + + +```julia +CompositeEmbedding(tok = Embed(300), pe = PositionEmbedding(300), seg = Embed(300)) +``` + + +```julia +using Flux:onehotbatch +``` + +`TransformerModel` is structure to holding embedding, transformers and classifier + + +```julia +albert = ALBERT.albert_transformer(300,300,12,512,3,1,1) # defining albert_trainformer +masklm = Flux.Dense(300,300) # masklm classifier +nextsentence = Flux.Chain(Flux.Dense(300, 2), Flux.logsoftmax) # nextsentence classifiers + +albert_model = TransformerModel(emb, albert, (mlm=masklm, ns = nextsentence)) #struture to hold everything +``` + +**Output** + + + TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(300), pe = PositionEmbedding(300), seg = Embed(300)), + transformers = albert(layers=3, head=12, head_size=25, pwffn_size=512, size=300), + classifier = + ( + mlm => Dense(300, 300) + ns => Chain(Dense(300, 2), logsoftmax) + ) + ) + +or we can you use TextAnalysis.ALBERT.create_albert + +### Preprocess +`preprocess`- It will take care of proprocessing of text before moving it to model + + +```julia +function preprocess(training_batch) + mask = getmask(training_batch[1]) + tok = [(ids_from_tokens(spm,i)) for i in training_batch[1]] + tok = Flux.batchseq(tok,1) + tok = Flux.stack(tok,1) + segment = fill!(similar(tok), 1.0) + length(tok) #output embedding matrix + for (i, sentence) ∈ enumerate(training_batch[1]) + j = findfirst(isequal("[SEP]"), sentence) + if j !== nothing + @view(segment[j+1:end, i]) .= 2.0 + end + end + + ind = vcat( + map(enumerate(batch[2])) do (i, x) + map(j->(j,i), x) + end...) 
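+    # `ind` flattens the per-sentence mask positions into (token_position, batch_index)
+    # pairs, the index layout `masklmloss` expects when gathering the masked-token embeddings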
+ + masklabel = onehotbatch(ids_from_tokens(spm , vcat(batch[3]...)), 1:length(spm.vocab_map)) + nextlabel = onehotbatch(batch[4], (true, false)) +return (tok=tok, seg=segment), ind, masklabel, nextlabel, mask +end + +function loss(data, ind, masklabel, nextlabel, mask = nothing) + e = albert_model.embed(data) + t = albert_model.transformers(e, mask) + nextloss = Basic.logcrossentropy( + nextlabel, + albert_model.classifier.ns( + t[:,1,:] + ) + ) + mkloss = masklmloss(albert_model.embed.embeddings.tok, # embedding table for compute similarity + albert_model.classifier.mlm, # transform function on output embedding + t, # output embeddings + ind, # mask index + masklabel #masked token + ) + return nextloss + mkloss +end + +ps = Flux.params(albert) +opt = Flux.ADAM(1e-4) +``` + +**Output** + + + Flux.Optimise.ADAM(0.0001, (0.9, 0.999), IdDict{Any,Any}()) + + +### Lets get the datas + +```julia +datas = albert_pretrain_task(chn, spm) +``` + +**Output** + + + Channel{Any}(sz_max:100,sz_curr:0) + + + +lets analysis the loss by running 10 epochs + + +```julia +for i ∈ 1:10 # run 10 step for illustration + batch = get_batch(datas, 2) + batch === nothing && break # out of data + data, ind, masklabel, nextlabel, mask = todevice(preprocess(batch)) + l = loss(data, ind, masklabel, nextlabel, mask) + @show l + grad = Flux.gradient(()->loss(data, ind, masklabel, nextlabel, mask), ps) + Flux.update!(opt, ps, grad) +end +``` + + l = 72.28404f0 + l = 73.165596f0 + l = 56.124104f0 + l = 50.31461f0 + l = 51.023262f0 + l = 49.547054f0 + l = 43.89146f0 + l = 38.276382f0 + l = 48.87205f0 + l = 33.408596f0 + + +### Conclusion +As expected loss is converging for our model diff --git a/docs/src/Training_tutorial.md b/docs/src/Training_tutorial.md new file mode 100644 index 0000000..4c6f800 --- /dev/null +++ b/docs/src/Training_tutorial.md @@ -0,0 +1,362 @@ + +## ALBERT Fine tuning Tutorial +In this tutorial, we will be going through usage of SOTA transformers. We will be using ALBERT transformer model for this tutorial. You can check this link to understand more about [ALBERT](https://arxiv.org/abs/1909.11942) + +Get IPYNB [here](https://github.com/tejasvaidhyadev/ALBERT.jl/blob/master/docs/Training_fine-tunning_%20tutorial.ipynb) + +We are going to use the following library for our tutorial +- TextAnlaysis.ALBERT +- WordTokenizer +- Transformers and Flux + + + +```julia +using TextAnalysis +using TextAnalysis.ALBERT # it is where our model reside +``` + +lets checkout the model version avaliable in PretrainedTransformer + + +```julia +subtypes(ALBERT.PretrainedTransformer) +``` + + + + + 2-element Array{Any,1}: + TextAnalysis.ALBERT.ALBERT_V1 + TextAnalysis.ALBERT.ALBERT_V2 + + + +To check different size model + + +```julia +model_version( TextAnalysis.ALBERT.ALBERT_V1) +``` + + + + + 4-element Array{String,1}: + "albert_base_v1" + "albert_large_v1" + "albert_xlarge_v1" + "albert_xxlarge_v1" + + + +Before moving forward let us look at the following basic steps involved in using any transformer, + + ### For preprocessing +- Tokenize the input data and other input details such as Attention Mask for BERT to not ignore the attention on padded sequences. +- Convert tokens to input ID sequences. +- Pad the IDs to a fixed length. + +### For modelling +- Load the model and feed in the input ID sequence (Do it batch wise suitably based on the memory available). 
+- Get the output of the last hidden layer +- Last hidden layer has the sequence representation embedding at 1th index +- These embeddings can be used as the inputs for different machine learning or deep learning models. + + +`WordTokenizer` will handle the Preprocessing part +and `TextAnlaysis` will handle Modelling + + +```julia +transformer = ALBERT.from_pretrained( "albert_base_v2") #here we are using version 1 i.e base +``` + + This program has requested access to the data dependency albert_base_v2. + which is not currently installed. It can be installed automatically, and you will not see this message again. + + albert-weights BSON file converted from official weigths-file by google research . + Website: https://github.com/google-research/albert + Author: Google Research + Licence: Apache License 2.0 + albert base version2 of size ~46 MB download. + + +​ + Do you want to download the dataset from https://drive.google.com/uc?export=download&id=19llahJFvgjQNQ9pzES2XF0R9JdYwuuTk to "/home/iamtejas/.julia/datadeps/albert_base_v2"? + [y/n] + stdin> + Do you want to download the dataset from https://drive.google.com/uc?export=download&id=19llahJFvgjQNQ9pzES2XF0R9JdYwuuTk to "/home/iamtejas/.julia/datadeps/albert_base_v2"? + [y/n] + stdin> y + + + ┌ Info: Downloading + │ source = https://doc-0k-3g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mclfg9m1jrs6lb467a4gk0jrph10oocv/1597362075000/15884229709856900679/*/19llahJFvgjQNQ9pzES2XF0R9JdYwuuTk?e=download + │ dest = /home/iamtejas/.julia/datadeps/albert_base_v2/albert_base_v2.bson + │ progress = NaN + │ time_taken = 5.0 s + │ time_remaining = NaN s + │ average_speed = 6.711 MiB/s + │ downloaded = 33.562 MiB + │ remaining = ∞ B + │ total = ∞ B + └ @ HTTP /home/iamtejas/.julia/packages/HTTP/BOJmV/src/download.jl:119 + ┌ Info: Downloading + │ source = https://doc-0k-3g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mclfg9m1jrs6lb467a4gk0jrph10oocv/1597362075000/15884229709856900679/*/19llahJFvgjQNQ9pzES2XF0R9JdYwuuTk?e=download + │ dest = /home/iamtejas/.julia/datadeps/albert_base_v2/albert_base_v2.bson + │ progress = NaN + │ time_taken = 6.6 s + │ time_remaining = NaN s + │ average_speed = 6.959 MiB/s + │ downloaded = 45.903 MiB + │ remaining = ∞ B + │ total = ∞ B + └ @ HTTP /home/iamtejas/.julia/packages/HTTP/BOJmV/src/download.jl:119 + + + + + + TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(128), segment = Embed(128), pe = PositionEmbedding(128, max_len=512), postprocessor = Positionwise(LayerNorm(128), Dropout(0))), + transformers = albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768), + classifier = + ( + pooler => Dense(768, 768, tanh) + masklm => ( + transform => Chain(Dense(768, 128, gelu), LayerNorm(128)) + output_bias => Array{Float32,1} + ) + nextsentence => Chain(Dense(768, 2), logsoftmax) + ) + ) + + + +Tokenizer + + +```julia +using WordTokenizers +``` + +To get more detail on tokenizer refer the following [blog](https://tejasvaidhyadev.github.io/blog/Hey-Albert) + + +```julia +spm = load(ALBERT_V1,1) #because we are using base-version1 +``` + + + + + WordTokenizers.SentencePieceModel(Dict("▁shots" => (-11.2373, 7281),"▁ordered" => (-9.84973, 1906),"▁doubtful" => (-12.7799, 22569),"▁glancing" => (-11.6676, 10426),"▁disrespect" => (-13.13, 26682),"▁without" => (-8.34227, 367),"▁pol" => (-10.7694, 4828),"chem" => (-12.3713, 17661),"▁1947," => (-11.7544, 11199),"▁kw" => (-10.4402, 3511)…), 2) + + + +we will use 
DataLoader avaliable in [`Transformers`](https://github.com/chengchingwen/Transformers.jl) + +using QNLI Dataseet + + +```julia +using Transformers.Datasets +using Transformers.Datasets.GLUE +using Transformers.Basic +task = GLUE.QNLI() +datas = dataset(Train, task) +``` + + This program has requested access to the data dependency GLUE-QNLI. + which is not currently installed. It can be installed automatically, and you will not see this message again. + + Question NLI (SQuAD2.0 / QNLI) task (GLUE version) + + +​ + Do you want to download the dataset from https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 to "/home/iamtejas/.julia/datadeps/GLUE-QNLI"? + [y/n] + stdin> + Do you want to download the dataset from https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 to "/home/iamtejas/.julia/datadeps/GLUE-QNLI"? + [y/n] + stdin> y + + + ┌ Info: Downloading + │ source = https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 + │ dest = /home/iamtejas/.julia/datadeps/GLUE-QNLI/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 + │ progress = 1.0 + │ time_taken = 1.86 s + │ time_remaining = 0.0 s + │ average_speed = 5.458 MiB/s + │ downloaded = 10.135 MiB + │ remaining = 0 bytes + │ total = 10.135 MiB + └ @ HTTP /home/iamtejas/.julia/packages/HTTP/BOJmV/src/download.jl:119 + + + Archive: QNLIv2.zip + creating: /home/iamtejas/.julia/datadeps/GLUE-QNLI/QNLI/ + inflating: /home/iamtejas/.julia/datadeps/GLUE-QNLI/QNLI/dev.tsv + inflating: /home/iamtejas/.julia/datadeps/GLUE-QNLI/QNLI/test.tsv + inflating: /home/iamtejas/.julia/datadeps/GLUE-QNLI/QNLI/train.tsv + + + +**Output** + + (Channel{String}(sz_max:0,sz_curr:1), Channel{String}(sz_max:0,sz_curr:0), Channel{String}(sz_max:0,sz_curr:0)) + + +```julia +using Flux: onehotbatch +labels = get_labels(task) +``` + +**Output** + + + ("entailment", "not_entailment") + + + +Basic Preprocessing function + + +```julia +makesentence(s1, s2) = ["[CLS]"; s1; "[SEP]"; s2; "[SEP]"] +function preprocess(training_batch) +ids =[] +sent = [] +for i in 1:length(training_batch[1]) + sent1 = tokenizer(spm,training_batch[1][i]) + sent2 = tokenizer(spm,training_batch[2][i]) + id = makesentence(sent1,sent2) + push!(sent, id) + push!(ids,ids_from_tokens(spm,id)) +end + #print(sent) + mask = getmask(convert(Array{Array{String,1},1}, sent)) #better API underprogress + +E = Flux.batchseq(ids,1) +E = Flux.stack(E,1) +length(E) #output embedding matrix +segment = fill!(similar(E), 1) + for (i, sent) ∈ enumerate(sent) + j = findfirst(isequal("[SEP]"), sent) + if j !== nothing + @view(segment[j+1:end, i]) .= 2 + end +end +data = (tok = E,segment = segment) +labels = get_labels(task) +label = onehotbatch(training_batch[3], labels) +return(data,label,mask) +end +``` + +**Output** + + + preprocess (generic function with 1 method) + + + +lets Define loss function + + +```julia +using Flux +using Flux: gradient +import Flux.Optimise: update! 
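+# classification head for the QNLI task: Dropout, a Dense(768, length(labels)) projection and
+# logsoftmax; `set_classifier` below swaps it in for the pretraining masklm/nextsentence heads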
+ +clf = Flux.Chain( + Flux.Dropout(0.1), + Flux.Dense(768, length(labels)), Flux.logsoftmax +) +transformer = gpu( + Basic.set_classifier(transformer, + ( + pooler = transformer.classifier.pooler, + clf = clf + ) + ) +) +@show transformer + +#define the loss +function loss(data, label, mask=nothing) + e = (transformer.embed(data)) + t = (transformer.transformers(e)) + l = logcrossentropy(label, + clf( + transformer.classifier.pooler( + t[:,1,:] + ) + ) + ) + return l +end +``` + + transformer = TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(128), segment = Embed(128), pe = PositionEmbedding(128, max_len=512), postprocessor = Positionwise(LayerNorm(128), Dropout(0))), + transformers = albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768), + classifier = + ( + pooler => Dense(768, 768, tanh) + clf => Chain(Dropout(0.1), Dense(768, 2), logsoftmax) + ) + ) + + + +**Output** + + loss (generic function with 2 methods) + + +```julia +using Flux +using Flux: gradient +import Flux.Optimise: update! + +using CuArrays + +data_batch = get_batch(datas, 2) +data_batch, label_batch, mask =(preprocess(data_batch)) +for i ∈ 1:20 # iteration of 20 cycles over same data to see convergence +#data_batch = get_batch(datas, 2) +#data_batch, label_batch, mask = preprocess(data_batch) +l= loss(data_batch, label_batch, mask) +ps = params(transformer) +opt = ADAM(1e-4) +@show l + grad = gradient(()-> loss(data_batch, label_batch, mask), ps) + update!(opt, ps, grad) +end +``` +**Output** + + l = 0.28236875f0 + l = 0.01652541f0 + l = 0.0030576359f0 + l = 0.0005550342f0 + l = 0.00016245738f0 + l = 1.984803f-5 + l = 0.0002791701f0 + l = 1.1324875f-6 + l = 1.3232057f-5 + l = 0.2661536f0 + l = 1.1324871f-6 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + l = -0.0f0 + diff --git a/src/TextModels.jl b/src/TextModels.jl index 5c88496..5e04923 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -36,8 +36,13 @@ module TextModels include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - - + + # ALBERT + function __init__() + include(joinpath(@__DIR__, "./albert/datadeps.jl")) + end + include("./albert/ALBERT.jl") + # ULMFiT module ULMFiT using TextAnalysis diff --git a/src/albert/ALBERT.jl b/src/albert/ALBERT.jl new file mode 100644 index 0000000..c2e5ecc --- /dev/null +++ b/src/albert/ALBERT.jl @@ -0,0 +1,32 @@ +module ALBERT +using Flux +using Requires +using Requires: @init +using BSON +using Transformers +using Transformers.Basic +using Transformers.Pretrain: isbson, iszip, istfbson, zipname, zipfile, findfile + +export ALBERT +export masklmloss, preprocess_albert, from_pretrained +export model_version, preprocess_albert, create_albert + +abstract type PretrainedTransformer end +abstract type ALBERT_V1 <: PretrainedTransformer end +abstract type ALBERT_V2 <: PretrainedTransformer end + +const pretrained = Dict{DataType, Vector{String}}() + +function model_version(::Type{T}) where T<:PretrainedTransformer + get!(pretrained,T) do + String[] + end +end + +include("utils.jl") +include("model.jl") +include("albert.jl") +include("pretrain.jl") +include("datadeps.jl") +end + # module diff --git a/src/albert/albert.jl b/src/albert/albert.jl new file mode 100644 index 0000000..be998e6 --- /dev/null +++ b/src/albert/albert.jl @@ -0,0 +1,158 @@ +using Flux +using Flux: @functor +using Transformers.Stacks +using MacroTools: @forward + +using Transformers.Basic 
+using Transformers.Basic: AbstractTransformer +using Transformers.Stacks + + +struct ALGroup + ts::Stack + drop::Dropout +end + +@functor ALGroup + +@forward ALGroup.ts Base.getindex, Base.length + +""" + ALGroup(size::Int, head::Int, ps::Int, layer::Int,inner_group::Int; + act = gelu, pdrop = 0.1, attn_pdrop = 0.1) + +layer containing non-shared Transformer layers(multi-headed attention layer + feed-forward NN + Dropout ) + + (bert::albert)(x, mask=nothing; all::Bool=false) + +eval the forward pass on input `x`. If length `mask` is given (in shape (1, seq_len, batch_size)), mask the attention with `getmask(mask, mask)`. Moreover, set `all` to `true` to get all +outputs of each transformer layer. +""" +function ALGroup(size::Int, head::Int, ps::Int, layer::Int,inner_group::Int; + act = gelu, pdrop = 0.1, attn_pdrop = 0.1) + rem(size, head) != 0 && error("size not divisible by head") + ALGroup(size, head, div(size, head), ps, layer, inner_group; act=act, pdrop=pdrop, attn_pdrop=attn_pdrop) +end + +function ALGroup(size::Int, head::Int, hs::Int, ps::Int, layer::Int,inner_group::Int; act = gelu, pdrop = 0.1, attn_pdrop = 0.1) + ALGroup( + Stack( + @nntopo_str("((x, m) => x':(x, m)) => $inner_group"), + [ + Transformer(size, head, hs, ps; future=true, act=act, pdrop=attn_pdrop) # Transformer Encoder from "Attention is all you need" + for i = 1:inner_group + ]... + ), + Dropout(pdrop)) +end + +function (al::ALGroup)(x::T, mask=nothing; all::Bool=false) where T + e = x + if mask === nothing + t, ts = al.ts(e, nothing) + else + t, ts = al.ts(e, getmask(mask, mask)) + end + + if all + if mask !== nothing + ts = map(ts) do ti + ti .* mask + end + end + ts[end], ts + else + t = mask === nothing ? t : t .* mask + t + end +end + +struct albert_transformer <: Transformers.Basic.AbstractTransformer + linear::Dense + al::Array{ALGroup,1} + no_hid::Int + no_inner::Int + no_group::Int +end +@functor albert_transformer + +""" + albert_transformer(emb::Int,size::Int, head::Int, ps::Int, layer::Int, inner_group::Int, no_hidden_group::Int; +act = gelu, pdrop = 0.1, attn_pdrop = 0.1) + +the A lite Bidirectional Encoder Representations from Transformer(ALBERT) model. + + (altrans::albert_transformer)(x::T, mask=nothing; all::Bool=false) where T + +eval the albert layer on input `x`. If length `mask` is given (in shape (1, seq_len, batch_size)), mask the attention with `getmask(mask, mask)`. Moreover, set `all` to `true` to get all +outputs of each transformer layer. + +Arguments: + +emb : Dimensionality of vocabulary embeddings +size : Dimensionality of the encoder layers and the pooler layer +head : Number of attention heads for each attention layer in the Transformer encoder +ps : The dimensionality of the “intermediate” (i.e., feed-forward) layer in the +Transformer encoder. +layer : Number of hidden layers in the Transformer encoder +inner_group : The number of inner repetition of attention and ffn. +no_hidden_group : Number of groups for the hidden layers, parameters in the same group are shared +act : The non-linear activation function (function or string) in the encoder and pooler. If string, “gelu”, “relu”, “swish” and “gelu_new” are supported +pdrop : The dropout probability for all fully connected layers in the embeddings, encoder, and pooler +attn_pdrop : The dropout ratio for the attention probabilities. 
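+
+Example (a minimal construction for illustration; the argument values mirror the base
+configuration used by `create_albert`: emb = 128, size = 768, head = 12, ps = 3072,
+layer = 12, inner_group = 1, no_hidden_group = 1):
+
+    julia> altrans = albert_transformer(128, 768, 12, 3072, 12, 1, 1)
+    albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768)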
+""" +function albert_transformer(emb::Int,size::Int, head::Int, ps::Int, layer::Int,inner_group::Int,no_hidden_group::Int; act = gelu, pdrop = 0.1, attn_pdrop = 0.1) + albert_transformer( + Dense(emb,size), + [ALGroup(size, head, ps,layer,inner_group,act = act ,pdrop= pdrop ,attn_pdrop = attn_pdrop) for i in 1:no_hidden_group], + layer, + inner_group, + no_hidden_group + ) +end +function (altrans::albert_transformer)(x::T, mask=nothing; all::Bool=false) where T + hidden_states = @toNd altrans.linear(x) + for i in 1:altrans.no_hid + layer_per_group = floor(altrans.no_hid/altrans.no_group) + group_idx = Int(floor(i/( (altrans.no_hid + 1) / altrans.no_group))) + 1 + hidden_states = altrans.al[group_idx](hidden_states,mask,all = all) + if all + hidden_states = altrans.al(hidden_states,mask,all = all)[1] + end + end + return(hidden_states) +end + +""" + masklmloss(embed::Embed{T}, transform, + t::AbstractArray{T, N}, posis::AbstractArray{Tuple{Int,Int}}, labels) where {T,N} + masklmloss(embed::Embed{T}, transform, output_bias, + t::AbstractArray{T, N}, posis::AbstractArray{Tuple{Int,Int}}, labels) where {T,N} + +helper function for computing the maks language modeling loss. +Performance `transform(x) .+ output_bias` where `x` is the mask specified by +`posis`, then compute the similarity with `embed.embedding` and crossentropy between true `labels`. +""" +function masklmloss(embed::Embed{T}, transform, t::AbstractArray{T, N}, posis::AbstractArray{Tuple{Int,Int}}, labels) where {T,N} + masktok = gather(t, posis) + sim = logsoftmax(transpose(embed.embedding) * transform(masktok)) + return logcrossentropy(labels, sim) +end + +function masklmloss(embed::Embed{T}, transform, output_bias, t::AbstractArray{T, N}, posis::AbstractArray{Tuple{Int,Int}}, labels) where {T,N} + masktok = gather(t, posis) + sim = logsoftmax(transpose(embed.embedding) * transform(masktok) .+ output_bias) + return logcrossentropy(labels, sim) +end +# output basic structure of albert transfomer +function Base.show(io::IO, altrans::albert_transformer) + hs = div(size(altrans.al[1].ts[1].mh.iqproj.W)[1], altrans.al[1].ts[1].mh.head) + h, ps = size(altrans.al[1].ts[1].pw.dout.W) + + print(io, "albert(") + print(io, "layers=$(altrans.no_hid), ") + print(io, "head=$(altrans.al[1].ts[1].mh.head), ") + print(io, "head_size=$(hs), ") + print(io, "pwffn_size=$(ps), ") + print(io, "size=$(h))") +end diff --git a/src/albert/datadeps.jl b/src/albert/datadeps.jl new file mode 100644 index 0000000..e4123ef --- /dev/null +++ b/src/albert/datadeps.jl @@ -0,0 +1,75 @@ +using GoogleDrive +using DataDeps + +#BSON files is kept in googledrive +vectors_albertversion1 = [ + ("albert_base_v1", + "albert base version1 of size ~46 MB download.", + "786b61a6c1597cf67e43a732cd9edb7e9075e81b5dbb73159acc75238ebc2ea7", + "https://drive.google.com/uc?export=download&id=1RKggDgmlJrSRsx7Ro2eR2hTNuMmzyUJ7"), + ("albert_large_v1", + " albert large version1 of size ~69 MB download.", + "9dac07e26bc6035974afecc89ff18df51ac6d552714799d4d4d4b083342eb2c9", + "https://drive.google.com/uc?export=download&id=1rpfjhpNL0luadP2b2wuuNkU4dNrEcGU0"), + ("albert_xlarge_v1", + "albert xlarge version1 of size ~226 MB download", + "1de4ad94a1b98f5f5f2c75af0f52bc85714d67b8578aa8f7650521bb123335c0", + "https://docs.google.com/uc?export=download&id=1fkYq49OvAHW_BsApTO-mXEWf2Hg8D8Xw"), + ("albert_xxlarge_v1", + "albert xxlarge version1 of size ~825 MB download", + "1de4ad94a1b98f5f5f2c75af0f52bc85714d67b8578aa8f7650521bb123335c0", + 
"https://docs.google.com/uc?export=download&id=1WBbW57UwBU0zZHnIO_pkrbtpmX85ydDD") +] + +for (depname, description, sha, link) in vectors_albertversion1 + register(DataDep(depname, + """ + albert-weights BSON file converted from official Pretrained weigths by google research . + Website: https://github.com/google-research/albert + Author: Google Research + Licence: Apache License 2.0 + $description + """, + link, + sha, + fetch_method = google_download + )) + + append!(model_version(ALBERT_V1), ["$depname"]) +end + +vectors_albertversion2 = [ + ("albert_base_v2", + "albert base version2 of size ~46 MB download.", + "6590ed0aa133b05126c55a5b27362a41baba778f27fff2520df320f3965dd795", + "https://drive.google.com/uc?export=download&id=19llahJFvgjQNQ9pzES2XF0R9JdYwuuTk"), + ("albert_large_v2", + " albert large version2 of size ~69 MB download.", + "18928434ba1c7b9dfc6876b413aa94f0f23bbb79aabb765d0d439a2961238473", + "https://drive.google.com/uc?export=download&id=1bLiJVnJd-V_S51bLsmXx6COYsMJXcusn"), + ("albert_xlarge_v2", + "albert xlarge version2 of size ~226 MB download.", + "0c41c706549fb2f8d8b75372cc0f5aafb055cfa626392432355e20e55d40a71b", + "https://docs.google.com/uc?export=download&id=1Akmp2LdjFUvsZYaBdrAa2PTAK35pzoSm"), + ("albert_xxlarge_v2", + "albert xxlarge version2 of size ~825 MB download.", + "3d7d22cd929b675a26c49342ed77962b54dd55bcfb94c2fef6501cacf9f383d3", + "https://docs.google.com/uc?export=download&id=1f_RjeyvqBJzfurcgZ7i_ItFjWK4eRJLr") +] + +for (depname, description, sha, link) in vectors_albertversion2 + register(DataDep(depname, + """ + albert-weights BSON file converted from official weigths-file by google research . + Website: https://github.com/google-research/albert + Author: Google Research + Licence: Apache License 2.0 + $description + """, + link, + sha, + fetch_method = google_download + )) + + append!(model_version(ALBERT_V2), ["$depname"]) +end diff --git a/src/albert/model.jl b/src/albert/model.jl new file mode 100644 index 0000000..0e43acc --- /dev/null +++ b/src/albert/model.jl @@ -0,0 +1,98 @@ +## loading model for pre-training i.e. 
we will not be loading pretrained weights from bson +using Flux +const config = Dict( + "hidden_act" => gelu, + "embedding" => 128, + "num_hidden_layers" => 12, + "inner_group_num" => 1, + "num_hidden_groups" => 1, + "attention_probs_dropout_prob" => 0, + "hidden_size" => 768, + "max_position_embeddings" => 512, + "hidden_dropout_prob" => 0, + "type_vocab_size" => 2, + "vocab_size" => 30000, #albert use same size of vocab file + "num_attention_heads" => 12, + "intermediate_size" => 3072, +) + +#creating albert model like pretrain struct +#you can define the albert model in the way you like and wrap it with TransformerModel + +function create_albert(emb=config["embedding"], size=config["hidden_size"], head=config["num_attention_heads"], ps=config["intermediate_size"], layer= config["num_hidden_layers"], inner_group=config["inner_group_num"], no_hidden_group=config["num_hidden_groups"]; act=Flux.gelu, pdrop =config["hidden_dropout_prob"], attn_pdrop = config["attention_probs_dropout_prob"],vocab_size=config["vocab_size"], type_vocab_size=config["type_vocab_size"], max_position_embeddings= config["max_position_embeddings"] + ) + albert = albert_transformer( + emb, + size, + head, + ps, + layer, + inner_group, + no_hidden_group + ) +#Dict to hold Token type Embedding +#for Embed refer transformers + + tok_emb = Embed( + emb, + vocab_size + ) + + seg_emb = Embed( + emb, + type_vocab_size + ) + + posi_emb = PositionEmbedding( + emb, + max_position_embeddings; + trainable = true + ) + + emb_post = Positionwise( + LayerNorm( + emb + ), + Dropout( + pdrop + ) + ) + + pooler = Dense( + size, + size, + tanh + ) + + masklm = ( + transform = Chain( + Dense( + emb, + size, + act + ), + LayerNorm( + emb + ) + ), + output_bias = param(randn( + Float32, + vocab_size + )) + ) + + nextsentence = Chain( + Dense( + size, + 2 + ), + logsoftmax + ) + + emb = CompositeEmbedding(tok = tok_emb, pe = posi_emb, segment = seg_emb, postprocessor = emb_post) + + + clf = (pooler = pooler, masklm = masklm, nextsentence = nextsentence) + + TransformerModel(emb, albert, clf) +end diff --git a/src/albert/pretrain.jl b/src/albert/pretrain.jl new file mode 100644 index 0000000..505030f --- /dev/null +++ b/src/albert/pretrain.jl @@ -0,0 +1,317 @@ +# loading pretraining weigths from bson file +using Transformers.Basic +using Flux +using Flux: loadparams! 
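+# each registered DataDep (see datadeps.jl) provides a BSON archive holding `config`,
+# `weights` and `vocab`; `loadparams!` copies those weights into a freshly constructed model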
+using DataDeps +using TextAnalysis.ALBERT +using BSON: @save, @load + +""" + from_pretrained(model::AbstractString = albert_base_v1) where T<:PretrainedTransformer +Intialised and load pretrained weights on top of deps from all the avaliable model in ALBERT + +Example: +julia> transformer = from_pretrained(albert_base_v1) + +TransformerModel{TextAnalysis.ALBERT.albert_transformer}( + embed = CompositeEmbedding(tok = Embed(128), segment = Embed(128), pe = PositionEmbedding(128, max_len=512), postprocessor = Positionwise(LayerNorm(128), Dropout(0.1))), + transformers = albert(layers=12, head=12, head_size=64, pwffn_size=3072, size=768), + classifier = + ( + pooler => Dense(768, 768, tanh) + masklm => ( + transform => Chain(Dense(768, 128, gelu), LayerNorm(128)) + output_bias => Array{Float32,1} + ) + nextsentence => Chain(Dense(768, 2), logsoftmax) + ) +) +""" +function from_pretrained(model::AbstractString = albert_base_v1) where T<:PretrainedTransformer + if model == "albert_base_v1" + filepath = @datadep_str model_version(ALBERT_V1)[1] + name = model_version(ALBERT_V1)[1] + elseif model == "albert_large_v1" + filepath = @datadep_str model_version(ALBERT_V1)[2] + name = model_version(ALBERT_V1)[2] + elseif model == "albert_xlarge_v1" + filepath = @datadep_str model_version(ALBERT_V1)[3] + name = model_version(ALBERT_V1)[3] + elseif model == "albert_xxlarge_v1" + filepath = @datadep_str model_version(ALBERT_V1)[4] + name = model_version(ALBERT_V1)[4] + elseif model == "albert_base_v2" + filepath = @datadep_str model_version(ALBERT_V2)[1] + name = model_version(ALBERT_V2)[1] + elseif model == "albert_large_v2" + filepath = @datadep_str model_version(ALBERT_V2)[2] + name = model_version(ALBERT_V2)[2] + elseif model == "albert_xlarge_v2" + filepath = @datadep_str model_version(ALBERT_V2)[3] + name = model_version(ALBERT_V2)[3] + elseif model == "albert_xxlarge_v2" + filepath = @datadep_str model_version(ALBERT_V2)[4] + name = model_version(ALBERT_V2)[4] + end + filepath = "$filepath/$name"*".bson" + @load filepath config weights vocab + transformer = load_pretrainedalbert(config, weights) + return transformer +end + +#To load activation function from Flux +function get_activation(act_string) + if act_string == "gelu" + gelu + elseif act_string == "relu" + relu + elseif act_string == "tanh" + tanh + elseif act_string == "linear" + identity + else + throw(DomainError(act_string, "activation support: linear, gelu, relu, tanh")) + end +end +_create_classifier(;args...) 
= args.data + +function load_pretrainedalbert(config, weights) + albert = albert_transformer( + config["embedding_size"], + config["hidden_size"], + config["num_attention_heads"], + config["intermediate_size"], + config["num_hidden_layers"], + config["inner_group_num"], + config["num_hidden_groups"]; + act = get_activation(config["hidden_act"]), + pdrop = config["hidden_dropout_prob"], + attn_pdrop = config["attention_probs_dropout_prob"] + ) + # Structure to hold embedding in ALBERT + # tok_embed is used to hold token type embedding + tok_emb = Embed( + config["embedding_size"], + config["vocab_size"] + ) + + # segment is used to hold sentence-segment type embedding + seg_emb = Embed( + config["embedding_size"], + config["type_vocab_size"] + ) + + # Posi_emb is used to hold position embedding + posi_emb = PositionEmbedding( + config["embedding_size"], + config["max_position_embeddings"]; + trainable = true + ) + # post embedding operations + # layerNormalization and Dropout + emb_post = Positionwise( + LayerNorm( + config["embedding_size"] + ), + Dropout( + config["hidden_dropout_prob"] + ) + ) + + #Dict to hold embedding operations and classifiers + embedding = Dict{Symbol, Any}() + classifier = Dict{Symbol, Any}() + + #pooler layer for classification in pretraining + pooler = Dense( + config["hidden_size"], + config["hidden_size"], + tanh + ) + + #masklm or masked language model t + masklm = ( + transform = Chain( + Dense( + config["hidden_size"], + config["embedding_size"], + get_activation(config["hidden_act"]) + ), + LayerNorm( + config["embedding_size"] + ) + ), + output_bias = randn( + Float32, + config["vocab_size"] + ) + ) + + #nextsentecne or Sentence order prediciton layer + nextsentence = Chain( + Dense( + config["hidden_size"], + 2 + ), + logsoftmax + ) + + vnames = keys(weights) + + embeddings_weights = filter(name->occursin("embeddings", name), vnames) + +# loading embedding weights + for k ∈ embeddings_weights + if occursin("LayerNorm/gamma", k) + loadparams!(emb_post[1].diag.α', [weights[k]]) + embedding[:postprocessor] = emb_post + elseif occursin("LayerNorm/beta", k) + loadparams!(emb_post[1].diag.β', [weights[k]]) + elseif occursin("word_embeddings", k) + loadparams!(tok_emb.embedding, [weights[k]]) + embedding[:tok] = tok_emb + elseif occursin("position_embeddings", k) + loadparams!(posi_emb.embedding, [weights[k]]) + embedding[:pe] = posi_emb + elseif occursin("token_type_embeddings", k) + loadparams!(seg_emb.embedding, [weights[k]]) + embedding[:segment] = seg_emb + else + @warn "unknown variable: $k" + end + end + + #albert transformer weights + albert_weights = filter(name->occursin("transformer", name), vnames) + + #loading transformer weights + for j = 1:config["num_hidden_groups"] + group_weights = filter(name->occursin("group_$(j-1)/", name), albert_weights) + for i = 1:config["inner_group_num"] + inner_weigths = filter(name->occursin("inner_group_$(i-1)/", name), group_weights) + for k ∈ inner_weigths + if occursin("inner_group_$(i-1)/attention_1", k) + if occursin("self/key/kernel", k) + loadparams!(albert.al[j][i].mh.ikproj.W, [weights[k]]) + elseif occursin("self/key/bias", k) + loadparams!(albert.al[j][i].mh.ikproj.b', [weights[k]]) + elseif occursin("self/query/kernel", k) + loadparams!(albert.al[j][i].mh.iqproj.W, [weights[k]]) + elseif occursin("self/query/bias", k) + loadparams!(albert.al[j][i].mh.iqproj.b', [weights[k]]) + elseif occursin("self/value/kernel", k) + loadparams!(albert.al[j][i].mh.ivproj.W, [weights[k]]) + elseif 
+                    elseif occursin("self/value/bias", k)
+                        loadparams!(albert.al[j][i].mh.ivproj.b', [weights[k]])
+                    elseif occursin("output/dense/kernel", k)
+                        loadparams!(albert.al[j][i].mh.oproj.W, [weights[k]])
+                    elseif occursin("output/dense/bias", k)
+                        loadparams!(albert.al[j][i].mh.oproj.b', [weights[k]])
+                    else
+                        # @warn "unknown variable: $k"
+                    end
+                elseif occursin("inner_group_$(i-1)/ffn_1/intermediate/dense", k)
+                    if occursin("kernel", k)
+                        loadparams!(albert.al[j][i].pw.din.W, [weights[k]])
+                    elseif occursin("bias", k)
+                        loadparams!(albert.al[j][i].pw.din.b', [weights[k]])
+                    else
+                        # @warn "unknown variable: $k"
+                    end
+                elseif occursin("inner_group_$(i-1)/ffn_1/intermediate/output", k)
+                    if occursin("output/dense/kernel", k)
+                        loadparams!(albert.al[j][i].pw.dout.W, [weights[k]])
+                    elseif occursin("output/dense/bias", k)
+                        loadparams!(albert.al[j][i].pw.dout.b', [weights[k]])
+                    else
+                        # @warn "unknown variable: $k"
+                    end
+                else
+                    # @warn "unknown variable: $k"
+                end
+            end
+
+            layer_weights = filter(name->occursin("group_$(j-1)/LayerNorm", name), albert_weights)
+
+            for t ∈ layer_weights
+                if occursin("group_$(j-1)/inner_group_0/LayerNorm_1", t)
+                    if occursin("LayerNorm_1/gamma", t)
+                        loadparams!(albert.al[j][i].pwn.diag.α', [weights[t]])
+                    elseif occursin("LayerNorm_1/beta", t)
+                        loadparams!(albert.al[j][i].pwn.diag.β', [weights[t]])
+                    end
+                elseif occursin("group_$(j-1)/inner_group_0/LayerNorm", t)
+                    if occursin("LayerNorm/gamma", t)
+                        loadparams!(albert.al[j][i].mhn.diag.α', [weights[t]])
+                    elseif occursin("LayerNorm/beta", t)
+                        loadparams!(albert.al[j][i].mhn.diag.β', [weights[t]])
+                    end
+                end
+            end
+        end
+    end
+    mapping_weight = filter(name->occursin("embedding_hidden_mapping_in", name), vnames)
+
+    for mw ∈ mapping_weight
+        if occursin("embedding_hidden_mapping_in/kernel", mw)
+            loadparams!(albert.linear.W, [weights[mw]])
+        elseif occursin("embedding_hidden_mapping_in/bias", mw)
+            loadparams!(albert.linear.b', [weights[mw]])
+        end
+    end
+    pooler_weights = filter(name->occursin("pooler", name), vnames)
+    masklm_weights = filter(name->occursin("cls/predictions", name), vnames)
+    nextsent_weights = filter(name->occursin("cls/seq_relationship", name), vnames)
+
+    for k ∈ nextsent_weights
+        if occursin("seq_relationship/output_weights", k)
+            loadparams!(nextsentence[1].W', [weights[k]])
+        elseif occursin("seq_relationship/output_bias", k)
+            loadparams!(nextsentence[1].b', [weights[k]])
+        else
+            @warn "unknown variable: $k"
+        end
+    end
+
+    if !isempty(nextsent_weights)
+        classifier[:nextsentence] = nextsentence
+    end
+    for k ∈ pooler_weights
+        if occursin("dense/kernel", k)
+            loadparams!(pooler.W, [weights[k]])
+        elseif occursin("dense/bias", k)
+            loadparams!(pooler.b', [weights[k]])
+        else
+            @warn "unknown variable: $k"
+        end
+    end
+
+    if !isempty(pooler_weights)
+        classifier[:pooler] = pooler
+    end
+
+
+    for k ∈ masklm_weights
+        if occursin("predictions/output_bias", k)
+            loadparams!(masklm.output_bias', [weights[k]])
+        elseif occursin("predictions/transform/dense/kernel", k)
+            loadparams!(masklm.transform[1].W, [weights[k]])
+        elseif occursin("predictions/transform/dense/bias", k)
+            loadparams!(masklm.transform[1].b', [weights[k]])
+        elseif occursin("predictions/transform/LayerNorm/gamma", k)
+            loadparams!(masklm.transform[2].diag.α', [weights[k]])
+        elseif occursin("predictions/transform/LayerNorm/beta", k)
+            loadparams!(masklm.transform[2].diag.β', [weights[k]])
+        else
+            @warn "unknown variable: $k"
+        end
+    end
+
+    if !isempty(masklm_weights)
+        classifier[:masklm] = masklm
+    end
+    embed = CompositeEmbedding(;embedding...) # CompositeEmbedding comes from Transformers.jl
+    cls = _create_classifier(; classifier...) # see `_create_classifier` above
+    TransformerModel(embed, albert, cls) # TransformerModel is the container type from Transformers.jl
+
+end
diff --git a/src/albert/utils.jl b/src/albert/utils.jl
new file mode 100644
index 0000000..6a75716
--- /dev/null
+++ b/src/albert/utils.jl
@@ -0,0 +1,50 @@
+using WordTokenizers
+
+"""
+    preprocess_albert(training_batch::Array{Array{String,1},1}, spm, task=nothing; pad_id::Int=1)
+Preprocess a batch of sentence pairs for fine-tuning: tokenize with the SentencePiece model `spm`, add `[CLS]`/`[SEP]`, pad, and build segment indices and an attention mask.
+
+# Example:
+julia> sentences = [["i love julia language"],["It is as fast as C"]]
+julia> using WordTokenizers # tokenizer
+julia> spm = load(ALBERT_V1)
+WordTokenizers.SentencePieceModel(Dict("▁shots" => (-11.2373,
+7281),"▁ordered" => (-9.84973, 1906),"▁doubtful" => (-12.7799,
+22569),"▁glancing" => (-11.6676, 10426),"▁disrespect" => (-13.13,
+26682),"▁without" => (-8.34227, 367),"▁pol" => (-10.7694, 4828),"chem"
+=> (-12.3713, 17661),"▁1947," => (-11.7544, 11199),"▁kw" => (-10.4402,
+3511)…), 2)
+julia> TextAnalysis.ALBERT.preprocess_albert(sentences, spm)
+((tok = [3; 32; … ; 2; 4], segment = [1; 1; … ; 2; 2]), Float32[1.0 1.0
+… 1.0 1.0])
+
+"""
+function preprocess_albert(training_batch, spm, task=nothing; pad_id::Int=1)
+    ids = []
+    sent = []
+    for i in 1:length(training_batch[1])
+        sent1 = spm(training_batch[1][i])
+        sent2 = spm(training_batch[2][i])
+        comb_sent = makesentence(sent1, sent2)
+        push!(sent, comb_sent)
+        push!(ids, ids_from_tokens(spm, comb_sent))
+    end
+    mask = getmask(convert(Array{Array{String,1},1}, sent))
+    E = Flux.batchseq(ids, 1) # pad shorter sequences with id 1 (<pad>)
+    E = Flux.stack(E, 1)
+    segment = fill!(similar(E), pad_id)
+    for (i, s) ∈ enumerate(sent)
+        j = findfirst(isequal("[SEP]"), s)
+        if j !== nothing
+            @view(segment[j+1:end, i]) .= 2
+        end
+    end
+    data = (tok = E, segment = segment)
+    if task !== nothing
+        labels = get_labels(task)
+        label = Flux.onehotbatch(training_batch[3], labels)
+        return (data, label, mask)
+    end
+    return (data, mask)
+end
+makesentence(s1, s2) = ["[CLS]"; s1; "[SEP]"; s2; "[SEP]"]
diff --git a/test/albert.jl b/test/albert.jl
new file mode 100644
index 0000000..74d184d
--- /dev/null
+++ b/test/albert.jl
@@ -0,0 +1,41 @@
+using BSON
+using WordTokenizers
+
+spm = WordTokenizers.load(ALBERT_V1)
+tok = ids_from_tokens(spm, spm("i love the julia language"))
+segment = [1,1,1,1,1]
+
+@testset "Transformers Model" begin
+    @testset "albert_transformer" begin
+        albert_transformer = TextAnalysis.ALBERT.albert_transformer(100,512,8,512,8,1,1)
+        @test typeof(albert_transformer.linear) == Flux.Dense{typeof(identity),Array{Float32,2},Array{Float32,1}}
+        @test albert_transformer.no_group == 1
+        @test albert_transformer.no_hid == 8
+        @test albert_transformer.no_inner == 1
+        x = randn(Float32, 100, 5, 2)
+        x1 = x[:,:,1]
+        @test size(albert_transformer(x)) == (512, 5, 2)
+        @test size(albert_transformer(x1)) == (512, 5)
+    end
+    @testset "AL Group" begin
+        ALBERT_Layer = TextAnalysis.ALBERT.ALGroup(300, 10, 400, 2, 1)
+        @test length(ALBERT_Layer.ts) == 1 # a single shared layer
+        @test size(ALBERT_Layer.ts[1](randn(Float32,300,5,2))) == (300, 5, 2)
+    end
+end
+@testset "pretraining" begin
+    pretraining_transformer = TextAnalysis.ALBERT.create_albert()
+    input_embedding = pretraining_transformer.embed(tok=tok, segment = segment) # input embeddings
+    @test size(input_embedding) == (128,5)
+    output = pretraining_transformer.transformers(input_embedding) # forward pass
+    @test size(output) == (768,5)
+    @test typeof(pretraining_transformer.classifier.pooler) == Flux.Dense{typeof(tanh),Array{Float32,2},Array{Float32,1}}
+end
+@testset "preprocess" begin
+    sentences = [["i love julia language"],["It is as fast as C"]]
+    preprocessed = TextAnalysis.ALBERT.preprocess_albert(sentences, spm)
+    @test typeof(preprocessed[1].segment) == Array{Int64,2}
+    @test typeof(preprocessed[1].tok) == Array{Int64,2}
+    @test size(preprocessed[1].segment) == (16,1)
+    @test size(preprocessed[2]) == (1, 16, 1) # masks for attention
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 2738bfa..3096aa2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,3 +10,4 @@ include("pos.jl")
 include("sentiment.jl")
 include("averagePerceptronTagger.jl")
 include("ulmfit.jl")
+include("albert.jl")
\ No newline at end of file
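
Taken together, the pieces this patch adds can be exercised end to end roughly as below. This is a minimal sketch, not part of the patch: it assumes the patch is applied and the converted `.bson` checkpoints are available through DataDeps, and it follows the docstrings and tests above for argument forms and shapes; the final `[CLS]`-pooling line is illustrative rather than an API defined by the patch.

```julia
using Flux
using WordTokenizers
using TextAnalysis.ALBERT

# tokenizer and pretrained weights (downloaded on first use via DataDeps)
spm = WordTokenizers.load(ALBERT_V1)
transformer = TextAnalysis.ALBERT.from_pretrained("albert_base_v1")

# a toy batch of sentence pairs, preprocessed into ids, segment indices and an attention mask
sentences = [["i love julia language"], ["It is as fast as C"]]
data, mask = TextAnalysis.ALBERT.preprocess_albert(sentences, spm)

# embeddings -> ALBERT encoder (mask has shape (1, seq_len, batch))
emb = transformer.embed(tok = data.tok, segment = data.segment)
hidden = transformer.transformers(emb, mask)

# pooled representation at the [CLS] position, as consumed by the pretraining heads
pooled = transformer.classifier.pooler(hidden[:, 1, :])
```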