Text generation recipe #258

Open · wants to merge 53 commits into base: master
Changes from all commits (53 commits)
8194f3f
Integrate ULMFiT (initial)
Chandu-4444 Jul 19, 2022
7662230
Add Paragraph to train_classifier
Chandu-4444 Jul 19, 2022
1418e50
Add batchseq to pad batch (naive version)
Chandu-4444 Jul 21, 2022
b682105
Remove Project.toml changes.
Chandu-4444 Jul 21, 2022
c1064bc
Add vocab_size to TextClassificationTask
Chandu-4444 Jul 24, 2022
345ced1
Add `vocab_size` to encodings
Chandu-4444 Jul 25, 2022
46b6826
Test `batches` integration with model.
Chandu-4444 Jul 25, 2022
388e8ac
Update load_batchseq function.
Chandu-4444 Jul 25, 2022
8bfe705
Clean up useless code from TextModels.jl.
Chandu-4444 Jul 29, 2022
2482aaa
Update FastText/src/models/pretrain_lm.jl
Chandu-4444 Aug 2, 2022
8bc930d
Update FastText/src/models/dataloader.jl
Chandu-4444 Aug 2, 2022
3882b1d
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 2, 2022
3057989
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 2, 2022
307fde1
Add `reset!` for AWD_LSTM.
Chandu-4444 Aug 2, 2022
075a21e
Add `textlearner`.
Chandu-4444 Aug 8, 2022
f16ec2c
Complete text classification pipeline.
Chandu-4444 Aug 8, 2022
3469630
Update `LanguageModel` to use `Flux.reset!`.
Chandu-4444 Aug 18, 2022
974a622
Include models.jl file.
Chandu-4444 Aug 23, 2022
8e9f7aa
Start text generation recipe for `imdb`
Chandu-4444 Aug 23, 2022
0d44bbd
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
4ad9a12
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
080a018
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
922334d
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
ce34be1
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
6cea902
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
5159fb8
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
c6b69f7
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
164fe21
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 24, 2022
1ce6df2
Add suggestions and improvements from the call.
Chandu-4444 Aug 26, 2022
712d9c8
Use previous `VarDrop` code for use in Colab.
Chandu-4444 Aug 28, 2022
6d6504b
Use NNlib for scalar indexing
Chandu-4444 Aug 29, 2022
5af6edb
Updates to Project.toml
Chandu-4444 Aug 31, 2022
6636393
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Aug 31, 2022
6cc9053
Update code to solve `getfield non-differentiable` error.
Chandu-4444 Sep 1, 2022
d0ee3a4
Add `TextGeneration` task
Chandu-4444 Sep 1, 2022
659e6d1
Modify type params for `LanguageModel` and
Chandu-4444 Sep 1, 2022
0a151b2
Update FastText/src/models/train_text_classifier.jl
Chandu-4444 Sep 1, 2022
12bc9ab
Update FastText/src/models/train_text_classifier.jl
Chandu-4444 Sep 1, 2022
35c345f
Update dtypes to avoid CuArray errors.
Chandu-4444 Sep 6, 2022
cceee46
Add callable TextClassifier
Chandu-4444 Sep 6, 2022
f7d51f6
Update FastText/src/models/custom_layers.jl
Chandu-4444 Sep 8, 2022
90c9a79
Update FastText/src/models/custom_layers.jl
Chandu-4444 Sep 8, 2022
7e7de6d
Update `Flux.reset!()`
Chandu-4444 Sep 12, 2022
aae4442
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Sep 13, 2022
c1418b3
Update a few Flux.dropout functions.
Chandu-4444 Sep 13, 2022
2c04d19
Update code to avoid non-differentiable error
Chandu-4444 Sep 13, 2022
5b96c78
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Sep 16, 2022
9c60de6
Add batch generation for generation task.
Chandu-4444 Sep 19, 2022
3903bb2
Push to test on colab
Chandu-4444 Sep 21, 2022
d4aa13c
Add blockmodel for LanguageModel
Chandu-4444 Sep 21, 2022
fb69dc5
Fix `TextClassificationTask`
Chandu-4444 Sep 21, 2022
416a800
Replace `map` with `mapobs`
Chandu-4444 Sep 24, 2022
232f3bf
Update `onehot` encode for NumberVector
Chandu-4444 Sep 26, 2022
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
test/Manifest.toml
docs/Manifest.toml
development/**
**/.vscode/**
Manifest.toml
**/*.jld2
*.so
13 changes: 12 additions & 1 deletion FastText/Project.toml
@@ -4,16 +4,27 @@ authors = ["Lorenz Ohly", "FluxML Community"]
version = "0.1.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
FastAI = "5d0beca9-ade8-49ae-ad0b-a3cf890e669f"
FastVision = "7bf02486-ff4c-4e73-b158-40c00866b54f"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
FluxTraining = "7bf95e4d-ca32-48da-9824-f0dc5310474f"
IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
InlineTest = "bd334432-b1e7-49c7-a2dc-dd9149e4ebd6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
FastAI = "0.5"
InlineTest = "0.2"
MLUtils = "0.2"
julia = "1"
julia = "1"
45 changes: 33 additions & 12 deletions FastText/src/FastText.jl
@@ -2,17 +2,17 @@ module FastText

using FastAI
using FastAI:
Datasets,
# blocks
Block, WrapperBlock, AbstractBlock, OneHotTensor, OneHotTensorMulti, Label,
LabelMulti, wrapped, Continuous, getencodings, getblocks, encodetarget,
encodeinput,
# encodings
Encoding, StatefulEncoding, OneHot,
# visualization
ShowText,
# other
Context, Training, Validation
Datasets,
# blocks
Block, WrapperBlock, AbstractBlock, OneHotTensor, OneHotTensorMulti, Label,
LabelMulti, wrapped, Continuous, getencodings, getblocks, encodetarget,
encodeinput,
# encodings
Encoding, StatefulEncoding, OneHot,
# visualization
ShowText,
# other
Context, Training, Validation

using FastAI.Datasets

@@ -33,14 +33,34 @@ using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers

# dependencies
using Flux
using NNlib
using DataDeps
using BSON
using TextAnalysis
using MLUtils
using Zygote


include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")
include("encodings/textpreprocessing.jl")


include("models/pretrain_lm.jl")
include("models/custom_layers.jl")
include("models/utils.jl")
include("models/train_text_classifier.jl")
include("models/dataloader.jl")
include("models/datadeps.jl")
include("textlearner.jl")
include("models.jl")

const _tasks = Dict{String,Any}()
include("tasks/classification.jl")
include("tasks/generation.jl")

const DEFAULT_SANITIZERS = [
replace_all_caps,
@@ -54,6 +74,7 @@ const DEFAULT_SANITIZERS = [
const DEFAULT_TOKENIZERS = [tokenize]

function __init__()
FastText.ulmfit_datadep_register()
FastAI.Registries.registerrecipes(@__MODULE__, RECIPES)
foreach(values(_tasks)) do t
if !haskey(FastAI.learningtasks(), t.id)
@@ -62,6 +83,6 @@ function __init__()
end
end

export Paragraph, TextClassificationSingle, Sanitize, Tokenize
export Paragraph, TextClassificationSingle, LanguageModel, TextGeneration

end
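A quick way to exercise the new `__init__` wiring (reviewer sketch, not part of the diff; the task id shown is hypothetical):

# Reviewer sketch: after loading the package, the recipes passed to
# registerrecipes and the tasks collected in `_tasks` should be visible in
# FastAI's registry. The `haskey` call mirrors the check inside __init__ above;
# "textclassification/imdb" is a made-up id, for illustration only.
using FastAI, FastText

tasks = FastAI.learningtasks()
haskey(tasks, "textclassification/imdb")   # hypothetical id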
5 changes: 4 additions & 1 deletion FastText/src/blocks/text.jl
@@ -32,6 +32,9 @@ struct Tokens <: Block end

FastAI.checkblock(::Tokens, ::Vector{String}) = true

struct NumberVector <: Block end
struct NumberVector <: Block
classes::AbstractVector{Int64}
end

FastAI.checkblock(::NumberVector, ::Vector{Int64}) = true
setup(::Type{NumberVector}, data::OrderedDict{String,Int64}) = Label(unique(eachobs(data)))
48 changes: 36 additions & 12 deletions FastText/src/encodings/textpreprocessing.jl
@@ -9,7 +9,7 @@ Encodes

"""
struct Sanitize <: Encoding
tfms
tfms::Any
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)
@@ -25,7 +25,7 @@ function encode(p::Sanitize, context, block::Paragraph, obs)
end

struct Tokenize <: Encoding
tfms
tfms::Any
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)
@@ -41,45 +41,69 @@ function encode(p::Tokenize, context, block::Paragraph, obs)
obs
end

function computevocabulary(data)
lookup_table = Dict{String, Int}()

# Building a vocabulary
function computevocabulary(data; vocab_size = 40000)
lookup_table = Dict{String,Int}()

enc1 = Sanitize()
sanitized_Data = map(i -> encode(enc1, Training(), Paragraph(), getobs(data, i)[1]), 1:numobs(data))

enc2 = Tokenize()
tokenized_data = map(i -> encode(enc2, Training(), Paragraph(), getobs(sanitized_Data, i)), 1:numobs(data))

vocab = []
for sample in tokenized_data
for token in sample
lookup_table[token] = get(lookup_table, token, 0) + 1
end
end
return OrderedDict(lookup_table)

ordered_dict = sort(OrderedDict(lookup_table), byvalue = true)

for (k, v) in ordered_dict
if length(ordered_dict) > vocab_size
delete!(ordered_dict, k)
else
break
end
end

ordered_dict = sort(OrderedDict(ordered_dict), byvalue = true, rev = true)
counter = 3

for (k, v) in ordered_dict
ordered_dict[k] = counter
counter = counter + 1
end

ordered_dict["<unk>"] = 1
ordered_dict["<pad>"] = 2


return sort(ordered_dict, byvalue = true)

end

struct EmbedVocabulary <: Encoding
vocab
vocab::Any
end

function EmbedVocabulary(; vocab)
return EmbedVocabulary(vocab)
end

function setup(::Type{EmbedVocabulary}, data)
vocab = computevocabulary(data)
function setup(::Type{EmbedVocabulary}, data; vocab_size = 40000)
vocab = computevocabulary(data, vocab_size = vocab_size)
return EmbedVocabulary(vocab = vocab)
end

function encodedblock(p::EmbedVocabulary, block::Tokens)
return NumberVector()
return NumberVector(p.vocab.vals)
end

function encode(p::EmbedVocabulary, context, block::Tokens, obs)
vocabulary = p.vocab

return [vocabulary[token] for token in obs]
return [token in vocabulary.keys ? vocabulary[token] : vocabulary["<unk>"] for token in obs]
end


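Taken together, the new `computevocabulary` keeps the most frequent tokens up to `vocab_size`, reserves index 1 for `"<unk>"` and 2 for `"<pad>"`, and `encode` falls back to `"<unk>"` for out-of-vocabulary tokens. A minimal end-to-end sketch (toy data; assumes observations are `(text, label)` tuples, since `computevocabulary` reads `getobs(data, i)[1]`):

# Reviewer sketch with toy data, not from the PR:
data  = [("the movie was great", "pos"), ("the plot was thin", "neg")]
vocab = computevocabulary(data; vocab_size = 40000)    # OrderedDict token => index
emb   = EmbedVocabulary(vocab = vocab)
ids   = encode(emb, Training(), Tokens(), ["the", "movie", "xyzzy"])
ids[end] == vocab["<unk>"]                             # unseen token maps to 1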
25 changes: 25 additions & 0 deletions FastText/src/models.jl
@@ -0,0 +1,25 @@
function blockmodel(inblock::NumberVector, outblock::OneHotTensor, backbone; k = 10)

classifier = TextClassifier(backbone)
return classifier
end

function blockmodel(inblock::NumberVector, outblock::NumberVector, backbone; k = 10)
return backbone
end

function (b::TextClassifier)(input)
k = 10
Zygote.ignore() do
Flux.reset!(b.rnn_layers)
[b.rnn_layers(x) for x in input[1:(end - k)]]
end

# bptt
model = b.linear_layers([b.rnn_layers(x) for x in input[(end - k + 1):end]])
end

function (b::LanguageModel)(input)
# bptt
model = [b.layers(x) for x in input[1:end]]
end
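The callable `TextClassifier` above implements truncated backpropagation: the first `length(input) - k` time steps only advance the recurrent state inside `Zygote.ignore`, and gradients flow through the last `k` steps and the linear head. A hedged usage sketch (made-up shapes; `TextClassifier`, `LanguageModel`, `rnn_layers`, and `linear_layers` come from the ULMFiT code vendored earlier in this PR, so the constructors used here are assumptions):

# Reviewer sketch (hypothetical batch): `input` is a sequence of per-time-step
# batches from the dataloader; only the last k = 10 steps carry gradients,
# earlier steps just warm up the hidden state under Zygote.ignore.
clf = TextClassifier(LanguageModel())     # assumed default constructors
seq = [rand(1:40000, 32) for _ in 1:70]   # 70 time steps, batch of 32 (toy)
ŷ   = clf(seq)                            # forward pass, BPTT over last 10 steps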