Text generation recipe #258

Open · wants to merge 53 commits into base: master
Changes from all commits (53 commits)
8194f3f
Integrate ULMFiT (initial)
Chandu-4444 Jul 19, 2022
7662230
Add Paragraph to train_classifier
Chandu-4444 Jul 19, 2022
1418e50
Add batchseq to pad batch (naive version)
Chandu-4444 Jul 21, 2022
b682105
Remove Project.toml changes.
Chandu-4444 Jul 21, 2022
c1064bc
Add vocab_size to TextClassificationTask
Chandu-4444 Jul 24, 2022
345ced1
Add `vocab_size` to encodings
Chandu-4444 Jul 25, 2022
46b6826
Test `batches` integration with model.
Chandu-4444 Jul 25, 2022
388e8ac
Update load_batchseq function.
Chandu-4444 Jul 25, 2022
8bfe705
Clean up useless code from TextModels.jl.
Chandu-4444 Jul 29, 2022
2482aaa
Update FastText/src/models/pretrain_lm.jl
Chandu-4444 Aug 2, 2022
8bc930d
Update FastText/src/models/dataloader.jl
Chandu-4444 Aug 2, 2022
3882b1d
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 2, 2022
3057989
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 2, 2022
307fde1
Add `reset!` for AWD_LSTM.
Chandu-4444 Aug 2, 2022
075a21e
Add `textlearner`.
Chandu-4444 Aug 8, 2022
f16ec2c
Complete text classification pipeline.
Chandu-4444 Aug 8, 2022
3469630
Update `LanguageModel` to use `Flux.reset!`.
Chandu-4444 Aug 18, 2022
974a622
Include models.jl file.
Chandu-4444 Aug 23, 2022
8e9f7aa
Start text generation recipe for `imdb`
Chandu-4444 Aug 23, 2022
0d44bbd
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
4ad9a12
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
080a018
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
922334d
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
ce34be1
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
6cea902
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
5159fb8
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
c6b69f7
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 23, 2022
164fe21
Update FastText/src/models/custom_layers.jl
Chandu-4444 Aug 24, 2022
1ce6df2
Add suggestions and improvements from the call.
Chandu-4444 Aug 26, 2022
712d9c8
Use previous `VarDrop` code for use in Colab.
Chandu-4444 Aug 28, 2022
6d6504b
Use NNlib for scalar indexing
Chandu-4444 Aug 29, 2022
5af6edb
Updates to Project.toml
Chandu-4444 Aug 31, 2022
6636393
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Aug 31, 2022
6cc9053
Update code to solve `getfield non-differentiable` error.
Chandu-4444 Sep 1, 2022
d0ee3a4
Add `TextGeneration` task
Chandu-4444 Sep 1, 2022
659e6d1
Modify type params for `LanguageModel` and
Chandu-4444 Sep 1, 2022
0a151b2
Update FastText/src/models/train_text_classifier.jl
Chandu-4444 Sep 1, 2022
12bc9ab
Update FastText/src/models/train_text_classifier.jl
Chandu-4444 Sep 1, 2022
35c345f
Update dtypes to avoid CuArray errors.
Chandu-4444 Sep 6, 2022
cceee46
Add callable TextClassifier
Chandu-4444 Sep 6, 2022
f7d51f6
Update FastText/src/models/custom_layers.jl
Chandu-4444 Sep 8, 2022
90c9a79
Update FastText/src/models/custom_layers.jl
Chandu-4444 Sep 8, 2022
7e7de6d
Update `Flux.reset!()`
Chandu-4444 Sep 12, 2022
aae4442
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Sep 13, 2022
c1418b3
Update a few Flux.dropout functions.
Chandu-4444 Sep 13, 2022
2c04d19
Update code to avoid non-differentiable error
Chandu-4444 Sep 13, 2022
5b96c78
Merge branch 'textmodel-integration' into text-generation-recipe
Chandu-4444 Sep 16, 2022
9c60de6
Add batch generation for generation task.
Chandu-4444 Sep 19, 2022
3903bb2
Push to test on colab
Chandu-4444 Sep 21, 2022
d4aa13c
Add blockmodel for LanguageModel
Chandu-4444 Sep 21, 2022
fb69dc5
Fix `TextClassificationTask`
Chandu-4444 Sep 21, 2022
416a800
Replace `map` with `mapobs`
Chandu-4444 Sep 24, 2022
232f3bf
Update `onehot` encode for NumberVector
Chandu-4444 Sep 26, 2022
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
test/Manifest.toml
docs/Manifest.toml
development/**
**/.vscode/**
Manifest.toml
**/*.jld2
*.so
13 changes: 12 additions & 1 deletion FastText/Project.toml
@@ -4,16 +4,27 @@ authors = ["Lorenz Ohly", "FluxML Community"]
version = "0.1.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
FastAI = "5d0beca9-ade8-49ae-ad0b-a3cf890e669f"
FastVision = "7bf02486-ff4c-4e73-b158-40c00866b54f"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
FluxTraining = "7bf95e4d-ca32-48da-9824-f0dc5310474f"
IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
InlineTest = "bd334432-b1e7-49c7-a2dc-dd9149e4ebd6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
FastAI = "0.5"
InlineTest = "0.2"
MLUtils = "0.2"
julia = "1"
julia = "1"
45 changes: 33 additions & 12 deletions FastText/src/FastText.jl
@@ -2,17 +2,17 @@ module FastText

using FastAI
using FastAI:
Datasets,
# blocks
Block, WrapperBlock, AbstractBlock, OneHotTensor, OneHotTensorMulti, Label,
LabelMulti, wrapped, Continuous, getencodings, getblocks, encodetarget,
encodeinput,
# encodings
Encoding, StatefulEncoding, OneHot,
# visualization
ShowText,
# other
Context, Training, Validation
Datasets,
# blocks
Block, WrapperBlock, AbstractBlock, OneHotTensor, OneHotTensorMulti, Label,
LabelMulti, wrapped, Continuous, getencodings, getblocks, encodetarget,
encodeinput,
# encodings
Encoding, StatefulEncoding, OneHot,
# visualization
ShowText,
# other
Context, Training, Validation

using FastAI.Datasets

@@ -33,14 +33,34 @@ using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers

# dependencies
using Flux
using NNlib
using DataDeps
using BSON
using TextAnalysis
using MLUtils
using Zygote


include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")
include("encodings/textpreprocessing.jl")


include("models/pretrain_lm.jl")
include("models/custom_layers.jl")
include("models/utils.jl")
include("models/train_text_classifier.jl")
include("models/dataloader.jl")
include("models/datadeps.jl")
include("textlearner.jl")
include("models.jl")

const _tasks = Dict{String,Any}()
include("tasks/classification.jl")
include("tasks/generation.jl")

const DEFAULT_SANITIZERS = [
replace_all_caps,
@@ -54,6 +74,7 @@ const DEFAULT_SANITIZERS = [
const DEFAULT_TOKENIZERS = [tokenize]

function __init__()
FastText.ulmfit_datadep_register()
FastAI.Registries.registerrecipes(@__MODULE__, RECIPES)
foreach(values(_tasks)) do t
if !haskey(FastAI.learningtasks(), t.id)
@@ -62,6 +83,6 @@ function __init__()
end
end

export Paragraph, TextClassificationSingle, Sanitize, Tokenize
export Paragraph, TextClassificationSingle, LanguageModel, TextGeneration

end
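A quick way to exercise the new `__init__` wiring (reviewer sketch, not part of the diff; the task id shown is hypothetical):

# Reviewer sketch: after loading the package, the recipes passed to
# registerrecipes and the tasks collected in `_tasks` should be visible in
# FastAI's registry. The `haskey` call mirrors the check inside __init__ above;
# "textclassification/imdb" is a made-up id, for illustration only.
using FastAI, FastText

tasks = FastAI.learningtasks()
haskey(tasks, "textclassification/imdb")   # hypothetical id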
5 changes: 4 additions & 1 deletion FastText/src/blocks/text.jl
@@ -32,6 +32,9 @@ struct Tokens <: Block end

FastAI.checkblock(::Tokens, ::Vector{String}) = true

struct NumberVector <: Block end
struct NumberVector <: Block
classes::AbstractVector{Int64}
end

FastAI.checkblock(::NumberVector, ::Vector{Int64}) = true
setup(::Type{NumberVector}, data::OrderedDict{String,Int64}) = Label(unique(eachobs(data)))
48 changes: 36 additions & 12 deletions FastText/src/encodings/textpreprocessing.jl
@@ -9,7 +9,7 @@ Encodes

"""
struct Sanitize <: Encoding
tfms
tfms::Any
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)
@@ -25,7 +25,7 @@ function encode(p::Sanitize, context, block::Paragraph, obs)
end

struct Tokenize <: Encoding
tfms
tfms::Any
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)
@@ -41,45 +41,69 @@ function encode(p::Tokenize, context, block::Paragraph, obs)
obs
end

function computevocabulary(data)
lookup_table = Dict{String, Int}()

# Building a vocabulary
function computevocabulary(data; vocab_size = 40000)
lookup_table = Dict{String,Int}()

enc1 = Sanitize()
sanitized_Data = map(i -> encode(enc1, Training(), Paragraph(), getobs(data, i)[1]), 1:numobs(data))

enc2 = Tokenize()
tokenized_data = map(i -> encode(enc2, Training(), Paragraph(), getobs(sanitized_Data, i)), 1:numobs(data))

vocab = []
for sample in tokenized_data
for token in sample
lookup_table[token] = get(lookup_table, token, 0) + 1
end
end
return OrderedDict(lookup_table)

ordered_dict = sort(OrderedDict(lookup_table), byvalue = true)

for (k, v) in ordered_dict
if length(ordered_dict) > vocab_size
delete!(ordered_dict, k)
else
break
end
end

ordered_dict = sort(OrderedDict(ordered_dict), byvalue = true, rev = true)
counter = 3

for (k, v) in ordered_dict
ordered_dict[k] = counter
counter = counter + 1
end

ordered_dict["<unk>"] = 1
ordered_dict["<pad>"] = 2


return sort(ordered_dict, byvalue = true)

end

struct EmbedVocabulary <: Encoding
vocab
vocab::Any
end

function EmbedVocabulary(; vocab)
return EmbedVocabulary(vocab)
end

function setup(::Type{EmbedVocabulary}, data)
vocab = computevocabulary(data)
function setup(::Type{EmbedVocabulary}, data; vocab_size = 40000)
vocab = computevocabulary(data, vocab_size = vocab_size)
return EmbedVocabulary(vocab = vocab)
end

function encodedblock(p::EmbedVocabulary, block::Tokens)
return NumberVector()
return NumberVector(p.vocab.vals)
end

function encode(p::EmbedVocabulary, context, block::Tokens, obs)
vocabulary = p.vocab

return [vocabulary[token] for token in obs]
return [token in vocabulary.keys ? vocabulary[token] : vocabulary["<unk>"] for token in obs]
end


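Taken together, the new `computevocabulary` keeps the most frequent tokens up to `vocab_size`, reserves index 1 for `"<unk>"` and 2 for `"<pad>"`, and `encode` falls back to `"<unk>"` for out-of-vocabulary tokens. A minimal end-to-end sketch (toy data; assumes observations are `(text, label)` tuples, since `computevocabulary` reads `getobs(data, i)[1]`):

# Reviewer sketch with toy data, not from the PR:
data  = [("the movie was great", "pos"), ("the plot was thin", "neg")]
vocab = computevocabulary(data; vocab_size = 40000)    # OrderedDict token => index
emb   = EmbedVocabulary(vocab = vocab)
ids   = encode(emb, Training(), Tokens(), ["the", "movie", "xyzzy"])
ids[end] == vocab["<unk>"]                             # unseen token maps to 1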
25 changes: 25 additions & 0 deletions FastText/src/models.jl
@@ -0,0 +1,25 @@
function blockmodel(inblock::NumberVector, outblock::OneHotTensor, backbone; k = 10)

classifier = TextClassifier(backbone)
return classifier
end

function blockmodel(inblock::NumberVector, outblock::NumberVector, backbone; k = 10)
return backbone
end

function (b::TextClassifier)(input)
k = 10
Zygote.ignore() do
Flux.reset!(b.rnn_layers)
[b.rnn_layers(x) for x in input[1:(end - k)]]
end

# bptt
model = b.linear_layers([b.rnn_layers(x) for x in input[(end - k + 1):end]])
end

function (b::LanguageModel)(input)
# bptt
model = [b.layers(x) for x in input[1:end]]
end
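The callable `TextClassifier` above implements truncated backpropagation: the first `length(input) - k` time steps only advance the recurrent state inside `Zygote.ignore`, and gradients flow through the last `k` steps and the linear head. A hedged usage sketch (made-up shapes; `TextClassifier`, `LanguageModel`, `rnn_layers`, and `linear_layers` come from the ULMFiT code vendored earlier in this PR, so the constructors used here are assumptions):

# Reviewer sketch (hypothetical batch): `input` is a sequence of per-time-step
# batches from the dataloader; only the last k = 10 steps carry gradients,
# earlier steps just warm up the hidden state under Zygote.ignore.
clf = TextClassifier(LanguageModel())     # assumed default constructors
seq = [rand(1:40000, 32) for _ in 1:70]   # 70 time steps, batch of 32 (toy)
ŷ   = clf(seq)                            # forward pass, BPTT over last 10 steps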