diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml index 6a9f4b777e..8f62bdb58a 100644 --- a/.github/workflows/Downstream.yml +++ b/.github/workflows/Downstream.yml @@ -26,7 +26,7 @@ jobs: - {user: Chemellia, repo: AtomicGraphNets.jl, group: All} - {user: SciML, repo: DiffEqFlux.jl, group: Layers} - {user: SciML, repo: NeuralPDE.jl, group: NNPDE} - + - {user: SciML, repo: OperatorLearning.jl, group: All} if: contains(github.event.pull_request.labels.*.name, 'run downstream test') steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8cc6bcf4a0..423682e0bf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,10 +22,15 @@ jobs: - 'nightly' os: - ubuntu-latest - - macOS-latest - - windows-latest arch: - x64 + include: + - os: windows-latest + version: '1' + arch: x64 + - os: macOS-latest + version: '1' + arch: x64 steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 diff --git a/NEWS.md b/NEWS.md index 07852c2dde..a9db7cfa58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ # Flux Release Notes +## v0.13 +* After a deprecation cycle, the datasets in `Flux.Data` have been removed in favour of MLDatasets.jl. +* `params` is no longer exported, since it is a common name that is also exported by Distributions.jl; call it as `Flux.params` instead. +* `flatten` is no longer exported, due to a clash with `Iterators.flatten`. +* Juno.jl progress bar support has been removed, as it is now obsolete. +* `Dropout` now works with `Int` and `Complex` arrays and is twice-differentiable. + ## v0.12.10 * `Dropout`/`AlphaDropout` now supports [user-specified RNGs](https://github.com/FluxML/Flux.jl/pull/1838) diff --git a/Project.toml b/Project.toml index fdbbc21b7c..4165efce8e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,50 +1,38 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.12.9" +version = "0.13.0-DEV" [deps] -AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" -Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" -DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -AbstractTrees = "0.3" Adapt = "3.0" ArrayInterface = "3.1, 4" CUDA = "3" ChainRulesCore = "1.12" -CodecZlib = "0.7" -Colors = "0.12" Functors = "0.2.1" MacroTools = "0.5" -NNlib = "0.8" +NNlib = "0.8.2" NNlibCUDA = "0.2" ProgressLogging = "0.1" Reexport = "0.2, 1.0" StatsBase = "0.33" -ZipFile = "0.9" Zygote = 
"0.6.34" julia = "1.6" diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md index 6769706b82..d2e738362c 100644 --- a/docs/src/models/advanced.md +++ b/docs/src/models/advanced.md @@ -97,8 +97,8 @@ We can freeze a specific parameter of a specific layer which already entered a ` by simply deleting it from `ps`: ```julia -ps = params(m) -delete!(ps, m[2].bias) +ps = Flux.params(m) +delete!(ps, m[2].bias) ``` ## Custom multiple input or output layer diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index c62d7d004e..3f8e57b166 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -39,7 +39,7 @@ julia> x = [2, 1]; julia> y = [2, 0]; -julia> gs = gradient(params(x, y)) do +julia> gs = gradient(Flux.params(x, y)) do f(x, y) end Grads(...) @@ -83,7 +83,7 @@ To improve the prediction we can take the gradients of the loss with respect to ```julia using Flux -gs = gradient(() -> loss(x, y), params(W, b)) +gs = gradient(() -> loss(x, y), Flux.params(W, b)) ``` Now that we have gradients, we can pull them out and update `W` to train the model. diff --git a/docs/src/models/recurrence.md b/docs/src/models/recurrence.md index 65a3cc7430..ba5f5ade0a 100644 --- a/docs/src/models/recurrence.md +++ b/docs/src/models/recurrence.md @@ -160,7 +160,7 @@ data = zip(X,Y) Flux.reset!(m) [m(x) for x in seq_init] -ps = params(m) +ps = Flux.params(m) opt= ADAM(1e-3) Flux.train!(loss, ps, data, opt) ``` diff --git a/docs/src/saving.md b/docs/src/saving.md index b1771cd5a0..9b1db909ce 100644 --- a/docs/src/saving.md +++ b/docs/src/saving.md @@ -62,7 +62,7 @@ julia> using Flux julia> model = Chain(Dense(10,5,relu),Dense(5,2),softmax) Chain(Dense(10, 5, NNlib.relu), Dense(5, 2), NNlib.softmax) -julia> weights = params(model); +julia> weights = Flux.params(model); julia> using BSON: @save diff --git a/docs/src/training/optimisers.md b/docs/src/training/optimisers.md index 7f3ad6bf37..948c7a52ba 100644 --- a/docs/src/training/optimisers.md +++ b/docs/src/training/optimisers.md @@ -14,7 +14,7 @@ loss(x, y) = sum((predict(x) .- y).^2) x, y = rand(5), rand(2) # Dummy data l = loss(x, y) # ~ 3 -θ = params(W, b) +θ = Flux.params(W, b) grads = gradient(() -> loss(x, y), θ) ``` diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 845a22d8a6..9db2330b65 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -64,7 +64,7 @@ At first glance it may seem strange that the model that we want to train is not ## Model parameters -The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `params(m)`. +The model to be trained must have a set of tracked parameters that are used to calculate the gradients of the objective function. In the [basics](../models/basics.md) section it is explained how to create models with such parameters. The second argument of the function `Flux.train!` must be an object containing those parameters, which can be obtained from a model `m` as `Flux.params(m)`. Such an object contains a reference to the model's parameters, not a copy, such that after their training, the model behaves according to their updated values. 
diff --git a/src/Flux.jl b/src/Flux.jl index 4909969cd1..3fd6fef5d0 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,13 +11,13 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient using ChainRulesCore -export Chain, Dense, Maxout, SkipConnection, Parallel, flatten, +export Chain, Dense, Maxout, SkipConnection, Parallel, RNN, LSTM, GRU, GRUv3, SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv, AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, Upsample, PixelShuffle, - params, fmap, cpu, gpu, f32, f64, + fmap, cpu, gpu, f32, f64, testmode!, trainmode! include("optimise/Optimise.jl") diff --git a/src/data/Data.jl b/src/data/Data.jl index d00aeb709c..cb3a073969 100644 --- a/src/data/Data.jl +++ b/src/data/Data.jl @@ -6,62 +6,4 @@ using Base: @propagate_inbounds include("dataloader.jl") export DataLoader -## TODO for v0.13: remove everything below ############## -## Also remove the following deps: -## AbstractTrees, ZipFiles, CodecZLib - -import ..Flux -import SHA - -deprecation_message() = @warn("Flux's datasets are deprecated, please use the package MLDatasets.jl") - -function deps(path...) - if isnothing(@__DIR__) # sysimages - joinpath("deps", path...) - else - joinpath(@__DIR__, "..", "..", "deps", path...) - end -end - -function download_and_verify(url, path, hash) - tmppath = tempname() - download(url, tmppath) - hash_download = open(tmppath) do f - bytes2hex(SHA.sha256(f)) - end - if hash_download !== hash - msg = "Hash Mismatch!\n" - msg *= " Expected sha256: $hash\n" - msg *= " Calculated sha256: $hash_download" - error(msg) - end - mv(tmppath, path; force=true) -end - -function __init__() - mkpath(deps()) -end - -include("mnist.jl") -export MNIST - -include("fashion-mnist.jl") -export FashionMNIST - -include("cmudict.jl") -export CMUDict -using .CMUDict; export cmudict - -include("tree.jl") -include("sentiment.jl") -export Sentiment - -include("iris.jl") -export Iris - -include("housing.jl") -export Housing - -######################################### - end#module diff --git a/src/data/cmudict.jl b/src/data/cmudict.jl deleted file mode 100644 index d096727c2a..0000000000 --- a/src/data/cmudict.jl +++ /dev/null @@ -1,77 +0,0 @@ -module CMUDict - -export cmudict - -using ..Data: deps, download_and_verify, deprecation_message - -const version = "0.7b" -const cache_prefix = "https://cache.julialang.org" - -function load() - suffixes_and_hashes = [("" , "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4"), - (".phones" , "ffb588a5e55684723582c7256e1d2f9fadb130011392d9e59237c76e34c2cfd6"), - (".symbols", "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027")] - if isdir(deps("cmudict")) - if all(isfile(deps("cmudict", "cmudict$x")) for (x, _) in suffixes_and_hashes) - return - end - end - @info "Downloading CMUDict dataset" - mkpath(deps("cmudict")) - for (x, hash) in suffixes_and_hashes - download_and_verify("$cache_prefix/https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x", - deps("cmudict", "cmudict$x"), hash) - end -end - -""" - phones() -Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary. -""" -function phones() - deprecation_message() - load() - Symbol.(first.(split.(split(read(deps("cmudict", "cmudict.phones"),String), - "\n", keepempty = false), "\t"))) -end - -""" - symbols() -Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary. 
-A symbol is a phone with optional auxiliary symbols, indicating for example the -amount of stress on the phone. -""" -function symbols() - deprecation_message() - load() - Symbol.(split(read(deps("cmudict", "cmudict.symbols"),String), - "\n", keepempty = false)) -end - -""" - rawdict() -Return the unfiltered CMU Pronouncing Dictionary. -""" -function rawdict() - deprecation_message() - load() - Dict(String(xs[1]) => Symbol.(xs[2:end]) for xs in - filter(!isempty, split.(split(read(deps("cmudict", "cmudict"),String), "\n")))) -end - -validword(s) = isascii(s) && occursin(r"^[\w\-\.]+$", s) - -""" - cmudict() -Return a filtered CMU Pronouncing Dictionary. -It is filtered so each word contains only ASCII characters and a combination of -word characters (as determined by the regex engine using `\\w`), '-' and '.'. -""" -function cmudict() - deprecation_message() - filter(p -> validword(p.first), rawdict()) -end - -alphabet() = ['A':'Z'..., '0':'9'..., '_', '-', '.'] - -end \ No newline at end of file diff --git a/src/data/fashion-mnist.jl b/src/data/fashion-mnist.jl deleted file mode 100644 index 18999e9d5e..0000000000 --- a/src/data/fashion-mnist.jl +++ /dev/null @@ -1,67 +0,0 @@ -module FashionMNIST - -using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel -using ..Data: download_and_verify, deprecation_message - -const dir = if isnothing(@__DIR__) - joinpath("deps", "fashion-mnist") - else - joinpath(@__DIR__, "../../deps/fashion-mnist") -end - -function load() - mkpath(dir) - cd(dir) do - for (file, hash) in [("train-images-idx3-ubyte", "3aede38d61863908ad78613f6a32ed271626dd12800ba2636569512369268a84"), - ("train-labels-idx1-ubyte", "a04f17134ac03560a47e3764e11b92fc97de4d1bfaf8ba1a3aa29af54cc90845"), - ("t10k-images-idx3-ubyte" , "346e55b948d973a97e58d2351dde16a484bd415d4595297633bb08f03db6a073"), - ("t10k-labels-idx1-ubyte" , "67da17c76eaffca5446c3361aaab5c3cd6d1c2608764d35dfb1850b086bf8dd5")] - isfile(file) && continue - @info "Downloading Fashion-MNIST dataset" - download_and_verify("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz", hash) - open(file, "w") do io - write(io, gzopen(read, "$file.gz")) - end - end - end -end - -const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte") -const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte") -const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte") -const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte") - -""" - images() - images(:test) -Load the Fashion-MNIST images. -Each image is a 28×28 array of `Gray` colour values -(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Return the 60,000 training images by default; pass `:test` to retrieve the -10,000 test images. -""" -function images(set = :train) - deprecation_message() - load() - io = IOBuffer(read(set == :train ? TRAINIMAGES : TESTIMAGES)) - _, N, nrows, ncols = imageheader(io) - [rawimage(io) for _ in 1:N] -end - -""" - labels() - labels(:test) -Load the labels corresponding to each of the images returned from [`images()`](@ref). -Each label is a number from 0-9. -Return the 60,000 training labels by default; pass `:test` to retrieve the -10,000 test labels. -""" -function labels(set = :train) - deprecation_message() - load() - io = IOBuffer(read(set == :train ? 
TRAINLABELS : TESTLABELS)) - _, N = labelheader(io) - [rawlabel(io) for _ = 1:N] -end - -end diff --git a/src/data/housing.jl b/src/data/housing.jl deleted file mode 100644 index 4202f4d822..0000000000 --- a/src/data/housing.jl +++ /dev/null @@ -1,120 +0,0 @@ -""" -1. Title: Boston Housing Data -2. Sources: - (a) Origin: This dataset was taken from the StatLib library which is - maintained at Carnegie Mellon University. - (b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the - demand for clean air', J. Environ. Economics & Management, - vol.5, 81-102, 1978. - (c) Date: July 7, 1993 -3. Number of Instances: 506 -4. Number of Attributes: 13 continuous attributes (including "class" - attribute "MEDV"), 1 binary-valued attribute. -5. Attribute Information: - 1. CRIM per capita crime rate by town - 2. ZN proportion of residential land zoned for lots over - 25,000 sq.ft. - 3. INDUS proportion of non-retail business acres per town - 4. CHAS Charles River dummy variable (= 1 if tract bounds - river; 0 otherwise) - 5. NOX nitric oxides concentration (parts per 10 million) - 6. RM average number of rooms per dwelling - 7. AGE proportion of owner-occupied units built prior to 1940 - 8. DIS weighted distances to five Boston employment centres - 9. RAD index of accessibility to radial highways - 10. TAX full-value property-tax rate per 10,000 dollars - 11. PTRATIO pupil-teacher ratio by town - 12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks - by town - 13. LSTAT % lower status of the population - 14. MEDV Median value of owner-occupied homes in 1000's of dollars - Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data -""" -module Housing - -using DelimitedFiles -using ..Data: deps, download_and_verify, deprecation_message - -#Uncomment if package exists -#const cache_prefix = "https://cache.julialang.org/" -const cache_prefix = "" - -function load() - isfile(deps("housing.data")) && return - - @info "Downloading the Boston housing Dataset" - download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", - deps("housing.data"), - "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a") - - #@info "Download complete. Working on the files" - path = deps() - isfile(deps("housing.data")) && touch(joinpath(path, "tempfile.data")) - open(joinpath(path, "tempfile.data"), "a") do fout - open(deps("housing.data"), "r") do fin - for line in eachline(fin) - line = replace(lstrip(line), r" +" => s",") - println(fout, line) - end - end - end - mv(joinpath(path, "tempfile.data"), deps("housing.data"), force=true) -end - -""" -Gets the targets for the Boston housing dataset, a 506 element array listing the targets for each example -```julia -julia> using Flux -julia> target = Flux.Data.Housing.targets() -julia> summary(target) -506×1 Array{Float64,2} -julia> target[1] -24.0 -""" -function targets() - deprecation_message() - load() - housing = readdlm(deps("housing.data"), ',') - reshape(Vector{Float64}(housing[1:end,end]), (506, 1)) -end - - -""" -Gets the names of the features provided in the dataset -""" -function feature_names() - ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"] -end - - -""" -Gets the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes. -The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"]. -It has 506 examples. 
-```julia -julia> using Flux -julia> features = Flux.Data.Housing.features() -julia> summary(features) -506×13 Array{Float64,2} -julia> features[1, :] -13-element Array{Float64,1}: -0.00632 -18.0 -2.31 -0.0 -0.538 - ⋮ -296.0 -15.3 -396.9 -4.98 -""" -function features() - deprecation_message() - load() - housing = readdlm(deps("housing.data"), ',') - Matrix{Float64}(housing[1:end, 1:13]) -end - - -end \ No newline at end of file diff --git a/src/data/iris.jl b/src/data/iris.jl deleted file mode 100644 index 4529aa8a40..0000000000 --- a/src/data/iris.jl +++ /dev/null @@ -1,69 +0,0 @@ -""" -Fisher's classic iris dataset. -Measurements from 3 different species of iris: setosa, versicolor and -virginica. There are 50 examples of each species. -There are 4 measurements for each example: sepal length, sepal width, -petal length and petal width. The measurements are in centimeters. -The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris). -""" -module Iris - -using DelimitedFiles -using ..Data: deps, download_and_verify, deprecation_message - -# Uncomment if the iris.data file is cached to cache.julialang.org. -const cache_prefix = "https://cache.julialang.org/" - -function load() - isfile(deps("iris.data")) && return - - @info "Downloading iris dataset." - download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", - deps("iris.data"), - "6f608b71a7317216319b4d27b4d9bc84e6abd734eda7872b71a458569e2656c0") -end - -""" - labels() -Get the labels of the iris dataset, a 150 element array of strings listing the -species of each example. -```julia -julia> labels = Flux.Data.Iris.labels(); -julia> summary(labels) -"150-element Array{String,1}" -julia> labels[1] -"Iris-setosa" -``` -""" -function labels() - deprecation_message() - load() - iris = readdlm(deps("iris.data"), ',') - Vector{String}(iris[1:end, end]) -end - -""" - features() -Get the features of the iris dataset. This is a 4x150 matrix of Float64 -elements. It has a row for each feature (sepal length, sepal width, -petal length, petal width) and a column for each example. 
-```julia -julia> features = Flux.Data.Iris.features(); -julia> summary(features) -"4×150 Array{Float64,2}" -julia> features[:, 1] -4-element Array{Float64,1}: - 5.1 - 3.5 - 1.4 - 0.2 -``` -""" -function features() - deprecation_message() - load() - iris = readdlm(deps("iris.data"), ',') - Matrix{Float64}(iris[1:end, 1:4]') -end - -end diff --git a/src/data/mnist.jl b/src/data/mnist.jl deleted file mode 100644 index 45e51178ae..0000000000 --- a/src/data/mnist.jl +++ /dev/null @@ -1,117 +0,0 @@ -module MNIST - -using CodecZlib, Colors -using ..Data: download_and_verify, deprecation_message - -const Gray = Colors.Gray{Colors.N0f8} - -const dir = if isnothing(@__DIR__) - joinpath("deps", "mnist") - else - joinpath(@__DIR__, "../../deps/mnist") -end - -function gzopen(f, file) - open(file) do io - f(GzipDecompressorStream(io)) - end -end - -function load() - mkpath(dir) - cd(dir) do - for (file, hash) in [("train-images-idx3-ubyte", "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609"), - ("train-labels-idx1-ubyte", "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c"), - ("t10k-images-idx3-ubyte" , "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6"), - ("t10k-labels-idx1-ubyte" , "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6")] - isfile(file) && continue - @info "Downloading MNIST dataset" - download_and_verify("https://cache.julialang.org/http://yann.lecun.com/exdb/mnist/$file.gz", "$file.gz", hash) - open(file, "w") do io - write(io, gzopen(read, "$file.gz")) - end - end - end -end - -const IMAGEOFFSET = 16 -const LABELOFFSET = 8 - -const NROWS = 28 -const NCOLS = 28 - -const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte") -const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte") -const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte") -const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte") - -function imageheader(io::IO) - magic_number = bswap(read(io, UInt32)) - total_items = bswap(read(io, UInt32)) - nrows = bswap(read(io, UInt32)) - ncols = bswap(read(io, UInt32)) - return magic_number, Int(total_items), Int(nrows), Int(ncols) -end - -function labelheader(io::IO) - magic_number = bswap(read(io, UInt32)) - total_items = bswap(read(io, UInt32)) - return magic_number, Int(total_items) -end - -function rawimage(io::IO) - img = Array{Gray}(undef, NCOLS, NROWS) - for i in 1:NCOLS, j in 1:NROWS - img[i, j] = reinterpret(Colors.N0f8, read(io, UInt8)) - end - return img -end - -function rawimage(io::IO, index::Integer) - seek(io, IMAGEOFFSET + NROWS * NCOLS * (index - 1)) - return rawimage(io) -end - -rawlabel(io::IO) = Int(read(io, UInt8)) - -function rawlabel(io::IO, index::Integer) - seek(io, LABELOFFSET + (index - 1)) - return rawlabel(io) -end - -getfeatures(io::IO, index::Integer) = vec(getimage(io, index)) - -""" - images() - images(:test) -Load the MNIST images. -Each image is a 28×28 array of `Gray` colour values -(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)). -Return the 60,000 training images by default; pass `:test` to retrieve the -10,000 test images. -""" -function images(set = :train) - deprecation_message() - load() - io = IOBuffer(read(set == :train ? TRAINIMAGES : TESTIMAGES)) - _, N, nrows, ncols = imageheader(io) - [rawimage(io) for _ in 1:N] -end - -""" - labels() - labels(:test) -Load the labels corresponding to each of the images returned from [`images()`](@ref). -Each label is a number from 0-9. 
-Return the 60,000 training labels by default; pass `:test` to retrieve the -10,000 test labels. -""" -function labels(set = :train) - deprecation_message() - load() - io = IOBuffer(read(set == :train ? TRAINLABELS : TESTLABELS)) - _, N = labelheader(io) - [rawlabel(io) for _ = 1:N] -end - -end # module diff --git a/src/data/sentiment.jl b/src/data/sentiment.jl deleted file mode 100644 index aae8f70930..0000000000 --- a/src/data/sentiment.jl +++ /dev/null @@ -1,73 +0,0 @@ -"Stanford Sentiment Treebank dataset." -module Sentiment - -using ZipFile -using ..Data: deps, download_and_verify, deprecation_message - -function load() - isfile(deps("sentiment.zip")) && return - @info "Downloading sentiment treebank dataset" - download_and_verify("https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip", - deps("sentiment.zip"), "5c613a4f673fc74097d523a2c83f38e0cc462984d847b82c7aaf36b01cbbbfcc") -end - -getfile(r, name) = r.files[findfirst(x -> x.name == name, r.files)] - -function getfile(name) - r = ZipFile.Reader(deps("sentiment.zip")) - text = read(getfile(r, "trees/$name"), String) - close(r) - return text -end - -using ..Data: Tree - -totree_(n, w) = Tree{Any}((parse(Int, n), w)) -totree_(n, a, b) = Tree{Any}((parse(Int, n), nothing), totree(a), totree(b)) -totree(t::Expr) = totree_(t.args...) - -function parsetree(s) - s = replace(s, "\\" => "") - s = replace(s, "\$" => "\\\$") - s = replace(s, r"[^ \n\(\)]+" => s -> "\"$s\"") - s = replace(s, " " => ", ") - return totree(Meta.parse(s)) -end - -function gettrees(name) - load() - ss = split(getfile("$name.txt"), '\n', keepempty = false) - return parsetree.(ss) -end - -""" - train() -Return the train split of the Stanford Sentiment Treebank. -The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. -""" -function train() - deprecation_message() - gettrees("train") -end - -""" - test() -Return the test split of the Stanford Sentiment Treebank. -The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. -""" -function test() - deprecation_message() - gettrees("test") -end - -""" - dev() -Return the dev split of the Stanford Sentiment Treebank. -The data is in [treebank](https://en.wikipedia.org/wiki/Treebank) format. -""" -function dev() - deprecation_message() - gettrees("dev") -end - -end \ No newline at end of file diff --git a/src/deprecations.jl b/src/deprecations.jl index e6e7360a22..e1921f4ca9 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,23 +1,4 @@ # v0.12 deprecations -@deprecate Dropout(p, dims) Dropout(p; dims=dims) -@deprecate InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, active=nothing) InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, true, true, active, length(β)) -@deprecate BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, active=nothing) BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, true, true, active, length(β)) -@deprecate GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, active=nothing) GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, true, true, active, length(β)) -@deprecate outdims(f, inputsize) outputsize(f, inputsize) -@deprecate Conv(; weight, bias, activation=identity, kws...) Conv(weight, bias, activation; kws...) -@deprecate ConvTranspose(; weight, bias, activation=identity, kws...) ConvTranspose(weight, bias, activation; kws...) -@deprecate DepthwiseConv(; weight, bias, activation=identity, kws...) DepthwiseConv(weight, bias, activation; kws...) 
- -function Base.getproperty(a::Dense, s::Symbol) - if s === :W - Base.depwarn("field name dense.W is deprecated in favour of dense.weight", :Dense) - return getfield(a, :weight) - elseif s === :b - Base.depwarn("field name dense.b is deprecated in favour of dense.bias", :Dense) - return getfield(a, :bias) - end - return getfield(a, s) -end function ones(dims...) Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones) @@ -34,7 +15,6 @@ zeros(T::Type, dims...) = Base.zeros(T, dims...) ones32(::Type, dims...) = throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) zeros32(::Type, dims...) = throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) - # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 diff --git a/src/functor.jl b/src/functor.jl index 4e76c924bd..b056ff9574 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -60,6 +60,8 @@ The behaviour of `params` on custom types can be customized using [`Functor.@fun # Examples ```jldoctest +julia> using Flux: params + julia> params(Chain(Dense(ones(2,3)), softmax)) # unpacks Flux models Params([[1.0 1.0 1.0; 1.0 1.0 1.0], [0.0, 0.0]]) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 42310d0b7c..3e22895e82 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -27,8 +27,12 @@ julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10, 5, tanh)), julia> m2(x) == (m2[:dec] ∘ m2[:enc])(x) true ``` + +For large models, there is a special type-unstable path which can reduce compilation +times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. +This feature is somewhat experimental, beware! """ -struct Chain{T<:Union{Tuple, NamedTuple}} +struct Chain{T<:Union{Tuple, NamedTuple, AbstractVector}} layers::T end @@ -44,10 +48,22 @@ end @functor Chain -applychain(::Tuple{}, x) = x -applychain(fs::Tuple, x) = applychain(tail(fs), first(fs)(x)) +(c::Chain)(x) = applychain(c.layers, x) + +@generated function applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ in 1:N]) + calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i in 1:N] + Expr(:block, calls...) 
+end -(c::Chain)(x) = applychain(Tuple(c.layers), x) +applychain(layers::NamedTuple, x) = applychain(Tuple(layers), x) + +function applychain(layers::AbstractVector, x) # type-unstable path, helps compile times + for f in layers + x = f(x) + end + x +end Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]) Base.getindex(c::Chain{<:NamedTuple}, i::AbstractArray) = @@ -60,6 +76,7 @@ function Base.show(io::IO, c::Chain) end _show_layers(io, layers::Tuple) = join(io, layers, ", ") _show_layers(io, layers::NamedTuple) = join(io, ["$k = $v" for (k, v) in pairs(layers)], ", ") +_show_layers(io, layers::AbstractVector) = (print(io, "["); join(io, layers, ", "); print(io, "]")) # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -132,24 +149,9 @@ struct Dense{F, M<:AbstractMatrix, B} end function Dense(in::Integer, out::Integer, σ = identity; - initW = nothing, initb = nothing, init = glorot_uniform, bias=true) - W = if initW !== nothing - Base.depwarn("keyword initW is deprecated, please use init (which similarly accepts a funtion like randn)", :Dense) - initW(out, in) - else - init(out, in) - end - - b = if bias === true && initb !== nothing - Base.depwarn("keyword initb is deprecated, please simply supply the bias vector, bias=initb(out)", :Dense) - initb(out) - else - bias - end - - return Dense(W, b, σ) + init = glorot_uniform, bias = true) + Dense(init(out, in), bias, σ) end @functor Dense @@ -188,21 +190,7 @@ struct Diagonal{T} β::T end -function Diagonal(sz::Integer...; initα = nothing, initβ = nothing) - α = if initα !== nothing - Base.depwarn("keyword initα is deprecated, please simply supply the desired vectors", :Diagonal) - initα(sz...) - else - ones32(sz...) - end - β = if initβ !== nothing - Base.depwarn("keyword initβ is deprecated, please simply supply the desired vectors", :Diagonal) - initβ(sz...) - else - zeros32(sz...) - end - Diagonal(α, β) -end +Diagonal(sz::Integer...) = Diagonal(ones32(sz...), zeros32(sz...)) @functor Diagonal diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 1cda764d0d..eb0ea8604e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -31,7 +31,7 @@ end """ Conv(filter, in => out, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1, [bias, weight, init]) + stride = 1, pad = 0, dilation = 1, groups = 1, [bias, init]) Standard convolutional layer. `filter` is a tuple of integers specifying the size of the convolutional kernel; @@ -61,11 +61,8 @@ Then: Keywords to control initialization of the layer: * `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. -* `weight` - Initial weights of the layer. Typically an array, and can be used to override - other configurations. By default, these are generated using [`convfilter`](@ref). * `bias` - Initial bias is zero by default, this can be disabled entirely by setting it to - [`Flux.Zeros()`](@ref) or equivalently `false`, or another vector provided as - `bias = randn(Float32, out)`. + `false`, or another vector explicitly as `bias = randn(Float32, out)`. See also [`ConvTranspose`](@ref), [`DepthwiseConv`](@ref), [`CrossCor`](@ref). 
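With the `weight` keyword removed from these convolution constructors, a layer built from hand-chosen weights now goes through the array-based method, which this patch keeps. A minimal sketch of that path (the array shapes and values here are illustrative, not taken from the patch):

```julia
using Flux

w = randn(Float32, 3, 3, 4, 5)  # (kernel height, kernel width, in channels, out channels)
b = zeros(Float32, 5)
c = Conv(w, b, relu; stride = 1, pad = 1)  # array-based constructor, no `weight` keyword needed

size(c(randn(Float32, 28, 28, 4, 1)))  # (28, 28, 5, 1): a 3×3 kernel with pad 1 preserves spatial size
```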
@@ -121,7 +118,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters julia> c1(randn(100, 4, 64)) |> size (98, 5, 64) -julia> params(c1) |> length +julia> Flux.params(c1) |> length 2 ``` """ @@ -136,8 +133,9 @@ end function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - weight = convfilter(k, ch; init, groups), bias = true) where N - + bias = true) where N + + weight = convfilter(k, ch; init, groups) Conv(weight, bias, σ; stride, pad, dilation, groups) end @@ -250,10 +248,10 @@ end function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - weight = convfilter(k, reverse(ch); init, groups), bias = true, ) where N + bias = true, ) where N + + weight = convfilter(k, reverse(ch); init, groups) ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @@ -334,7 +332,7 @@ struct DepthwiseConv{N,M,F,A,V} end """ - DepthwiseConv(weight::AbstractArray, bias, [activation; stride, pad, dilation]) + DepthwiseConv(weight::AbstractArray, [bias, activation; stride, pad, dilation]) Constructs a layer with the given weight and bias arrays. Accepts the same keywords as the `DepthwiseConv((4,4), 3 => 6, relu)` method. @@ -350,8 +348,9 @@ end function DepthwiseConv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = depthwiseconvfilter(k, ch, init = init), bias = true) where N + bias = true) where N @assert ch[2] % ch[1] == 0 "Output channels must be integer multiple of input channels" + weight = depthwiseconvfilter(k, ch, init = init) return DepthwiseConv(weight, bias, σ; stride, pad, dilation) end @@ -439,8 +438,9 @@ end function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - weight = convfilter(k, ch, init = init), bias = true) where N + bias = true) where N + weight = convfilter(k, ch, init = init) return CrossCor(weight, bias, σ; stride, pad, dilation) end diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 53cb391716..686140f5e1 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -43,12 +43,14 @@ dropout_mask(rng, x::CuArray, p; kwargs...) = throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function _dropout_mask(rng, x, p; dims=:) - y = rand!(rng, similar(x, _dropout_shape(x, dims))) + realfptype = float(real(eltype(x))) + y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) y .= _dropout_kernel.(y, p, 1 - p) return y end -ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) +# TODO move this to NNlib +Zygote.ChainRulesCore.@non_differentiable dropout_mask(rng, x, p) """ Dropout(p; dims=:, rng = rng_from_array()) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 14e3b8801e..9734990c1e 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -83,23 +83,10 @@ rnn.state = hidden(rnn.cell) reset!(m::Recur) = (m.state = m.cell.state0) reset!(m) = foreach(reset!, functor(m)[1]) - -# TODO remove in v0.13 -function Base.getproperty(m::Recur, sym::Symbol) - if sym === :init - Zygote.ignore() do - @warn "Recur field :init has been deprecated. To access initial state weights, use m::Recur.cell.state0 instead."
- end - return getfield(m.cell, :state0) - else - return getfield(m, sym) - end -end - flip(f, xs) = reverse(f.(reverse(xs))) function (m::Recur)(x::AbstractArray{T, 3}) where T - h = [m(view(x, :, :, i)) for i in 1:size(x, 3)] + h = [m(x_t) for x_t in eachslice(x, dims=3)] sze = size(h[1]) reshape(reduce(hcat, h), sze[1], sze[2], length(h)) end @@ -192,18 +179,6 @@ julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 RNN(a...; ka...) = Recur(RNNCell(a...; ka...)) Recur(m::RNNCell) = Recur(m, m.state0) -# TODO remove in v0.13 -function Base.getproperty(m::RNNCell, sym::Symbol) - if sym === :h - Zygote.ignore() do - @warn "RNNCell field :h has been deprecated. Use m::RNNCell.state0 instead." - end - return getfield(m, :state0) - else - return getfield(m, sym) - end -end - # LSTM struct LSTMCell{A,V,S} @@ -272,23 +247,6 @@ julia> l(rand(Float32, 3, 10)) |> size # batch size of 10 LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...)) Recur(m::LSTMCell) = Recur(m, m.state0) -# TODO remove in v0.13 -function Base.getproperty(m::LSTMCell, sym::Symbol) - if sym === :h - Zygote.ignore() do - @warn "LSTMCell field :h has been deprecated. Use m::LSTMCell.state0[1] instead." - end - return getfield(m, :state0)[1] - elseif sym === :c - Zygote.ignore() do - @warn "LSTMCell field :c has been deprecated. Use m::LSTMCell.state0[2] instead." - end - return getfield(m, :state0)[2] - else - return getfield(m, sym) - end -end - # GRU function _gru_output(gxs, ghs, bs) @@ -358,19 +316,6 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 GRU(a...; ka...) = Recur(GRUCell(a...; ka...)) Recur(m::GRUCell) = Recur(m, m.state0) -# TODO remove in v0.13 -function Base.getproperty(m::GRUCell, sym::Symbol) - if sym === :h - Zygote.ignore() do - @warn "GRUCell field :h has been deprecated. Use m::GRUCell.state0 instead." - end - return getfield(m, :state0) - else - return getfield(m, sym) - end -end - - # GRU v3 struct GRUv3Cell{A,V,S} diff --git a/src/layers/show.jl b/src/layers/show.jl index 85faec3c59..a37af36065 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -14,11 +14,12 @@ for T in [ end function _big_show(io::IO, obj, indent::Int=0, name=nothing) + pre, post = obj isa Chain{<:AbstractVector} ? ("([", "])") : ("(", ")") children = _show_children(obj) if all(_show_leaflike, children) _layer_show(io, obj, indent, name) else - println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), "(") + println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), pre) if obj isa Chain{<:NamedTuple} && children == getfield(obj, :layers) # then we insert names -- can this be done more generically? for k in Base.keys(obj) @@ -35,10 +36,10 @@ function _big_show(io::IO, obj, indent::Int=0, name=nothing) end end if indent == 0 # i.e. 
this is the outermost container - print(io, ")") + print(io, rpad(post, 2)) _big_finale(io, obj) else - println(io, " "^indent, "),") + println(io, " "^indent, post, ",") end end end @@ -90,18 +91,18 @@ function _big_finale(io::IO, m) noncnt = _childarray_sum(_->1, m) - length(ps) if noncnt > 0 nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled(io, " "^09, "# Total: ", length(ps), " trainable arrays, "; color=:light_black) + printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; color=:light_black) println(io, pars, " parameters,") printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, " parameters, summarysize "; color=:light_black) print(io, bytes, ".") else - printstyled(io, " "^19, "# Total: ", length(ps), " arrays, "; color=:light_black) + printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; color=:light_black) print(io, pars, " parameters, ", bytes, ".") end end end -_childarray_sum(f, x::AbstractArray) = f(x) +_childarray_sum(f, x::AbstractArray{<:Number}) = f(x) _childarray_sum(f, x) = isleaf(x) ? 0 : sum(y -> _childarray_sum(f, y), Functors.children(x)) # utility functions diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 4f47c8d058..ec7844e256 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -23,7 +23,7 @@ opt = Descent() opt = Descent(0.3) -ps = params(model) +ps = Flux.params(model) gs = gradient(ps) do loss(x, y) @@ -500,7 +500,7 @@ opt = ADAMW(0.001, (0.89, 0.995), 0.1) ``` """ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) = - Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η)) + Optimiser(ADAM(η, β), WeightDecay(decay)) """ AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 968ddd506f..ca8e15a643 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -29,16 +29,18 @@ import Flux: activations @test m == fmap(identity, m) # does not forget names @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name + + @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers end @testset "Activations" begin c = Chain(Dense(3,5,relu), Dense(5,1,relu)) X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], params(c)) + @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) c2 = Chain(enc = c[1], dec = c[2]) @test Flux.activations(c, X) == Flux.activations(c2, X) - @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], params(c2)) + @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) end @testset "Dense" begin @@ -126,7 +128,7 @@ import Flux: activations @testset "params" begin mo = Maxout(()->Dense(32, 64), 4) - ps = params(mo) + ps = Flux.params(mo) @test length(ps) == 8 #4 alts, each with weight and bias end end @@ -239,7 +241,7 @@ import Flux: activations Parallel(f_cnt, sin)(1) @test CNT[] == 3 end - + # Ref https://github.com/FluxML/Flux.jl/issues/1673 @testset "Input domain" begin struct Input @@ -276,7 +278,7 @@ import Flux: activations vocab_size, embed_size = 10, 4 m = Flux.Embedding(vocab_size, embed_size) @test size(m.weight) == (embed_size, vocab_size) - + x = rand(1:vocab_size, 3) y = m(x) @test y isa Matrix{Float32} @@ -297,3 +299,41 @@ import Flux: activations @test_throws DimensionMismatch m(OneHotVector(3, 1000)) end end + +@testset "second derivatives" begin + m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) + @test 
Zygote.hessian_dual(sum∘m1, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1, [1,2,3]) + + m1v = Chain([m1[1], m1[2]]) # vector of layers + @test Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_dual(sum∘m1, [1,2,3]) + @test_broken Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1v, [1,2,3]) + + # NNlib's softmax gradient writes in-place + m2 = Chain(Dense(3,4,tanh), Dense(4,2), softmax) + @test_broken Zygote.hessian_dual(sum∘m2, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m2, [1,2,3]) + + # https://github.com/FluxML/NNlib.jl/issues/362 + m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2,2)) + x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims=3) + @test Zygote.hessian_dual(sum∘m3, x3) ≈ Zygote.hessian_reverse(sum∘m3, x3) +end + +@testset "gradients of Chain{Vector}" begin + m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) + m1v = Chain([m1[1], m1[2]]) + @test sum(length, params(m1)) == sum(length, params(m1v)) + + x1 = randn(Float32,3,5) + @test m1(x1) ≈ m1v(x1) + + y1 = rand(Bool,2,5) + g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) + g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) + @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] + @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] + + @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] + z1 = rand(22); + @test Flux.destructure(m1)[2](z1)[1].weight ≈ Flux.destructure(m1v)[2](z1)[1].weight + # Note that Flux.destructure(m1v)[2](z) has a Chain{Tuple}, as does m1v[1:2] +end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 7730b2af20..9ce1a27aa0 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -58,7 +58,7 @@ end opt = Descent() for _ = 1:10^3 - gs = gradient(params(bias)) do + gs = gradient(Flux.params(bias)) do Flux.Losses.mse(bias(ip), op) end Flux.Optimise.update!(opt, params(bias), gs) @@ -160,7 +160,7 @@ end m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads + @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads x = zeros(Float32, 5, 5, 2, 4) m = ConvTranspose((3,3), 2=>3) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 9ab74e4a1d..7ae15aeff9 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -5,6 +5,11 @@ evalwgrad(f, x...) = pullback(f, x...)[1] @testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0+0im,2.0+1im,3.0+3im] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + x = [1.,2.,3.] 
@test x == Dropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(Dropout(0; rng_kwargs...), x) @@ -121,7 +126,7 @@ end 2.0 4.0 6.0] @test Flux.hasaffine(m) == true - @test length(params(m)) == 2 + @test length(Flux.params(m)) == 2 @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -205,7 +210,7 @@ end let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(params(m)) == 2 + @test length(Flux.params(m)) == 2 x = Float32.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -268,7 +273,7 @@ end x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true - @test length(params(m)) == 2 + @test length(Flux.params(m)) == 2 x = Float64.(x) y = m(x) μ = mean(x, dims=1) @@ -281,7 +286,7 @@ end let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false - @test length(params(m)) == 0 + @test length(Flux.params(m)) == 0 x = Float64.(x) y = m(x) @@ -348,10 +353,10 @@ end m = LayerNorm((2,3,4)) @test Flux.hasaffine(m) == true - @test length(params(m)) == 2 + @test length(Flux.params(m)) == 2 m = LayerNorm((2,3,4), affine=false) @test Flux.hasaffine(m) == false - @test length(params(m)) == 0 + @test length(Flux.params(m)) == 0 end @testset "GroupNorm" begin @@ -361,7 +366,7 @@ end let m = GroupNorm(4,2, track_stats=true), sizes = (3,4,2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(params(m)) == 2 + @test length(Flux.params(m)) == 2 x = Float32.(x) @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) @@ -453,3 +458,8 @@ end @test BN(x) ≈ GN(x) end end + +@testset "second derivatives" begin + m1 = Dropout(0.5) + @test Zygote.hessian_reverse(sum∘m1, [1.0,2.0,3.0]) == zeros(3, 3) +end diff --git a/test/runtests.jl b/test/runtests.jl index a6abd609d2..706f126451 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,9 +1,11 @@ using Flux using Flux.Data using Flux: OneHotArray, OneHotMatrix, OneHotVector +using Flux: params using Test using Random, Statistics, LinearAlgebra using IterTools: ncycle +using Zygote using CUDA Random.seed!(0)
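Two usage sketches for the changes above, assuming only what the patch itself adds; all names and sizes are illustrative. First, since `params` is no longer exported (the documentation hunks qualify it, and `test/runtests.jl` now imports it explicitly), a basic training step looks like this:

```julia
using Flux  # `params` must be qualified as Flux.params, or imported via `using Flux: params`

model = Chain(Dense(10, 5, relu), Dense(5, 2))  # illustrative model
x, y = rand(Float32, 10), rand(Float32, 2)      # illustrative data
loss(x, y) = Flux.Losses.mse(model(x), y)

θ = Flux.params(model)              # collect the trainable arrays
gs = gradient(() -> loss(x, y), θ)  # Zygote.Grads keyed by those arrays
Flux.Optimise.update!(Descent(0.1), θ, gs)
```

Second, the vector-of-layers `Chain` added in `src/layers/basic.jl` and exercised by the new tests can be compared against the usual tuple path like this:

```julia
using Flux

layers = [Dense(10, 5, tanh), Dense(5, 2)]
m_tuple = Chain(layers...)  # type-stable path: layers stored in a Tuple
m_vec   = Chain(layers)     # new type-unstable path, cheaper to compile for large models

x = randn(Float32, 10, 8)
@assert m_tuple(x) ≈ m_vec(x)  # same layer objects, so identical forward pass

# gradients flow through the plain loop in applychain(::AbstractVector, x)
gs = gradient(() -> sum(m_vec(x)), Flux.params(m_vec))
```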