diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 187695a..0c785da 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -3,8 +3,13 @@ on:
   push:
     branches:
       - master
+    tags: '*'
   pull_request:
-
+concurrency:
+  # Skip intermediate builds: always.
+  # Cancel intermediate builds: only if it is a pull request build.
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ matrix.julia-threads }} thread(s) - ${{ github.event_name }}
diff --git a/Project.toml b/Project.toml
index 93c5d95..c2e2667 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,18 +11,16 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 
 [compat]
-Adapt = "3.3"
-CUDA = "3, 4"
+Adapt = "3, 4"
+CUDA = "3, 4, 5"
 ChainRulesCore = "1.3"
-GPUArrays = "8"
+GPUArrays = "8, 9, 10"
 GPUArraysCore = "0.1"
-NNlib = "0.7, 0.8"
-NNlibCUDA = "0.2"
+NNlib = "0.7, 0.8, 0.9"
 Requires = "1.1"
 Static = "0.7, 0.8"
 julia = "1.6"
diff --git a/src/NeuralAttentionlib.jl b/src/NeuralAttentionlib.jl
index 8d40af9..8827d94 100644
--- a/src/NeuralAttentionlib.jl
+++ b/src/NeuralAttentionlib.jl
@@ -9,7 +9,6 @@ import GPUArraysCore
 
 using ChainRulesCore
 using NNlib
-using NNlibCUDA
 
 using Requires
 
diff --git a/src/mask/mask.jl b/src/mask/mask.jl
index afc2371..ab16786 100644
--- a/src/mask/mask.jl
+++ b/src/mask/mask.jl
@@ -138,7 +138,7 @@ Base.@propagate_inbounds Base.getindex(m::M, I::Integer...) where {M <: Union{<:
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, i::CartesianIndex) = m[Tuple(i)]
 Base.@propagate_inbounds Base.getindex(m::MaskIndexer, I::Tuple) = m[I...]
 
-Adapt.adapt(to::CUDA.Adaptor, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))
+Adapt.adapt(to, m::AbstractArrayMask) = Indexer{typeof(m)}(map(Base.Fix1(Adapt.adapt, to), GetIndexer(m).__fields))
 
 randomness(::AbstractMask) = static(false)
 require_dest(::AbstractMask) = static(false)
diff --git a/src/mask/wrapper.jl b/src/mask/wrapper.jl
index 0b0e3aa..46fc3c9 100644
--- a/src/mask/wrapper.jl
+++ b/src/mask/wrapper.jl
@@ -9,7 +9,7 @@ AttenMask(m::FlipMask) = FlipMask(AttenMask(m.mask))
 Base.:!(m::AbstractMask) = FlipMask(m)
 Base.:!(m::FlipMask) = m.mask
 
-Adapt.adapt(to::CUDA.Adaptor, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))
+Adapt.adapt(to, m::FlipMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask),))
 adapt_structure(to, x::FlipMask) = FlipMask(adapt(to, x.mask))
 
 GetIndexer(m::FlipMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size),), dest_size)
@@ -43,7 +43,7 @@ Base.:|(::Nothing, m::AbstractMask) = nothing
 Base.:&(m::AbstractMask, ::Nothing) = m
 Base.:&(::Nothing, m::AbstractMask) = m
 
-Adapt.adapt(to::CUDA.Adaptor, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),
+Adapt.adapt(to, m::CombinedMask) = Indexer{typeof(m)}((f = adapt(to, m.f),
                                                              masks = map(Base.Fix1(adapt, to), m.masks)))
 adapt_structure(to, x::CombinedMask) = CombinedMask(x.f, adapt(to, x.masks))
 GetIndexer(m::CombinedMask, dest_size = nothing) = Indexer{typeof(m)}((m.f, masks = map(Base.Fix2(GetIndexer, dest_size), m.masks)))
@@ -101,7 +101,7 @@ function BatchedMask(mask)
     return BatchedMask(mask, batch_dim)
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))
+Adapt.adapt(to, m::BatchedMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), batch_dim = static(m.batch_dim)))
 adapt_structure(to, x::BatchedMask) = BatchedMask(adapt(to, x.mask), x.batch_dim)
 
 GetIndexer(m::BatchedMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), batch_dim = static(m.batch_dim)))
@@ -138,7 +138,7 @@ end
 
 AttenMask(r::RepeatMask) = RepeatMask(AttenMask(r.mask), r.num)
 
-Adapt.adapt(to::CUDA.Adaptor, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))
+Adapt.adapt(to, m::RepeatMask) = Indexer{typeof(m)}((mask = adapt(to, m.mask), num = m.num))
 adapt_structure(to, x::RepeatMask) = RepeatMask(adapt(to, x.mask), x.num)
 
 GetIndexer(m::RepeatMask, dest_size = nothing) = Indexer{typeof(m)}((mask = GetIndexer(m.mask, dest_size), num = m.num))
@@ -176,7 +176,7 @@ struct BiSequenceMask{QM<:AbstractMask, KM<:AbstractMask} <: AbstractWrapperMask
     k_mask::KM
 end
 
-Adapt.adapt(to::CUDA.Adaptor, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))
+Adapt.adapt(to, m::BiSequenceMask) = Indexer{typeof(m)}((q_mask = adapt(to, m.q_mask), k_mask = adapt(to, m.k_mask)))
 adapt_structure(to, x::BiSequenceMask) = BiSequenceMask(adapt(to, x.q_mask), adapt(to, x.k_mask))
 
 bi_dest_size(::Nothing, is_q) = nothing
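Not part of the diff: a minimal, self-contained sketch of the Adapt.jl dispatch pattern the mask changes rely on. Dropping the ::CUDA.Adaptor constraint means any back end's adaptor can be passed through, so the package no longer needs to reference CUDA internals. DeviceAdaptor and ToyMask below are hypothetical stand-ins used only for illustration; the package itself overloads Adapt.adapt directly for its mask types, as the hunks above show, but the idea of an untyped first argument is the same.

using Adapt

struct DeviceAdaptor end   # hypothetical adaptor; a GPU back end would supply its own
Adapt.adapt_storage(::DeviceAdaptor, x::AbstractArray) = copy(x)   # stand-in for a real device upload

struct ToyMask{A <: AbstractArray}   # hypothetical array-backed mask
    mask::A
end

# Untyped first argument, as in the diff: works for any adaptor, so no reference
# to CUDA.Adaptor (or any other back end's internals) is needed here.
Adapt.adapt_structure(to, m::ToyMask) = ToyMask(Adapt.adapt(to, m.mask))

m = ToyMask(rand(Bool, 4, 4))
adapted = Adapt.adapt(DeviceAdaptor(), m)   # field adapted via DeviceAdaptor, wrapper rebuilt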