Skip to content

Commit

Permalink
add bytefallback bpe
Browse files Browse the repository at this point in the history
  • Loading branch information
chengchingwen committed Jul 29, 2023
1 parent 43914a1 commit c6cb514
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 19 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
name = "BytePairEncoding"
uuid = "a4280ba5-8788-555a-8ca8-4a8c3d966a71"
authors = ["chengchingwen <[email protected]>"]
version = "0.3.1"
version = "0.3.2"

[deps]
DoubleArrayTries = "abbaa0e5-f788-499c-92af-c35ff4258c82"
StructWalk = "31cdf514-beb7-4750-89db-dda9d2eb8d3d"
TextEncodeBase = "f92c20c0-9f2a-4705-8116-881385faba05"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
DoubleArrayTries = "0.1"
StructWalk = "0.2"
TextEncodeBase = "0.5.4, 0.6"
julia = "1.6"
Expand Down
1 change: 1 addition & 0 deletions src/BytePairEncoding.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export BPE, BPETokenization

include("mstring.jl")
include("bpe.jl")
include("bytefallback.jl")
include("tokenization.jl")
include("learn.jl")
include("gpt2_utils.jl")
Expand Down
25 changes: 13 additions & 12 deletions src/bpe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ end
BPE(merging_rank::Dict; sepsym = nothing, endsym = nothing) = BPE(merging_rank, sepsym, endsym)
BPE(bpefile; sepsym = nothing, endsym = nothing, kws...) = BPE(read_merges(bpefile, endsym; kws...); sepsym, endsym)

(bpe::BPE)(x) = bytepairencode(x, bpe.merging_rank, bpe.sepsym, bpe.endsym)
(bpe::BPE)(x) = bytepairencode(bpe, x)

function Base.show(io::IO, bpe::BPE)
print(io, "BPE(")
Expand Down Expand Up @@ -101,20 +101,21 @@ function merge!(ms, i)
return @inbounds @view(ms[1:desidx])
end

function merges(x::AbstractString, endsym = nothing)
buf = map(Merge, graphemes(x))
if endsym !== nothing
@inbounds buf[end] = Merge(buf[end], true)
end
return buf
function merges(x::AbstractString, endsym::Union{Nothing, AbstractString} = nothing)
buf = map(Merge, graphemes(x))
if endsym !== nothing
@inbounds buf[end] = Merge(buf[end], true)
end
return buf
end
merges(bpe::AbstractBPE, x::AbstractString) = merges(x, bpe.endsym)

function bytepairencode(x, merging_rank, sepsym = nothing, endsym = nothing)
ms = merges(x, endsym)
function bytepairencode(bpe::AbstractBPE, x::AbstractString)
ms = merges(bpe, x)
if length(ms) < 2
y = [Merge(x, !isnothing(endsym))]
y = [Merge(x, !isnothing(bpe.endsym))]
else
y = merge_loop!(merging_rank, ms, x)
y = merge_loop!(bpe.merging_rank, ms, x)
end
return as_string.(y, sepsym, endsym)
return as_string.(y, bpe.sepsym, bpe.endsym)
end
47 changes: 47 additions & 0 deletions src/bytefallback.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import DoubleArrayTries
using DoubleArrayTries: DoubleArrayTrie, StringView

# BPE model with byte fallback: a subtype of `AbstractBPE` that carries a vocabulary
# trie in addition to the merge ranks and the optional separator / end symbols.
struct ByteFallbackBPE <: AbstractBPE
# Vocabulary trie; `merges(::ByteFallbackBPE, x)` looks every character of the input up here.
vocab::DoubleArrayTrie
# Rank of each mergeable pair of `Merge`s; handed to `merge_loop!` by `bytepairencode`.
merging_rank::Dict{NTuple{2, Merge}, Int}
# Symbol appended by `as_string` to pieces whose `extra` flag is false; `nothing` appends nothing.
sepsym::Union{String, Nothing}
# Symbol appended by `as_string` to pieces whose `extra` flag is true; `nothing` appends nothing
# and also disables flagging of the last piece in `merges`.
endsym::Union{String, Nothing}
end

"""
    ByteFallbackBPE(vocab_list::AbstractVector{String}, merging_rank, sepsym, endsym)

Build a `ByteFallbackBPE` from a list of vocabulary strings. A `DoubleArrayTrie` is
constructed from a collected copy of `vocab_list`; the remaining arguments are passed
through to the field constructor unchanged.
"""
function ByteFallbackBPE(
    vocab_list::AbstractVector{String}, merging_rank, sepsym, endsym
)
    trie = DoubleArrayTrie(collect(vocab_list))
    return ByteFallbackBPE(trie, merging_rank, sepsym, endsym)
end

# Calling a `ByteFallbackBPE` on `x` forwards to `bytepairencode(bpe, x)`.
function (bpe::ByteFallbackBPE)(x)
    return bytepairencode(bpe, x)
end

# Print a one-line summary of the model: the number of merge rules, followed by the
# separator and end symbols when they are set.
function Base.show(io::IO, bpe::ByteFallbackBPE)
    print(io, "ByteFallbackBPE(", length(bpe.merging_rank), " merges")
    if bpe.sepsym !== nothing
        print(io, ", sepsym = ", bpe.sepsym)
    end
    if bpe.endsym !== nothing
        print(io, ", endsym = ", bpe.endsym)
    end
    print(io, ')')
    return nothing
end

"""
    merges(bpe::ByteFallbackBPE, x::AbstractString)

Split `x` into the initial sequence of `Merge` pieces for byte-fallback BPE.

Every character of `x` is looked up in `bpe.vocab`. When the lookup returns a non-zero
index, the character becomes a single `Merge` covering all of its code units. When the
lookup returns zero, the character is broken into one `Merge` per code unit, each with
its `byte` flag set. If `bpe.endsym` is not `nothing`, the last piece is rebuilt with
its `extra` flag set to `true`.

Returns a `Vector{Merge}`. The vector is empty when `x` yields no pieces (for example
an empty string whose lookup returns zero).
"""
function merges(bpe::ByteFallbackBPE, x::AbstractString)
    vocab = bpe.vocab
    y = Vector{Merge}()
    offset = 0  # zero-based code unit offset of the current character within `x`
    for c in split(x, "")
        nu = ncodeunits(c)
        if iszero(DoubleArrayTries.lookup(vocab, c))
            # Character is not in the vocabulary: emit one byte-flagged piece per code unit.
            for _ in 1:nu
                push!(y, Merge(x, offset, 1, false, true))
                offset += 1
            end
        else
            push!(y, Merge(x, offset, nu, false))
            offset += nu
        end
    end
    # Only flag the last piece when there is one: with no pieces, `y[end]` would be an
    # out-of-bounds access (and undefined behaviour under `@inbounds`).
    if bpe.endsym !== nothing && !isempty(y)
        y[end] = Merge(y[end], true)
    end
    return y
end
22 changes: 16 additions & 6 deletions src/mstring.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
using DoubleArrayTries: StringView

# A piece of a string under BPE merging, stored as a window of code units over a
# backing `String` plus two flags that control how `as_string` renders it.
struct Merge
# Backing string that the window refers to.
string::String
# Zero-based code unit offset of the window; `as_string` reads code units starting at `offset + 1`.
offset::UInt16
# Number of code units covered by the window.
ncodeunits::UInt16
# When true `as_string` appends `endsym` to the piece, otherwise `sepsym`.
extra::Bool
# When true `as_string` renders every code unit of the window as a `<0xNN>` hex token
# instead of the text itself.
byte::Bool
end

Merge(str, offset::Int, ncodeunits::Int, extra) = Merge(str, UInt16(offset), UInt16(ncodeunits), extra)
Merge(a::Merge, e::Bool) = Merge(a.string, a.offset, a.ncodeunits, e)
Merge(s::SubString, e::Bool = false) = Merge(s.string, s.offset, s.ncodeunits, e)
Merge(str, offset::Int, ncodeunits::Int, extra, byte = false) = Merge(str, UInt16(offset), UInt16(ncodeunits), extra, byte)
Merge(a::Merge, e::Bool) = Merge(a.string, a.offset, a.ncodeunits, e, a.byte)
Merge(s::SubString, e::Bool = false) = Merge(s.string, s.offset, s.ncodeunits, e, false)
Merge(s::String, e::Bool = false) = Merge(SubString(s), e)

function Merge(a::Merge, b::Merge)
Expand All @@ -22,7 +25,7 @@ function Merge(a::Merge, b::Merge)
error("merge two Merge at same offset: partial string?")
end
nunits = a.ncodeunits + b.ncodeunits
return Merge(a.string, offset, nunits, b.extra)
return Merge(a.string, offset, nunits, b.extra, a.byte & b.byte)
else
error("merge different Merge")
end
Expand Down Expand Up @@ -92,7 +95,7 @@ function write_merges(io::IO, rank, endsym = nothing; limit = typemax(Int), comm
end

function Base.hash(m::Merge, h::UInt)
h = hash(m.extra, h) + Base.memhash_seed
h = hash(m.byte, hash(m.extra, h)) + Base.memhash_seed
str_size = m.ncodeunits * sizeof(UInt8)
str = m.string
ptr = convert(Ptr{UInt8}, pointer(str)) + m.offset
Expand All @@ -103,6 +106,7 @@ function Base.:(==)(m1::Merge, m2::Merge)
m1.extra == m2.extra || return false
s = m1.ncodeunits
s == m2.ncodeunits || return false
m1.byte == m2.byte || return false
str1 = m1.string
str2 = m2.string
p1 = convert(Ptr{UInt8}, pointer(str1)) + m1.offset
Expand All @@ -113,7 +117,13 @@ end
"""
    as_string(m::Merge, sepsym, endsym)

Render the piece `m` as a `String`.

The code units `m.offset + 1` through `m.offset + m.ncodeunits` of `m.string` are
selected. If `m.byte` is set, each selected code unit is written as `<0xNN>` (two
uppercase hexadecimal digits) and the tokens are concatenated; otherwise the selected
code units are used as text directly. The symbol chosen by `m.extra` (`endsym` when
true, `sepsym` when false) is appended unless it is `nothing`.
"""
function as_string(m::Merge, sepsym, endsym)
    cu = codeunits(m.string)
    # Widen to `Int` before adding: both fields are `UInt16`, so their sum could wrap.
    lo = Int(m.offset) + 1
    hi = Int(m.offset) + Int(m.ncodeunits)
    span = lo:hi
    if m.byte
        s = join(("<0x$(uppercase(string(cu[i]; base=16, pad=2)))>" for i in span))
    else
        # View the raw code units as a string; unlike character-indexed slicing this
        # does not require the window to start or end on a character boundary.
        s = StringView(@view(cu[span]))
    end
    sym = m.extra ? endsym : sepsym
    return isnothing(sym) ? String(s) : string(s, sym)
end

2 comments on commit c6cb514

@chengchingwen
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/88648

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.2 -m "<description of version>" c6cb514f027c9cbf74f731a8ea88422b443ef7c5
git push origin v0.3.2

Please sign in to comment.