From 539497d345c87ccf0030141d55c2051e4fcc14a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 26 Jun 2024 07:26:53 +0200
Subject: [PATCH 1/4] Add a thorough test of constructing float segments

Since the next commits touch code that create float segments,
make sure that we have thorough tests for constructing
float segments.
---
 erts/emulator/test/bs_construct_SUITE.erl | 252 +++++++++++++++++++++-
 1 file changed, 250 insertions(+), 2 deletions(-)

diff --git a/erts/emulator/test/bs_construct_SUITE.erl b/erts/emulator/test/bs_construct_SUITE.erl
index 42de143791a5..8fbada37e6ac 100644
--- a/erts/emulator/test/bs_construct_SUITE.erl
+++ b/erts/emulator/test/bs_construct_SUITE.erl
@@ -30,7 +30,7 @@
 	 otp_7422/1, zero_width/1, bad_append/1, bs_append_overflow/1,
          bs_append_offheap/1,
          reductions/1, fp16/1, zero_init/1, error_info/1, little/1,
-         heap_binary_unit/1
+         heap_binary_unit/1, floats/1
         ]).
 
 -include_lib("common_test/include/ct.hrl").
@@ -46,7 +46,7 @@ all() ->
      copy_writable_binary, kostis, dynamic, otp_7422, zero_width,
      bad_append, bs_append_overflow, bs_append_offheap,
      reductions, fp16, zero_init,
-     error_info, little, heap_binary_unit].
+     error_info, little, heap_binary_unit, floats].
 
 init_per_suite(Config) ->
     Config.
@@ -1634,6 +1634,254 @@ heap_binary_unit_2(Variant, Rest) ->
             {error2, Bin2}
     end.
 
+floats(_Config) ->
+    _ = rand:uniform(),				%Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]),
+
+    %% Random floats.
+    _ = [do_float(rand:uniform() * math:pow(10.0, rand:uniform(20))) ||
+            _ <- lists:seq(1, 20)],
+
+    %% Random floats with powers of 10 near the upper limit representable
+    %% as a 64-bit float.
+    _ = [do_float(rand:uniform() * math:pow(10.0, 300 + rand:uniform(7))) ||
+            _ <- lists:seq(1, 10)],
+
+    %% Random small integers.
+    _ = [do_float(rand:uniform(1_000_000)) || _ <- lists:seq(1, 10)],
+
+    %% Random big integers.
+    _ = [do_float(rand:uniform(1_000_000) bsl 64) || _ <- lists:seq(1, 10)],
+
+    do_float(-0.0),
+    do_float(+0.0),
+
+    ok.
+
+do_float(F) ->
+    do_float(F, 0).
+
+do_float(_F, 32) ->
+    ok;
+do_float(F, N) ->
+    Pad = rand:uniform(1 bsl N) - 1,
+    true = is_integer(Pad),
+
+    do_float_be_16(F, N, Pad),
+    do_float_be_32(F, N, Pad),
+    do_float_be_64(F, N, Pad),
+
+    do_float_le_16(F, N, Pad),
+    do_float_le_32(F, N, Pad),
+    do_float_le_64(F, N, Pad),
+
+    do_float(F, N + 1).
+
+do_float_be_16(F, N, Pad) ->
+    FloatBin = id(<<F:16/big-float>>),
+    Bin = id(<<Pad:N, F:16/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):16/big-float>>),
+    Bin = <<Pad:N, F:(id(16))/big-float>>,
+    Bin = <<Pad:N, (id(F)):(id(16))/big-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:16/big-float>>),
+            Bin = <<Pad:N, F:(id(16))/big-float>>,
+            Bin = case N of
+                      15 -> <<Pad:N, F:16/big-float>>;
+                      21 -> <<Pad:N, F:16/big-float>>;
+                      _ -> <<Pad:N, F:16/big-float>>
+                  end;
+        is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:16/big-float>>),
+            Bin = <<Pad:N, F:(id(16))/big-float>>,
+            Bin = case N of
+                      1 -> <<Pad:N, F:16/big-float>>;
+                      19 -> <<Pad:N, F:16/big-float>>;
+                      _ -> <<Pad:N, F:16/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_be_32(F, N, Pad) ->
+    FloatBin = id(<<F:32/big-float>>),
+    Bin = id(<<Pad:N, F:32/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):32/big-float>>),
+    Bin = <<Pad:N, F:(id(32))/big-float>>,
+    Bin = <<Pad:N, (id(F)):(id(32))/big-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:32/big-float>>),
+            Bin = <<Pad:N, F:(id(32))/big-float>>,
+            Bin = case N of
+                      1 -> <<Pad:N, F:32/big-float>>;
+                      6 -> <<Pad:N, F:32/big-float>>;
+                      _ -> <<Pad:N, F:32/big-float>>
+                  end;
+        is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:32/big-float>>),
+            Bin = <<Pad:N, F:(id(32))/big-float>>,
+            Bin = case N of
+                      8 -> <<Pad:N, F:32/big-float>>;
+                      12 -> <<Pad:N, F:32/big-float>>;
+                      _ -> <<Pad:N, F:32/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_be_64(F, N, Pad) ->
+    FloatBin = id(<<F:64/big-float>>),
+    Bin = id(<<Pad:N, F:64/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):64/big-float>>),
+    Bin = <<Pad:N, F:(id(64))/big-float>>,
+    Bin = <<Pad:N, (id(F)):(id(64))/big-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:64/big-float>>),
+            Bin = <<Pad:N, F:(id(64))/big-float>>,
+            Bin = case N of
+                      7 -> <<Pad:N, F:64/big-float>>;
+                      13 -> <<Pad:N, F:64/big-float>>;
+                      _ -> <<Pad:N, F:64/big-float>>
+                  end,
+
+            %% Match out the original float.
+            <<Pad:N, F:64/big-float>> = Bin;
+        is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:64/big-float>>),
+            Bin = <<Pad:N, F:(id(64))/big-float>>,
+            Bin = case N of
+                      7 -> <<Pad:N, F:64/big-float>>;
+                      13 -> <<Pad:N, F:64/big-float>>;
+                      _ -> <<Pad:N, F:64/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_16(F, N, Pad) ->
+    FloatBin = id(<<F:16/little-float>>),
+    Bin = id(<<Pad:N, F:16/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):16/little-float>>),
+    Bin = <<Pad:N, F:(id(16))/little-float>>,
+    Bin = <<Pad:N, (id(F)):(id(16))/little-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:16/little-float>>),
+            Bin = <<Pad:N, F:(id(16))/little-float>>,
+            Bin = case N of
+                      11 -> <<Pad:N, F:16/little-float>>;
+                      27 -> <<Pad:N, F:16/little-float>>;
+                      _ -> <<Pad:N, F:16/little-float>>
+                  end;
+        is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:16/little-float>>),
+            Bin = <<Pad:N, F:(id(16))/little-float>>,
+            Bin = case N of
+                      7 -> <<Pad:N, F:16/little-float>>;
+                      13 -> <<Pad:N, F:16/little-float>>;
+                      _ -> <<Pad:N, F:16/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_32(F, N, Pad) ->
+    FloatBin = id(<<F:32/little-float>>),
+    Bin = id(<<Pad:N, F:32/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):32/little-float>>),
+    Bin = <<Pad:N, F:(id(32))/little-float>>,
+    Bin = <<Pad:N, (id(F)):(id(32))/little-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:32/little-float>>),
+            Bin = <<Pad:N, F:(id(32))/little-float>>,
+            Bin = case N of
+                      9 -> <<Pad:N, F:32/little-float>>;
+                      29 -> <<Pad:N, F:32/little-float>>;
+                      _ -> <<Pad:N, F:32/little-float>>
+                  end;
+       is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:32/little-float>>),
+            Bin = <<Pad:N, F:(id(32))/little-float>>,
+            Bin = case N of
+                      7 -> <<Pad:N, F:32/little-float>>;
+                      13 -> <<Pad:N, F:32/little-float>>;
+                      _ -> <<Pad:N, F:32/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_64(F, N, Pad) ->
+    FloatBin = id(<<F:64/little-float>>),
+    Bin = id(<<Pad:N, F:64/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):64/little-float>>),
+    Bin = <<Pad:N, F:(id(64))/little-float>>,
+    Bin = <<Pad:N, (id(F)):(id(64))/little-float>>,
+    <<Pad:N, FloatBin/binary>> = Bin,
+
+    if
+        is_float(F) ->
+            %% Construct float segment of a known float.
+            FloatBin = id(<<F:64/little-float>>),
+            Bin = <<Pad:N, F:(id(64))/little-float>>,
+            Bin = case N of
+                      9 -> <<Pad:N, F:64/little-float>>;
+                      29 -> <<Pad:N, F:64/little-float>>;
+                      _ -> <<Pad:N, F:64/little-float>>
+                  end,
+
+            %% Match out the original float.
+            <<Pad:N, F:64/little-float>> = Bin;
+        is_integer(F) ->
+            %% Construct float segment of a known integer.
+            FloatBin = id(<<F:64/little-float>>),
+            Bin = <<Pad:N, F:(id(64))/little-float>>,
+            Bin = case N of
+                      7 -> <<Pad:N, F:64/little-float>>;
+                      13 -> <<Pad:N, F:64/little-float>>;
+                      _ -> <<Pad:N, F:64/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+
 %%%
 %%% Common utilities.
 %%%

From 03c61a43739effe7a8d364bffcf81b7d2ca171db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 8 May 2024 12:13:49 +0200
Subject: [PATCH 2/4] Get rid of macros for bit syntax construction

The first implementation of the bit syntax was done in Erlang/OTP R7B
before the SMP emulator (the multi-threaded runtime system).
Therefore, global variables were used to keep track of the binary
being constructed.

When the SMP emulator was introduced, the state of the
binary being constructed needed to be local for each
scheduler thread. To reduce the need for extensive rewrites,
macros were used to automagically pass references
to the binary construction state to the helper functions
in erl_bits.c.

Now bite the bullet and remove these macros. Also take the opportunity
to remove the "new" part from function names. They are not exactly
new.
---
 erts/emulator/beam/emu/bs_instrs.tab       |  35 +++--
 erts/emulator/beam/erl_bits.c              | 149 +++++++++++----------
 erts/emulator/beam/erl_bits.h              |  60 ++++-----
 erts/emulator/beam/jit/arm/instr_bs.cpp    |  84 ++++++------
 erts/emulator/beam/jit/beam_jit_common.cpp |  10 +-
 erts/emulator/beam/jit/beam_jit_common.hpp |   2 +-
 erts/emulator/beam/jit/x86/instr_bs.cpp    |  76 ++++++-----
 7 files changed, 209 insertions(+), 207 deletions(-)

diff --git a/erts/emulator/beam/emu/bs_instrs.tab b/erts/emulator/beam/emu/bs_instrs.tab
index 1150b1172781..0024bae6e34f 100644
--- a/erts/emulator/beam/emu/bs_instrs.tab
+++ b/erts/emulator/beam/emu/bs_instrs.tab
@@ -373,7 +373,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
     const BeamInstr* p;
     Uint alloc = $Alloc;
     Eterm new_binary;
-    ERL_BITS_DECLARE_STATEP; /* Has to be last declaration */
+    ErlBitsState* EBS = ERL_BITS_EBS_FROM_REG(reg);
 
     /* We count the total number of bits in an unsigned integer. To avoid
      * having to check for overflow when adding to `num_bits`, we ensure that
@@ -546,7 +546,6 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
     }
 
     /* Allocate binary. */
-    ERL_BITS_RELOAD_STATEP(c_p);
     p = p_start;
     if (p[0] == BSC_APPEND) {
         Uint live = $Live;
@@ -565,15 +564,13 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         }
         p_start += BSC_NUM_ARGS;
     } else if (p[0] == BSC_PRIVATE_APPEND) {
-        Uint unit;
         Eterm Src;
 
         $test_heap(alloc, $Live);
 
-        $BS_LOAD_UNIT(p, unit);
         $BS_LOAD_SRC(p, Src);
 
-        new_binary = erts_bs_private_append_checked(c_p, Src, num_bits, unit);
+        new_binary = erts_bs_private_append_checked(EBS, c_p, Src, num_bits);
 
         if (is_non_value(new_binary)) {
             $BS_FAIL_INFO($Fail, c_p->freason, c_p->fvalue, Src);
@@ -589,7 +586,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         /* num_bits = Number of bits to build
          * alloc = Total number of words to allocate on heap
          */
-        erts_bin_offset = 0;
+        EBS->erts_bin_offset = 0;
         if (num_bits <= ERL_ONHEAP_BITS_LIMIT) {
             ErlHeapBits *hb;
 
@@ -598,7 +595,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             HTOP += heap_bits_size(num_bits);
             hb->thing_word = header_heap_bits(num_bits);
             hb->size = num_bits;
-            erts_current_bin = (byte *) hb->data;
+            EBS->erts_current_bin = (byte *) hb->data;
             new_binary = make_bitstring(hb);
         } else {
             Binary* bptr;
@@ -608,7 +605,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
                             $Live);
 
             bptr = erts_bin_nrml_alloc(NBYTES(num_bits));
-            erts_current_bin = (byte *)bptr->orig_bytes;
+            EBS->erts_current_bin = (byte *)bptr->orig_bytes;
 
             LIGHT_SWAPOUT;
 
@@ -616,7 +613,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
                                                   &MSO(c_p).overhead,
                                                   &HEAP_TOP(c_p),
                                                   bptr,
-                                                  erts_current_bin,
+                                                  EBS->erts_current_bin,
                                                   0,
                                                   num_bits);
 
@@ -640,7 +637,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             byte* string;
             $BS_LOAD_STRING_SRC(p, string);
             $BS_LOAD_FIXED_SIZE(p, Size);
-            erts_new_bs_put_string(ERL_BITS_ARGS_2(string, Size));
+            erts_bs_put_string(EBS, string, Size);
             continue;
         }
 
@@ -649,7 +646,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         switch (p[0]) {
         case BSC_BINARY_ALL:
             $BS_LOAD_UNIT(p, unit);
-            if (!erts_new_bs_put_binary_all(c_p, Src, unit)) {
+            if (!erts_bs_put_binary_all(EBS, c_p, Src, unit)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_unit, Src);
             }
             break;
@@ -658,14 +655,14 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SIZE(p, Size);
             $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-            if (!erts_new_bs_put_binary(c_p, Src, _size)) {
+            if (!erts_bs_put_binary(EBS, c_p, Src, _size)) {
                 Eterm reason = is_bitstring(Src) ? am_short : am_type;
                 $BS_FAIL_INFO($Fail, BADARG, reason, Src);
             }
             break;
         case BSC_BINARY_FIXED_SIZE:
             $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_new_bs_put_binary(c_p, Src, Size)) {
+            if (!erts_bs_put_binary(EBS, c_p, Src, Size)) {
                 Eterm reason = is_bitstring(Src) ? am_short : am_type;
                 $BS_FAIL_INFO($Fail, BADARG, reason, Src);
             }
@@ -675,7 +672,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SIZE(p, Size);
             $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-            Src = erts_new_bs_put_float(c_p, Src, _size, flags);
+            Src = erts_bs_put_float(EBS, c_p, Src, _size, flags);
             if (is_value(Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, c_p->fvalue, Src);
             }
@@ -683,7 +680,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_FLOAT_FIXED_SIZE:
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_FIXED_SIZE(p, Size);
-            Src = erts_new_bs_put_float(c_p, Src, Size, flags);
+            Src = erts_bs_put_float(EBS, c_p, Src, Size, flags);
             if (is_value(Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, c_p->fvalue, Src);
             }
@@ -696,7 +693,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
                 $BS_LOAD_FLAGS(p, flags);
                 $BS_LOAD_SIZE(p, Size);
                 $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-                if (!erts_new_bs_put_integer(ERL_BITS_ARGS_3(Src, _size, flags))) {
+                if (!erts_bs_put_integer(EBS, Src, _size, flags)) {
                     $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
                 }
             }
@@ -705,19 +702,19 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_UTF32:
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_new_bs_put_integer(ERL_BITS_ARGS_3(Src, Size, flags))) {
+            if (!erts_bs_put_integer(EBS, Src, Size, flags)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
             }
             break;
         case BSC_UTF8:
-            if (!erts_bs_put_utf8(ERL_BITS_ARGS_1(Src))) {
+            if (!erts_bs_put_utf8(EBS, Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
             }
             break;
         case BSC_UTF16:
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SRC(p, Src);
-            if (!erts_bs_put_utf16(ERL_BITS_ARGS_2(Src, flags))) {
+            if (!erts_bs_put_utf16(EBS, Src, flags)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
             }
             break;
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 75413225b9cf..0d7e6b1f5cf3 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -732,9 +732,10 @@ fmt_big(byte *buf, Uint num_bytes, Eterm val, Uint num_bits, Uint flags)
 }
 
 int
-erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flags))
+erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
 {
-    Uint bin_offset = erts_bin_offset;
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
     Uint b;
     byte *iptr;
@@ -749,7 +750,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
 	    /*
 	     * All bits are in the same byte.
 	     */
-	    iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
+	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
 	    b = *iptr & (0xff << rbits);
 	    b |= (signed_val(arg) & ((1 << num_bits)-1)) << (rbits-num_bits);
 	    *iptr = b;
@@ -757,7 +758,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
 	    /*
 	     * More than one bit, starting at a byte boundary.
 	     */
-            iptr = erts_current_bin + BYTE_OFFSET(bin_offset);
+            iptr = dst_bin + BYTE_OFFSET(bin_offset);
             fmt_small(iptr, NBYTES(num_bits), arg, num_bits, flags);
 	} else if (flags & BSF_LITTLE) {
             /*
@@ -771,7 +772,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
             Uint count = (num_bits - rbits) / 8;
             Uint bits, bits1;
 
-            iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
+            iptr = dst_bin + BYTE_OFFSET(bin_offset);
 
             if (BIT_OFFSET(num_bits) == 0) {
                 bits = val;
@@ -830,7 +831,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
 	     */
 	    Uint shift_count = num_bits - rbits;
 	    Sint val = signed_val(arg);
-	    iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
+	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
 	    b = *iptr & (0xff << rbits);
 
 	    /*
@@ -855,7 +856,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
 	 * Big number, aligned on a byte boundary. We can format the
 	 * integer directly into the binary.
 	 */
-	fmt_big(erts_current_bin+BYTE_OFFSET(bin_offset),
+	fmt_big(dst_bin + BYTE_OFFSET(bin_offset),
                 NBYTES(num_bits), arg, num_bits, flags);
     } else if (is_big(arg) && bit_offset + num_bits <= 8) {
         /*
@@ -866,7 +867,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
         ErtsDigit* dp = big_v(arg);
         Uint val = sign ? -*dp : *dp;
 
-        iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
+        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr & (0xff << rbits);
         b |= (val & ((1 << num_bits)-1)) << (rbits-num_bits);
         *iptr = b;
@@ -888,7 +889,7 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
          * Format the integer byte-aligned using the binary itself as
          * a temporary buffer.
          */
-        iptr = erts_current_bin + BYTE_OFFSET(bin_offset);
+        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr;
         fmt_big(iptr, NBYTES(num_bits), arg, num_bits, flags);
 
@@ -919,15 +920,15 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
         /* Not an integer. */
         return 0;
     }
-    erts_bin_offset = bin_offset + num_bits;
+    EBS->erts_bin_offset = bin_offset + num_bits;
     return 1;
 }
 
 #if !defined(BEAMASM)
 int
-erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
+erts_bs_put_utf8(ErlBitsState *EBS, Eterm arg)
 {
-    Uint bin_offset = erts_bin_offset;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
     Uint num_bits;
     byte tmp_buf[4];
@@ -944,7 +945,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
 
     if ((bit_offset = BIT_OFFSET(bin_offset)) == 0) {
 	/* We can write directly into the destination binary. */
-	dst = erts_current_bin+BYTE_OFFSET(bin_offset);
+        dst = EBS->erts_current_bin + BYTE_OFFSET(bin_offset);
     } else {
 	/* Unaligned destination binary. Must use a temporary buffer. */
 	dst = tmp_buf;
@@ -975,19 +976,19 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
     }
 
     if (bin_offset != 0) {
-	erts_copy_bits(dst, 0, 1, erts_current_bin, bin_offset, 1, num_bits);
+        erts_copy_bits(dst, 0, 1, EBS->erts_current_bin, bin_offset, 1, num_bits);
     }
 
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
 
     return 1;
 }
 #endif
 
 int
-erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
+erts_bs_put_utf16(ErlBitsState *EBS, Eterm arg, Uint flags)
 {
-    Uint bin_offset = erts_bin_offset;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
     Uint num_bits;
     byte tmp_buf[4];
@@ -1004,7 +1005,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
 
     if ((bit_offset = BIT_OFFSET(bin_offset)) == 0) {
 	/* We can write directly into the destination binary. */
-	dst = erts_current_bin+BYTE_OFFSET(bin_offset);
+	dst = EBS->erts_current_bin + BYTE_OFFSET(bin_offset);
     } else {
 	/* Unaligned destination binary. Must use a temporary buffer. */
 	dst = tmp_buf;
@@ -1040,17 +1041,16 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
     }
 
     if (bin_offset != 0) {
-	erts_copy_bits(dst, 0, 1, erts_current_bin, bin_offset, 1, num_bits);
+	erts_copy_bits(dst, 0, 1, EBS->erts_current_bin, bin_offset, 1, num_bits);
     }
 
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
     return 1;
 }
 
 int
-erts_new_bs_put_binary(Process *c_p, Eterm arg, Uint num_bits)
+erts_bs_put_binary(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
     Uint offset, size;
     byte *base;
 
@@ -1066,18 +1066,17 @@ erts_new_bs_put_binary(Process *c_p, Eterm arg, Uint num_bits)
         return 0;
     }
 
-    copy_binary_to_buffer(erts_current_bin, erts_bin_offset,
+    copy_binary_to_buffer(EBS->erts_current_bin, EBS->erts_bin_offset,
                           base, offset, num_bits);
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
 
     BUMP_REDS(c_p, num_bits / BITS_PER_REDUCTION);
     return 1;
 }
 
 int
-erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
+erts_bs_put_binary_all(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint unit)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
     Uint offset, size;
     byte *base;
 
@@ -1091,9 +1090,9 @@ erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
         return 0;
     }
 
-    copy_binary_to_buffer(erts_current_bin, erts_bin_offset,
+    copy_binary_to_buffer(EBS->erts_current_bin, EBS->erts_bin_offset,
                           base, offset, size);
-    erts_bin_offset += size;
+    EBS->erts_bin_offset += size;
 
     BUMP_REDS(c_p, size / BITS_PER_REDUCTION);
     return 1;
@@ -1106,11 +1105,9 @@ erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
  * and sets c_p-fvalue to 'type', 'no_float', or 'invalid'.
  */
 Eterm
-erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
+erts_bs_put_float(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits, int flags)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
-
-    if (BIT_OFFSET(erts_bin_offset) == 0) {
+    if (BIT_OFFSET(EBS->erts_bin_offset) == 0) {
 	Uint32 a;
 	Uint32 b;
 	
@@ -1222,7 +1219,7 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	}
 
 	if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	    byte* t = erts_current_bin+BYTE_OFFSET(erts_bin_offset);
+	    byte* t = EBS->erts_current_bin + BYTE_OFFSET(EBS->erts_bin_offset);
 #ifdef WORDS_BIGENDIAN
 	    if (num_bits == 16) {
 		t[0] = a >> 8;
@@ -1255,7 +1252,9 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	    }
 #endif
 	} else {
-	    byte* t = erts_current_bin+BYTE_OFFSET(erts_bin_offset) + NBYTES(num_bits);
+	    byte* t = EBS->erts_current_bin +
+                BYTE_OFFSET(EBS->erts_bin_offset) +
+                NBYTES(num_bits);
 #ifdef WORDS_BIGENDIAN
 	    if (num_bits == 16) {
 		t[-1] = a >> 8;
@@ -1385,27 +1384,32 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	}
 	if (BIT_IS_MACHINE_ENDIAN(flags)) {
 	    erts_copy_bits(bptr, 0, 1,
-		      erts_current_bin,
-		      erts_bin_offset, 1, num_bits);
+                           EBS->erts_current_bin,
+                           EBS->erts_bin_offset, 1, num_bits);
 	} else {
 	    erts_copy_bits(bptr+NBYTES(num_bits)-1, 0, -1,
-			   erts_current_bin, erts_bin_offset, 1,
+			   EBS->erts_current_bin, EBS->erts_bin_offset, 1,
 			   num_bits);
 	}
     }
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
     return THE_NON_VALUE;
 }
 
-void 
-erts_new_bs_put_string(ERL_BITS_PROTO_2(byte* iptr, Uint num_bytes))
+void
+erts_bs_put_string(ErlBitsState* EBS, byte* iptr, Uint num_bytes)
 {
-    if (BIT_OFFSET(erts_bin_offset) != 0) {
-	erts_copy_bits(iptr, 0, 1, erts_current_bin, erts_bin_offset, 1, num_bytes*8);
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint dst_offset = EBS->erts_bin_offset;
+
+    EBS->erts_bin_offset = dst_offset + num_bytes * 8;
+    if (BIT_OFFSET(dst_offset) != 0) {
+        erts_copy_bits(iptr, 0, 1,
+                       dst_bin, dst_offset, 1,
+                       num_bytes*8);
     } else {
-	sys_memcpy(erts_current_bin+BYTE_OFFSET(erts_bin_offset), iptr, num_bytes);
+        sys_memcpy(dst_bin + BYTE_OFFSET(dst_offset), iptr, num_bytes);
     }
-    erts_bin_offset += num_bytes*8;
 }
 
 static ERTS_INLINE
@@ -1488,8 +1492,9 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
     BinRef* br;
     Binary* binp;
     Uint heap_need;
+    Uint position;
     Uint used_size_in_bits;
-    ERL_BITS_DEFINE_STATEP(c_p);
+    ErlBitsState* EBS = ERL_BITS_EBS_FROM_REG(reg);
 
     /*
      * Check the binary argument.
@@ -1528,10 +1533,10 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
      * OK, the binary is writable.
      */
     ASSERT(sb->start == 0);
-    erts_bin_offset = sb->end;
+    EBS->erts_bin_offset = position = sb->end;
     if (unit > 1) {
-	if ((unit == 8 && (erts_bin_offset & 7) != 0) ||
-	    (unit != 8 && (erts_bin_offset % unit) != 0)) {
+	if ((unit == 8 && (position & 7) != 0) ||
+	    (unit != 8 && (position % unit) != 0)) {
             c_p->fvalue = am_unit;
 	    goto badarg;
 	}
@@ -1548,13 +1553,13 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
 	return bin;
     }
 
-    if((ERTS_UINT_MAX - build_size_in_bits) < erts_bin_offset) {
+    if ((ERTS_UINT_MAX - build_size_in_bits) < position) {
         c_p->fvalue = am_size;
         c_p->freason = SYSTEM_LIMIT;
         return THE_NON_VALUE;
     }
 
-    used_size_in_bits = erts_bin_offset + build_size_in_bits;
+    used_size_in_bits = position + build_size_in_bits;
 
     /* Make sure that no one else can append to the incoming bitstring. */
     erl_sub_bits_clear_writable(sb);
@@ -1569,11 +1574,11 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
         binp = erts_bin_realloc(binp, new_size);
         br->val = binp;
 
-        BUMP_REDS(c_p, erts_bin_offset / BITS_PER_REDUCTION);
+        BUMP_REDS(c_p, position / BITS_PER_REDUCTION);
     }
 
     binp->intern.apparent_size = NBYTES(used_size_in_bits);
-    erts_current_bin = (byte*)binp->orig_bytes;
+    EBS->erts_current_bin = (byte*)binp->orig_bytes;
 
     /* Allocate heap space and build a new sub binary. */
     reg[live] = sb->orig;
@@ -1592,7 +1597,7 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
     erl_sub_bits_init(sb,
                       ERL_SUB_BITS_FLAGS_WRITABLE,
                       reg[live],
-                      erts_current_bin,
+                      EBS->erts_current_bin,
                       0,
                       used_size_in_bits);
 
@@ -1656,10 +1661,10 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
                                  &br,
                                  &sb);
 
-        erts_current_bin = (byte*)(br->val)->orig_bytes;
-        erts_bin_offset = src_size;
+        EBS->erts_current_bin = (byte*)(br->val)->orig_bytes;
+        EBS->erts_bin_offset = src_size;
 
-        copy_binary_to_buffer(erts_current_bin,
+        copy_binary_to_buffer(EBS->erts_current_bin,
                               0,
                               src_bytes,
                               src_offset,
@@ -1671,15 +1676,14 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
 }
 
 Eterm
-erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, Uint unit)
+erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
+                               Eterm bin, Uint build_size_in_bits)
 {
-    Uint new_position, new_size, used_size;
+    Uint old_position, new_position, used_size;
     Binary *refc_binary;
     ErlSubBits *sb;
     BinRef *br;
 
-    ERL_BITS_DEFINE_STATEP(p);
-
     sb = (ErlSubBits*)bitstring_val(bin);
     ASSERT(sb->thing_word == HEADER_SUB_BITS);
 
@@ -1688,21 +1692,24 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
 
     /* Calculate new size in bits. */
     ASSERT(sb->start == 0);
-    erts_bin_offset = sb->end;
+    EBS->erts_bin_offset = old_position = sb->end;
 
-    if((ERTS_UINT_MAX - build_size_in_bits) < erts_bin_offset) {
+#ifdef BEAMASM
+    ASSERT(ERTS_UINT_MAX - build_size_in_bits >= old_position);
+#else
+    if (ERTS_UINT_MAX - build_size_in_bits < old_position) {
         p->fvalue = am_size;
         p->freason = SYSTEM_LIMIT;
         return THE_NON_VALUE;
     }
+#endif
 
     refc_binary = br->val;
 
-    new_position = erts_bin_offset + build_size_in_bits;
-    update_wb_overhead(p, br, sb->end, new_position);
-
+    new_position = old_position + build_size_in_bits;
     used_size = NBYTES(new_position);
-    new_size = GROW_PROC_BIN_SIZE(used_size);
+
+    update_wb_overhead(p, br, old_position, new_position);
 
     if (refc_binary->intern.flags & BIN_FLAG_WRITABLE) {
         /* This is the normal case - the binary is writable. There are no other
@@ -1711,10 +1718,11 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
         ASSERT(erl_sub_bits_is_writable(sb));
         ASSERT(erts_refc_read(&refc_binary->intern.refc, 1) == 1);
         if (refc_binary->orig_size < used_size) {
+            Uint new_size = GROW_PROC_BIN_SIZE(used_size);
             refc_binary = erts_bin_realloc(refc_binary, new_size);
             br->val = refc_binary;
 
-            BUMP_REDS(p, erts_bin_offset / BITS_PER_REDUCTION);
+            BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
         }
 
         ASSERT(sb->start == 0);
@@ -1730,10 +1738,11 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
          * binary and make a copy of the data.
          *
          * We'll also make a new BinRef as the old one may have been moved from
-         * the `wrt_bins` list to the regular `off_heap` list by the GC. To
+         * the `wrt_bins` list to the regular `off_heap` list by the GC.
          * To move it back would mean traversing the `off_heap` list from the
          * start, so we'll create a new BinRef instead for this (hopefully)
          * rare case. */
+        Uint new_size = GROW_PROC_BIN_SIZE(used_size);
         Binary *new_binary = erts_bin_nrml_alloc(new_size);
         Eterm *hp = HeapFragOnlyAlloc(p, ERL_REFC_BITS_SIZE);
 
@@ -1749,12 +1758,12 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
                    refc_binary->orig_bytes,
                    MIN(refc_binary->orig_size, new_size));
 
-        BUMP_REDS(p, erts_bin_offset / BITS_PER_REDUCTION);
+        BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
         refc_binary = new_binary;
     }
 
     ASSERT(refc_binary->intern.flags & BIN_FLAG_WRITABLE);
-    erts_current_bin = (byte*)&refc_binary->orig_bytes[0];
+    EBS->erts_current_bin = (byte*)&refc_binary->orig_bytes[0];
 
     return make_bitstring(sb);
 }
diff --git a/erts/emulator/beam/erl_bits.h b/erts/emulator/beam/erl_bits.h
index 619f6157e4ec..0700fb01f053 100644
--- a/erts/emulator/beam/erl_bits.h
+++ b/erts/emulator/beam/erl_bits.h
@@ -168,43 +168,28 @@ struct erl_bits_state {
     /*
      * Pointer to the beginning of the current binary.
      */
-    byte* erts_current_bin_;
+    byte* erts_current_bin;
 
     /*
      * Offset in bits into the current binary.
      */
-    Uint erts_bin_offset_;
+    Uint erts_bin_offset;
 };
 
+typedef struct erl_bits_state ErlBitsState;
+
 /*
- * Reentrant API with the state passed as a parameter.
- * (Except when the current Process* already is a parameter.)
+ * The bit syntax construction state resides in the current process's
+ * schduler data. The following macro retrieves the pointer to that
+ * state given a pointer to the X register array.
  */
-/* the state resides in the current process' scheduler data */
-#define ERL_BITS_DECLARE_STATEP struct erl_bits_state *EBS
-
-#define ERL_BITS_RELOAD_STATEP(P)                                              \
-    do {                                                                       \
-        EBS = &erts_proc_sched_data((P))->registers->aux_regs.d.erl_bits_state;  \
-    } while(0)
-
-#define ERL_BITS_DEFINE_STATEP(P) \
-    struct erl_bits_state *EBS = \
-        &erts_proc_sched_data((P))->registers->aux_regs.d.erl_bits_state
-
-#define ErlBitsState				(*EBS)
-
-#define ERL_BITS_PROTO_0			struct erl_bits_state *EBS
-#define ERL_BITS_PROTO_1(PARM1)			struct erl_bits_state *EBS, PARM1
-#define ERL_BITS_PROTO_2(PARM1,PARM2)		struct erl_bits_state *EBS, PARM1, PARM2
-#define ERL_BITS_PROTO_3(PARM1,PARM2,PARM3)	struct erl_bits_state *EBS, PARM1, PARM2, PARM3
-#define ERL_BITS_ARGS_0				EBS
-#define ERL_BITS_ARGS_1(ARG1)			EBS, ARG1
-#define ERL_BITS_ARGS_2(ARG1,ARG2)		EBS, ARG1, ARG2
-#define ERL_BITS_ARGS_3(ARG1,ARG2,ARG3)		EBS, ARG1, ARG2, ARG3
 
-#define erts_bin_offset		(ErlBitsState.erts_bin_offset_)
-#define erts_current_bin	(ErlBitsState.erts_current_bin_)
+#define ERL_BITS_EBS_FROM_REG(Reg)                              \
+    ((ErlBitsState *) ((char *)(Reg) +                          \
+                       (offsetof(ErtsSchedulerRegisters,        \
+                                 aux_regs.d.erl_bits_state) -   \
+                        offsetof(ErtsSchedulerRegisters,        \
+                                 x_reg_array.d))))
 
 /*
  * Return number of Eterm words needed for allocation with HAlloc(),
@@ -231,22 +216,25 @@ Eterm erts_bs_get_binary_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits
 Eterm erts_bs_get_binary_all_2(Process *p, ErlSubBits* sb);
 
 /* Binary construction, new instruction set. */
-int erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm Integer, Uint num_bits, unsigned flags));
+int erts_bs_put_integer(ErlBitsState *EBS, Eterm Integer, Uint num_bits,
+                        unsigned flags);
 #if !defined(BEAMASM)
-int erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm Integer));
+int erts_bs_put_utf8(ErlBitsState *EBS, Eterm Integer);
 #endif
-int erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm Integer, Uint flags));
-int erts_new_bs_put_binary(Process *c_p, Eterm Bin, Uint num_bits);
-int erts_new_bs_put_binary_all(Process *c_p, Eterm Bin, Uint unit);
-Eterm erts_new_bs_put_float(Process *c_p, Eterm Float, Uint num_bits, int flags);
-void erts_new_bs_put_string(ERL_BITS_PROTO_2(byte* iptr, Uint num_bytes));
+int erts_bs_put_utf16(ErlBitsState *EBS, Eterm Integer, Uint flags);
+int erts_bs_put_binary(ErlBitsState *EBS, Process *c_p, Eterm Bin, Uint num_bits);
+int erts_bs_put_binary_all(ErlBitsState* EBS, Process *c_p, Eterm Bin, Uint unit);
+Eterm erts_bs_put_float(ErlBitsState *EBS, Process *c_p, Eterm Float,
+                        Uint num_bits, int flags);
+void erts_bs_put_string(ErlBitsState *EBS, byte* iptr, Uint num_bytes);
 
 Uint32 erts_bs_get_unaligned_uint32(ErlSubBits* sb);
 Eterm erts_bs_get_utf8(ErlSubBits* sb);
 Eterm erts_bs_get_utf16(ErlSubBits* sb, Uint flags);
 Eterm erts_bs_append_checked(Process* p, Eterm* reg, Uint live, Uint size,
                              Uint extra_words, Uint unit);
-Eterm erts_bs_private_append_checked(Process* p, Eterm bin, Uint size, Uint unit);
+Eterm erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
+                                     Eterm bin, Uint size);
 Eterm erts_bs_init_writable(Process* p, Eterm sz);
 
 /* ************************************************************************* */
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp
index f6a84591c998..e0080466c47f 100644
--- a/erts/emulator/beam/jit/arm/instr_bs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_bs.cpp
@@ -1199,7 +1199,7 @@ void BeamModuleAssembler::update_bin_state(a64::Gp bin_offset,
                                            Sint size,
                                            a64::Gp size_reg) {
     int cur_bin_offset = offsetof(ErtsSchedulerRegisters,
-                                  aux_regs.d.erl_bits_state.erts_current_bin_);
+                                  aux_regs.d.erl_bits_state.erts_current_bin);
     arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
     arm::Mem mem_bin_offset =
             arm::Mem(scheduler_registers, cur_bin_offset + sizeof(Eterm));
@@ -1207,8 +1207,8 @@ void BeamModuleAssembler::update_bin_state(a64::Gp bin_offset,
     if (bit_offset % 8 != 0) {
         /* The bit offset is unknown or not byte-aligned. */
         ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
-                                  erts_current_bin_,
-                                  erts_bin_offset_);
+                                  erts_current_bin,
+                                  erts_bin_offset);
         a.ldp(TMP2, bin_offset, mem_bin_base);
 
         if (size_reg.isValid()) {
@@ -2021,14 +2021,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         BscSegment seg = segments[0];
         comment("private append to binary");
         ASSERT(Alloc.get() == 0);
-        mov_arg(ARG2, seg.src);
+        load_erl_bits_state(ARG1);
+        a.mov(ARG2, c_p);
+        mov_arg(ARG3, seg.src);
         if (sizeReg.isValid()) {
-            a.mov(ARG3, sizeReg);
+            a.mov(ARG4, sizeReg);
         } else {
-            mov_imm(ARG3, num_bits);
+            mov_imm(ARG4, num_bits);
         }
-        a.mov(ARG4, seg.unit);
-        a.mov(ARG1, c_p);
         emit_enter_runtime(Live.get());
         runtime_call<4>(erts_bs_private_append_checked);
         emit_leave_runtime(Live.get());
@@ -2036,7 +2036,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
     } else if (estimated_num_bits <= ERL_ONHEAP_BITS_LIMIT) {
         static constexpr auto cur_bin_offset =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_current_bin_);
+                offsetof(struct erl_bits_state, erts_current_bin);
         Uint need;
 
         arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
@@ -2104,8 +2104,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
 
             /* Initialize the erl_bin_state struct. */
             ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
-                                      erts_current_bin_,
-                                      erts_bin_offset_);
+                                      erts_current_bin,
+                                      erts_bin_offset);
             a.stp(HTOP, ZERO, mem_bin_base);
 
             /* Update HTOP. */
@@ -2158,11 +2158,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             comment("construct a binary segment");
             if (seg.effectiveSize >= 0) {
                 /* The segment has a literal size. */
-                mov_imm(ARG3, seg.effectiveSize);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
+                mov_imm(ARG4, seg.effectiveSize);
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary);
+                runtime_call<4>(erts_bs_put_binary);
                 emit_leave_runtime<Update::eReductions>(Live.get());
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
@@ -2172,12 +2173,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                        seg.size.as<ArgAtom>().get() == am_all) {
                 /* Include the entire binary/bitstring in the
                  * resulting binary. */
-                a.mov(ARG3, seg.unit);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
+                mov_imm(ARG4, seg.unit);
 
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary_all);
+                runtime_call<4>(erts_bs_put_binary_all);
                 emit_leave_runtime<Update::eReductions>(Live.get());
 
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2195,17 +2197,18 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                  * the value is a non-negative small in the
                  * appropriate range. Multiply the size with the
                  * unit. */
-                auto r = load_source(seg.size, ARG3);
-                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG4);
+                a.asr(ARG4, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
-                    a.mul(ARG3, ARG3, TMP1);
+                    a.mul(ARG4, ARG4, TMP1);
                 }
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
 
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary);
+                runtime_call<4>(erts_bs_put_binary);
                 emit_leave_runtime<Update::eReductions>(Live.get());
 
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2225,21 +2228,22 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         case am_float:
             comment("construct float segment");
             if (seg.effectiveSize >= 0) {
-                mov_imm(ARG3, seg.effectiveSize);
+                mov_imm(ARG4, seg.effectiveSize);
             } else {
-                auto r = load_source(seg.size, ARG3);
-                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG4);
+                a.asr(ARG4, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
-                    a.mul(ARG3, ARG3, TMP1);
+                    a.mul(ARG4, ARG4, TMP1);
                 }
             }
-            mov_arg(ARG2, seg.src);
-            mov_imm(ARG4, seg.flags);
-            a.mov(ARG1, c_p);
+            load_erl_bits_state(ARG1);
+            a.mov(ARG2, c_p);
+            mov_arg(ARG3, seg.src);
+            mov_imm(ARG5, seg.flags);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_new_bs_put_float);
+            runtime_call<5>(erts_bs_put_float);
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
@@ -2490,12 +2494,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                 } else {
                     /* Call the helper function to fetch and store the
                      * integer into the binary. */
+                    load_erl_bits_state(ARG1);
                     mov_arg(ARG2, seg.src);
                     mov_imm(ARG4, seg.flags);
-                    load_erl_bits_state(ARG1);
 
                     emit_enter_runtime(Live.get());
-                    runtime_call<4>(erts_new_bs_put_integer);
+                    runtime_call<4>(erts_bs_put_integer);
                     emit_leave_runtime(Live.get());
 
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
@@ -2522,12 +2526,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
 
             comment("insert string");
             ASSERT(seg.effectiveSize >= 0);
-            mov_imm(ARG3, seg.effectiveSize / 8);
-            mov_arg(ARG2, string_ptr);
             load_erl_bits_state(ARG1);
+            mov_arg(ARG2, string_ptr);
+            mov_imm(ARG3, seg.effectiveSize / 8);
 
             emit_enter_runtime(Live.get());
-            runtime_call<3>(erts_new_bs_put_string);
+            runtime_call<3>(erts_bs_put_string);
             emit_leave_runtime(Live.get());
             break;
         }
@@ -2556,13 +2560,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             a.cbz(ARG1, resolve_label(error, disp1MB));
             break;
         case am_utf32:
+            load_erl_bits_state(ARG1);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
             a.mov(ARG4, seg.flags);
-            load_erl_bits_state(ARG1);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_new_bs_put_integer);
+            runtime_call<4>(erts_bs_put_integer);
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
diff --git a/erts/emulator/beam/jit/beam_jit_common.cpp b/erts/emulator/beam/jit/beam_jit_common.cpp
index fde678606ffa..3b348db4b9ad 100644
--- a/erts/emulator/beam/jit/beam_jit_common.cpp
+++ b/erts/emulator/beam/jit/beam_jit_common.cpp
@@ -808,7 +808,7 @@ void beam_jit_bs_add_argument_error(Process *c_p, Eterm A, Eterm B) {
 
 Eterm beam_jit_bs_init_bits(Process *c_p,
                             Eterm *reg,
-                            ERL_BITS_DECLARE_STATEP,
+                            ErlBitsState *EBS,
                             Uint num_bits,
                             Uint alloc,
                             unsigned Live) {
@@ -818,7 +818,7 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         alloc += ERL_REFC_BITS_SIZE;
     }
 
-    erts_bin_offset = 0;
+    EBS->erts_bin_offset = 0;
 
     if (num_bits <= ERL_ONHEAP_BITS_LIMIT) {
         ErlHeapBits *hb;
@@ -830,7 +830,7 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         hb->thing_word = header_heap_bits(num_bits);
         hb->size = num_bits;
 
-        erts_current_bin = (byte *)hb->data;
+        EBS->erts_current_bin = (byte *)hb->data;
         return make_bitstring(hb);
     } else {
         const Uint num_bytes = NBYTES(num_bits);
@@ -839,13 +839,13 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         test_bin_vheap(c_p, reg, num_bytes / sizeof(Eterm), alloc, Live);
 
         new_binary = erts_bin_nrml_alloc(num_bytes);
-        erts_current_bin = (byte *)new_binary->orig_bytes;
+        EBS->erts_current_bin = (byte *)new_binary->orig_bytes;
 
         return erts_wrap_refc_bitstring(&MSO(c_p).first,
                                         &MSO(c_p).overhead,
                                         &HEAP_TOP(c_p),
                                         new_binary,
-                                        erts_current_bin,
+                                        EBS->erts_current_bin,
                                         0,
                                         num_bits);
     }
diff --git a/erts/emulator/beam/jit/beam_jit_common.hpp b/erts/emulator/beam/jit/beam_jit_common.hpp
index 934821d8f532..3d34674153e0 100644
--- a/erts/emulator/beam/jit/beam_jit_common.hpp
+++ b/erts/emulator/beam/jit/beam_jit_common.hpp
@@ -600,7 +600,7 @@ void beam_jit_bs_field_size_argument_error(Process *c_p, Eterm size);
 void beam_jit_bs_add_argument_error(Process *c_p, Eterm A, Eterm B);
 Eterm beam_jit_bs_init_bits(Process *c_p,
                             Eterm *reg,
-                            ERL_BITS_DECLARE_STATEP,
+                            ErlBitsState *EBS,
                             Uint num_bits,
                             Uint alloc,
                             unsigned Live);
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp
index d3c9e1c050d6..b43351b871ba 100644
--- a/erts/emulator/beam/jit/x86/instr_bs.cpp
+++ b/erts/emulator/beam/jit/x86/instr_bs.cpp
@@ -1262,10 +1262,10 @@ void BeamModuleAssembler::update_bin_state(x86::Gp bin_offset,
     const int x_reg_offset = offsetof(ErtsSchedulerRegisters, x_reg_array.d);
     const int cur_bin_base =
             offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-            offsetof(struct erl_bits_state, erts_current_bin_);
+            offsetof(struct erl_bits_state, erts_current_bin);
     const int cur_bin_offset =
             offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-            offsetof(struct erl_bits_state, erts_bin_offset_);
+            offsetof(struct erl_bits_state, erts_bin_offset);
 
     x86::Mem mem_bin_base =
             x86::Mem(registers, cur_bin_base - x_reg_offset, sizeof(UWord));
@@ -1822,10 +1822,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                 offsetof(ErtsSchedulerRegisters, x_reg_array.d);
         const int cur_bin_base =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_current_bin_);
+                offsetof(struct erl_bits_state, erts_current_bin);
         const int cur_bin_offset =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_bin_offset_);
+                offsetof(struct erl_bits_state, erts_bin_offset);
         x86::Mem mem_bin_base =
                 x86::qword_ptr(registers, cur_bin_base - x_reg_offset);
         x86::Mem mem_bin_offset =
@@ -2195,14 +2195,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         runtime_entered = bs_maybe_enter_runtime(runtime_entered);
         comment("private append to binary");
         ASSERT(Alloc.get() == 0);
-        mov_arg(ARG2, seg.src);
+        mov_arg(ARG3, seg.src);
         if (sizeReg.isValid()) {
-            a.mov(ARG3, sizeReg);
+            a.mov(ARG4, sizeReg);
         } else {
-            mov_imm(ARG3, num_bits);
+            mov_imm(ARG4, num_bits);
         }
-        a.mov(ARG4, seg.unit);
-        a.mov(ARG1, c_p);
+        a.mov(ARG2, c_p);
+        load_erl_bits_state(ARG1);
         runtime_call<4>(erts_bs_private_append_checked);
         /* There is no way the call can fail on a 64-bit architecture. */
         a.mov(TMP_MEM1q, RET);
@@ -2251,10 +2251,11 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             comment("construct a binary segment");
             if (seg.effectiveSize >= 0) {
                 /* The segment has a literal size. */
-                mov_imm(ARG3, seg.effectiveSize);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary);
+                mov_imm(ARG4, seg.effectiveSize);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_DEPENDS,
@@ -2263,10 +2264,11 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                        seg.size.as<ArgAtom>().get() == am_all) {
                 /* Include the entire binary/bitstring in the
                  * resulting binary. */
-                a.mov(ARG3, seg.unit);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary_all);
+                mov_imm(ARG4, seg.unit);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary_all);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_UNIT,
@@ -2282,16 +2284,17 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                  * the value is a non-negative small in the
                  * appropriate range. Multiply the size with the
                  * unit. */
-                mov_arg(ARG3, seg.size);
-                a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+                mov_arg(ARG4, seg.size);
+                a.sar(ARG4, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(RET, seg.unit);
-                    a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
-                    a.mov(ARG3, RET);
+                    a.mul(ARG4); /* CLOBBERS RDX = ARG3! */
+                    a.mov(ARG4, RET);
                 }
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_DEPENDS,
@@ -2311,20 +2314,21 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
             comment("construct float segment");
             if (seg.effectiveSize >= 0) {
-                mov_imm(ARG3, seg.effectiveSize);
+                mov_imm(ARG4, seg.effectiveSize);
             } else {
-                mov_arg(ARG3, seg.size);
-                a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+                mov_arg(ARG4, seg.size);
+                a.sar(ARG4, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(RET, seg.unit);
-                    a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
-                    a.mov(ARG3, RET);
+                    a.mul(ARG4); /* CLOBBERS RDX = ARG3! */
+                    a.mov(ARG4, RET);
                 }
             }
-            mov_arg(ARG2, seg.src);
-            mov_imm(ARG4, seg.flags);
-            a.mov(ARG1, c_p);
-            runtime_call<4>(erts_new_bs_put_float);
+            mov_arg(ARG3, seg.src);
+            mov_imm(ARG5, seg.flags);
+            a.mov(ARG2, c_p);
+            load_erl_bits_state(ARG1);
+            runtime_call<5>(erts_bs_put_float);
             if (Fail.get() == 0) {
                 mov_imm(ARG4,
                         beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2597,7 +2601,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                     mov_arg(ARG2, seg.src);
                     mov_imm(ARG4, seg.flags);
                     load_erl_bits_state(ARG1);
-                    runtime_call<4>(erts_new_bs_put_integer);
+                    runtime_call<4>(erts_bs_put_integer);
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
                         comment("skipped test for success because construction "
                                 "can't fail");
@@ -2628,7 +2632,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             mov_imm(ARG3, seg.effectiveSize / 8);
             mov_arg(ARG2, string_ptr);
             load_erl_bits_state(ARG1);
-            runtime_call<3>(erts_new_bs_put_string);
+            runtime_call<3>(erts_bs_put_string);
         } break;
         case am_utf8: {
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
@@ -2658,7 +2662,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             mov_imm(ARG3, 4 * 8);
             a.mov(ARG4, seg.flags);
             load_erl_bits_state(ARG1);
-            runtime_call<4>(erts_new_bs_put_integer);
+            runtime_call<4>(erts_bs_put_integer);
             if (Fail.get() == 0) {
                 mov_arg(ARG1, seg.src);
                 mov_imm(ARG4,

From ff2ecfd258fac9d3e04377a6c4703806deb0a0db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 11 May 2024 08:26:18 +0200
Subject: [PATCH 3/4] Optimize construction of dynamically sized integer
 segments

The main optimization is breaking apart the `erts_bs_put_integer()`
function into the two functions `erts_bs_put_integer_be()` and
`erts_bs_put_integer_le()` to avoid runtime checks of the endianness.
---
 erts/emulator/beam/emu/bs_instrs.tab    |  25 +-
 erts/emulator/beam/erl_bits.c           | 694 ++++++++++++++----------
 erts/emulator/beam/erl_bits.h           |   4 +-
 erts/emulator/beam/jit/arm/instr_bs.cpp |  14 +-
 erts/emulator/beam/jit/x86/instr_bs.cpp |  14 +-
 5 files changed, 453 insertions(+), 298 deletions(-)

diff --git a/erts/emulator/beam/emu/bs_instrs.tab b/erts/emulator/beam/emu/bs_instrs.tab
index 0024bae6e34f..1c41b5b3ccb7 100644
--- a/erts/emulator/beam/emu/bs_instrs.tab
+++ b/erts/emulator/beam/emu/bs_instrs.tab
@@ -688,22 +688,37 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_INTEGER:
             {
                 Sint _size;
+                int result;
 
                 $BS_LOAD_UNIT(p, unit);
                 $BS_LOAD_FLAGS(p, flags);
                 $BS_LOAD_SIZE(p, Size);
                 $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-                if (!erts_bs_put_integer(EBS, Src, _size, flags)) {
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, _size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, _size);
+                }
+                if (!result) {
                     $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
                 }
             }
             break;
         case BSC_INTEGER_FIXED_SIZE:
         case BSC_UTF32:
-            $BS_LOAD_FLAGS(p, flags);
-            $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_bs_put_integer(EBS, Src, Size, flags)) {
-                $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+            {
+                int result;
+
+                $BS_LOAD_FLAGS(p, flags);
+                $BS_LOAD_FIXED_SIZE(p, Size);
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, Size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, Size);
+                }
+                if (!result) {
+                    $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+                }
             }
             break;
         case BSC_UTF8:
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 0d7e6b1f5cf3..b489c74c6f05 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -532,238 +532,438 @@ erts_bs_get_binary_all_2(Process *p, ErlSubBits *sb)
  ****************************************************************/
 
 
-/* COPY_VAL:
- * copy sz byte from val to dst buffer, 
- * dst, val are updated!!!
+/* FMT_COPY_VAL:
+ * Copy sz bytes from val to dst buffer;
+ * dst and val are updated.
  */
 
-#define COPY_VAL(dst,ddir,val,sz) do { \
-   Uint __sz = (sz); \
-   while(__sz) { \
-     switch(__sz) { \
-     default: \
-     case 4: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 3: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 2: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 1: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     } \
-   } \
+#define FMT_COPY_VAL(dst,ddir,val,sz) do {                      \
+   Uint __sz = (sz);                                            \
+   while (__sz) {                                               \
+     switch(__sz) {                                             \
+     default:                                                   \
+     case 8: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 7: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 6: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 5: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 4: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 3: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 2: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 1: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     }                                                          \
+   }                                                            \
  } while(0)
 
 static void
-fmt_small(byte *buf, Uint num_bytes, Eterm arg, Uint num_bits, Uint flags)
+fmt_small_be(byte *buf, Eterm arg, Uint num_bits)
 {
     Uint bit_offset;
     Sint val;
+    Uint num_bytes;
 
     ASSERT(is_small(arg));
     ASSERT(num_bits != 0);      /* Tested by caller */
 
+    num_bytes = NBYTES(num_bits);
     bit_offset = BIT_OFFSET(num_bits);
     val = signed_val(arg);
 
-    if (flags & BSF_LITTLE) { /* Little endian */
+    buf += num_bytes - 1;
+    if (bit_offset) {
+        *buf-- = val << (8-bit_offset);
         num_bytes--;
-        COPY_VAL(buf, 1, val, num_bytes);
-        *buf = bit_offset ? (val << (8-bit_offset)) : val;
-    } else {		/* Big endian */
-        buf += num_bytes - 1;
-        if (bit_offset) {
-            *buf-- = val << (8-bit_offset);
-            num_bytes--;
-            val >>= bit_offset;
-        }
-        COPY_VAL(buf, -1, val, num_bytes);
+        val >>= bit_offset;
     }
+    FMT_COPY_VAL(buf, -1, val, num_bytes);
 }
 
-/* calculate a - *cp (carry)  (store result in b), *cp is updated! */
-#define SUBc(a, cp, b) do { \
-   byte __x = (a); \
-   byte __y = (__x - (*(cp))); \
-   (*cp) = (__y > __x); \
-   *(b) = ~__y; \
- } while(0)
+static void
+fmt_small_le(byte *buf, Eterm arg, Uint num_bits)
+{
+    Uint bit_offset;
+    Sint val;
+    Uint num_bytes;
+
+    ASSERT(is_small(arg));
+    ASSERT(num_bits != 0);      /* Tested by caller */
+
+    num_bytes = NBYTES(num_bits);
+    bit_offset = BIT_OFFSET(num_bits);
+    val = signed_val(arg);
+    num_bytes--;
+    FMT_COPY_VAL(buf, 1, val, num_bytes);
+    *buf = bit_offset ? (val << (8-bit_offset)) : val;
+}
+
+#undef FMT_COPY_VAL
+
+/*
+ * Calculate a - c (carry), storing the result in b; set c
+ * to the new carry.
+ */
+#if __has_builtin(__builtin_subc) && !defined(DEBUG)
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       *(b) = __builtin_subc(0, (a), c, &c);    \
+   } while(0)
+#else
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       byte __x = (a);                          \
+       byte __y = (__x - !(c));                 \
+       c = !(__y > __x);                        \
+       *(b) = ~__y;                             \
+   } while(0)
+#endif
 
 static void
-fmt_big(byte *buf, Uint num_bytes, Eterm val, Uint num_bits, Uint flags)
+fmt_big_be(byte *buf, Eterm val, Uint num_bits)
 {
     unsigned long offs;
     int sign;
     Uint ds;
     ErtsDigit* dp;
     int n;
+    ErtsDigit acc = 0;
+    ErtsDigit d;
+    Uint num_bytes;
 
     ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
 
-    if (num_bits == 0) {
-        return;
-    }
-
+    num_bytes = NBYTES(num_bits);
     sign = big_sign(val);
     ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
     dp = big_v(val);
     n = MIN(num_bytes, ds);
 
     offs = BIT_OFFSET(num_bits);
-    if (flags & BSF_LITTLE) {
-        num_bytes -= n;         /* pad with this amount */
-        if (sign) {             /* negative */
-            int c = 1;
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for (i = 0; i < sizeof(ErtsDigit); i++) {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
+
+    buf += num_bytes - 1;       /* end of buffer */
+    num_bytes -= n;             /* pad with this amount */
+    offs = offs ? (8-offs) : 0; /* shift offset */
+
+    if (sign) {             /* negative bignum */
+        unsigned int c = 0;
+
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
             }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                } while (--n > 0);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= ((ErtsDigit)*dp << offs);
+            do {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc >>= 8;
+        }
+    } else {                /* positive bignum */
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            *buf-- = acc;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                *buf-- = acc;
+                acc >>= 8;
             }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(0, &c, buf);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= (*dp << offs);
+            do {
+                *buf-- = acc;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        while (num_bytes--) {
+            *buf-- = acc;
+            acc >>= 8;
+        }
+    }
+}
+
+static void
+fmt_big_le(byte *buf, Eterm val, Uint num_bits)
+{
+    unsigned long offs;
+    int sign;
+    Uint ds;
+    ErtsDigit* dp;
+    int n;
+    Uint num_bytes;
+
+    ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
+
+    num_bytes = NBYTES(num_bits);
+    sign = big_sign(val);
+    ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
+    dp = big_v(val);
+    n = MIN(num_bytes, ds);
+
+    offs = BIT_OFFSET(num_bits);
+    num_bytes -= n;         /* pad with this amount */
+    if (sign) {             /* negative */
+        unsigned int c = 0;
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for (i = 0; i < sizeof(ErtsDigit); i++) {
+                SUBc(d & 0xff, c, buf);
                 buf++;
+                d >>= 8;
             }
-        } else {                /* positive */
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for(i = 0; i < sizeof(ErtsDigit); i++) {
-                    *buf++ = d;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    *buf++ = d;
-                    d >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes) {
-                *buf++ = 0;
-                num_bytes--;
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                SUBc(d & 0xff, c, buf);
+                buf++;
+                d >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(0, c, buf);
+            buf++;
+        }
+    } else {                /* positive */
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for(i = 0; i < sizeof(ErtsDigit); i++) {
+                *buf++ = d;
+                d >>= 8;
             }
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                *buf++ = d;
+                d >>= 8;
+            } while (--n > 0);
         }
+        /* pad */
+        while (num_bytes) {
+            *buf++ = 0;
+            num_bytes--;
+        }
+    }
 
-        /* adjust MSB */
-        if (offs) {
-            buf--;
-            *buf <<= (8 - offs);
+    /* adjust MSB */
+    if (offs) {
+        buf--;
+        *buf <<= (8 - offs);
+    }
+}
+
+#undef SUBc
+
+static void
+restore_and_shift(byte *buf, Uint orig_byte, Uint bit_offset, Uint num_bits)
+{
+    Uint rshift = bit_offset;
+    Uint lshift = 8 - bit_offset;
+    Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
+    Uint lmask = MAKE_MASK(8 - bit_offset);
+    Uint count = (num_bits - lshift) / 8;
+    Uint bits, bits1;
+
+    ASSERT(num_bits - lshift >= 0);
+
+    bits = *buf;
+    bits1 = bits >> rshift;
+    *buf = MASK_BITS(bits1, orig_byte, lmask);
+    buf++;
+
+    while (count--) {
+        bits1 = bits << lshift;
+        bits = *buf;
+        *buf++ = bits1 | (bits >> rshift);
+    }
+
+    if (deoffs) {
+        Uint rmask = MAKE_MASK(deoffs) << (8 - deoffs);
+
+        bits1 = bits << lshift;
+        if ((rmask << rshift) & 0xff) {
+            bits = *buf;
+            bits1 |= (bits >> rshift);
         }
-    } else {   /* BIG ENDIAN */
-        ErtsDigit acc = 0;
-        ErtsDigit d;
+        *buf = MASK_BITS(bits1, *buf, rmask);
+    }
+}
 
-        buf += num_bytes - 1;       /* end of buffer */
-        num_bytes -= n;             /* pad with this amount */
-        offs = offs ? (8-offs) : 0; /* shift offset */
+int
+erts_bs_put_integer_be(ErlBitsState *EBS, Eterm arg, Uint num_bits)
+{
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint bin_offset = EBS->erts_bin_offset;
+    Uint bit_offset;
+    byte b;
+    byte *iptr;
 
-        if (sign) {             /* negative bignum */
-            int c = 1;
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
 
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
+    bit_offset = BIT_OFFSET(bin_offset);
+    if (is_small(arg)) {
+        Uint rbits = 8 - bit_offset;
 
-                d = *dp++;
-                acc |= d << offs;
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= ((ErtsDigit)*dp << offs);
-                do {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc >>= 8;
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_be(iptr, arg, num_bits);
             }
-        } else {                /* positive bignum */
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else {		/* Big endian */
+            /*
+             * Big-endian, more than one byte, but not aligned on a byte boundary.
+             * Handle the bits up to the next byte boundary specially,
+             * then let fmt_small_be() handle the rest.
+             */
+            Uint shift_count = num_bits - rbits;
+            Sint val = signed_val(arg);
 
-                d = *dp++;
-                acc |= d << offs;
-                *buf-- = acc;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    *buf-- = acc;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= (*dp << offs);
-                do {
-                    *buf-- = acc;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            while (num_bytes--) {
-                *buf-- = acc;
-                acc >>= 8;
+            ASSERT(num_bits > rbits);
+            b = *iptr & (0xff << rbits);
+
+            /*
+             * Shifting with a shift count greater than or equal to the word
+             * size may be a no-op (instead of 0 the result may be the unshifted
+             * value). Therefore, only do the shift and the OR if the shift count
+             * is less than the word size if the number is positive; if negative,
+             * we must simulate the sign extension.
+             */
+            if (shift_count < sizeof(Uint)*8) {
+                b |= (val >> shift_count) & MAKE_MASK(rbits);
+            } else if (val < 0) {
+                /* Simulate sign extension. */
+                b |= (-1) & MAKE_MASK(rbits);
             }
+            *iptr++ = b;
+
+            fmt_small_be(iptr, arg, shift_count);
         }
+    } else if (is_big(arg) && bit_offset == 0) {
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_be(iptr, arg, num_bits);
+    } else if (is_big(arg) && bit_offset + num_bits <= 8) {
+        /*
+         * All bits are in the same byte.
+         */
+        Uint rbits = 8 - bit_offset;
+        Sint sign = big_sign(arg);
+        ErtsDigit* dp = big_v(arg);
+        Uint val = sign ? -*dp : *dp;
+
+        b = *iptr & (0xff << rbits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
+        *iptr = b;
+    } else if (is_big(arg)) {
+        /*
+         * Big number, not aligned on a byte boundary.
+         *
+         * Format the integer byte-aligned using the binary itself as
+         * a temporary buffer.
+         */
+        b = *iptr;
+        fmt_big_be(iptr, arg, num_bits);
+
+        /*
+         * Now restore the overwritten bits of the first byte and
+         * shift everything to the right.
+         */
+        restore_and_shift(iptr, b, bit_offset, num_bits);
+    } else {
+        /* Not an integer. */
+        return 0;
     }
+    EBS->erts_bin_offset = bin_offset + num_bits;
+    return 1;
 }
 
 int
-erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
+erts_bs_put_integer_le(ErlBitsState *EBS, Eterm arg, Uint num_bits)
 {
     byte* dst_bin = EBS->erts_current_bin;
     Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
-    Uint b;
+    byte b;
     byte *iptr;
 
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
+
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
     bit_offset = BIT_OFFSET(bin_offset);
     if (is_small(arg)) {
-	Uint rbits = 8 - bit_offset;
-
-	if (num_bits == 0) {
-	    return 1;
-	} else if (bit_offset + num_bits <= 8) {
-	    /*
-	     * All bits are in the same byte.
-	     */
-	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-	    b |= (signed_val(arg) & ((1 << num_bits)-1)) << (rbits-num_bits);
-	    *iptr = b;
-	} else if (bit_offset == 0) {
-	    /*
-	     * More than one bit, starting at a byte boundary.
-	     */
-            iptr = dst_bin + BYTE_OFFSET(bin_offset);
-            fmt_small(iptr, NBYTES(num_bits), arg, num_bits, flags);
-	} else if (flags & BSF_LITTLE) {
+        Uint rbits = 8 - bit_offset;
+
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_le(iptr, arg, num_bits);
+            }
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else if (BIT_OFFSET(num_bits) == 0) {
             /*
              * Little endian small in more than one byte, not
-             * aligned on a byte boundary.
+             * aligned on a byte boundary. The size is evenly
+             * divisible by 8, which means that there will be
+             * one partial byte, followed by zero or more
+             * complete bytes, followed by a final partial byte.
              */
             Sint val = signed_val(arg);
             Uint rshift = bit_offset;
@@ -772,92 +972,51 @@ erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
             Uint count = (num_bits - rbits) / 8;
             Uint bits, bits1;
 
-            iptr = dst_bin + BYTE_OFFSET(bin_offset);
-
-            if (BIT_OFFSET(num_bits) == 0) {
-                bits = val;
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
-                val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    bits = val & 0xff;
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                *iptr = bits << lshift;
-            } else {
-                Sint num_bytes = NBYTES(num_bits) - 1;
-                Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-
-                if (num_bytes-- > 0) {
-                    bits = val;
-                } else {
-                    bits = (val << (8 - BIT_OFFSET(num_bits)));
-                }
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
+            /* Handle the first partial byte. */
+            bits = val;
+            bits1 = bits >> rshift;
+            *iptr = MASK_BITS(bits1, *iptr, lmask);
+            iptr++;
+            val >>= 8;
+
+            /* Handle all complete bytes. */
+            while (count--) {
+                bits1 = bits << lshift;
+                bits = val & 0xff;
+                *iptr++ = bits1 | (bits >> rshift);
                 val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    if (num_bytes-- > 0) {
-                        bits = val & 0xff;
-                    } else {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                    }
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                if (deoffs) {
-                    bits1 = bits << lshift;
-                    if (rshift < deoffs) {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                        bits1 |= bits >> rshift;
-                    }
-                    *iptr = bits1;
-                }
             }
-	} else {		/* Big endian */
-	    /*
-	     * Big-endian, more than one byte, but not aligned on a byte boundary.
-	     * Handle the bits up to the next byte boundary specially,
-	     * then let fmt_int() handle the rest.
-	     */
-	    Uint shift_count = num_bits - rbits;
-	    Sint val = signed_val(arg);
-	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-
-	    /*
-	     * Shifting with a shift count greater than or equal to the word
-	     * size may be a no-op (instead of 0 the result may be the unshifted
-	     * value). Therefore, only do the shift and the OR if the shift count
-	     * is less than the word size if the number is positive; if negative,
-	     * we must simulate the sign extension.
-	     */
-	    if (shift_count < sizeof(Uint)*8) {
-		b |= (val >> shift_count) & ((1 << rbits) - 1);
-	    } else if (val < 0) {
-		/* Simulate sign extension. */
-		b |= (-1) & ((1 << rbits) - 1);
-	    }
-	    *iptr++ = b;
 
-            fmt_small(iptr, NBYTES(num_bits-rbits), arg, num_bits-rbits, flags);
-	}
+            /* Handle the final partial byte. */
+            *iptr = bits << lshift;
+        } else {
+            /*
+             * Little endian small in more than one byte, not aligned
+             * on a byte boundary, and the size is not evenly
+             * divisible by 8.
+             *
+             * Now this gets complicated. We used to handle this
+             * directly, but since this case is presumably uncommon,
+             * we do this in a simpler way in two steps.
+             *
+             * First format the integer byte-aligned using the binary
+             * itself as a temporary buffer.
+             */
+            b = *iptr;
+            fmt_small_le(iptr, arg, num_bits);
+
+            /*
+             * Now restore the overwritten bits of the first byte and
+             * shift everything to the right.
+             */
+            restore_and_shift(iptr, b, bit_offset, num_bits);
+        }
     } else if (is_big(arg) && bit_offset == 0) {
-	/*
-	 * Big number, aligned on a byte boundary. We can format the
-	 * integer directly into the binary.
-	 */
-	fmt_big(dst_bin + BYTE_OFFSET(bin_offset),
-                NBYTES(num_bits), arg, num_bits, flags);
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_le(iptr, arg, num_bits);
     } else if (is_big(arg) && bit_offset + num_bits <= 8) {
         /*
          * All bits are in the same byte.
@@ -867,55 +1026,24 @@ erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
         ErtsDigit* dp = big_v(arg);
         Uint val = sign ? -*dp : *dp;
 
-        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr & (0xff << rbits);
-        b |= (val & ((1 << num_bits)-1)) << (rbits-num_bits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
         *iptr = b;
     } else if (is_big(arg)) {
         /*
          * Big number, not aligned on a byte boundary.
-         */
-        Uint rshift = bit_offset;
-        Uint lshift = 8 - bit_offset;
-        Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-        Uint lmask = MAKE_MASK(8 - bit_offset);
-        Uint rmask = (deoffs) ? (MAKE_MASK(deoffs)<<(8-deoffs)) : 0;
-        Uint count = (num_bits - lshift) / 8;
-        Uint bits, bits1;
-
-        ASSERT(num_bits - lshift >= 0);
-
-        /*
+         *
          * Format the integer byte-aligned using the binary itself as
          * a temporary buffer.
          */
-        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr;
-        fmt_big(iptr, NBYTES(num_bits), arg, num_bits, flags);
+        fmt_big_le(iptr, arg, num_bits);
 
         /*
          * Now restore the overwritten bits of the first byte and
          * shift everything to the right.
          */
-        bits = *iptr;
-        bits1 = bits >> rshift;
-        *iptr = MASK_BITS(bits1, b, lmask);
-        iptr++;
-
-        while (count--) {
-            bits1 = bits << lshift;
-            bits = *iptr;
-            *iptr++ = bits1 | (bits >> rshift);
-        }
-
-        if (rmask) {
-            bits1 = bits << lshift;
-            if ((rmask << rshift) & 0xff) {
-                bits = *iptr;
-                bits1 |= (bits >> rshift);
-            }
-            *iptr = MASK_BITS(bits1, *iptr, rmask);
-        }
+        restore_and_shift(iptr, b, bit_offset, num_bits);
     } else {
         /* Not an integer. */
         return 0;
diff --git a/erts/emulator/beam/erl_bits.h b/erts/emulator/beam/erl_bits.h
index 0700fb01f053..72e45143a0bf 100644
--- a/erts/emulator/beam/erl_bits.h
+++ b/erts/emulator/beam/erl_bits.h
@@ -216,8 +216,8 @@ Eterm erts_bs_get_binary_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits
 Eterm erts_bs_get_binary_all_2(Process *p, ErlSubBits* sb);
 
 /* Binary construction, new instruction set. */
-int erts_bs_put_integer(ErlBitsState *EBS, Eterm Integer, Uint num_bits,
-                        unsigned flags);
+int erts_bs_put_integer_be(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
+int erts_bs_put_integer_le(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
 #if !defined(BEAMASM)
 int erts_bs_put_utf8(ErlBitsState *EBS, Eterm Integer);
 #endif
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp
index e0080466c47f..d28c40321cce 100644
--- a/erts/emulator/beam/jit/arm/instr_bs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_bs.cpp
@@ -2496,10 +2496,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                      * integer into the binary. */
                     load_erl_bits_state(ARG1);
                     mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
 
                     emit_enter_runtime(Live.get());
-                    runtime_call<4>(erts_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     emit_leave_runtime(Live.get());
 
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
@@ -2563,10 +2566,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             load_erl_bits_state(ARG1);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp
index b43351b871ba..4a7f54771750 100644
--- a/erts/emulator/beam/jit/x86/instr_bs.cpp
+++ b/erts/emulator/beam/jit/x86/instr_bs.cpp
@@ -2599,9 +2599,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                      * integer into the binary. */
                     runtime_entered = bs_maybe_enter_runtime(runtime_entered);
                     mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
                     load_erl_bits_state(ARG1);
-                    runtime_call<4>(erts_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
                         comment("skipped test for success because construction "
                                 "can't fail");
@@ -2660,9 +2663,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
             load_erl_bits_state(ARG1);
-            runtime_call<4>(erts_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             if (Fail.get() == 0) {
                 mov_arg(ARG1, seg.src);
                 mov_imm(ARG4,

From 10b6aee161ea5eb00d8d5928bfd05090d94c3e8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sun, 12 May 2024 16:05:16 +0200
Subject: [PATCH 4/4] Split an overly general function into two functions

The `erts_copy_bits()` function is overly general, preventing the C
compiler from doing any serious loop optimization. Replace it with the
new `erts_copy_bits_fwd()` and `erts_copy_bits_rev()` functions.
---
 erts/emulator/beam/erl_bits.c         | 354 +++++++++++++++++---------
 erts/emulator/beam/erl_bits.h         |  14 +-
 erts/emulator/beam/erl_nif.c          |   2 +-
 erts/emulator/beam/erl_term_hashing.c |  12 +-
 4 files changed, 255 insertions(+), 127 deletions(-)

diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index b489c74c6f05..a3858841283a 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -302,15 +302,15 @@ Process *p, Uint num_bits, unsigned flags, ErlSubBits *sb)
      * Move bits to temporary buffer. We want the buffer to be stored in
      * little-endian order, since bignums are little-endian.
      */
-    
+
     if (flags & BSF_LITTLE) {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       LSB, 0, 1, num_bits);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           LSB, 0, num_bits);
 	*MSB >>= offs;		/* adjust msb */
     } else {
 	*MSB = 0;
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       MSB, offs, -1, num_bits);
+        erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start,
+                           MSB, offs, num_bits);
     }
     sb->start += num_bits;
 
@@ -472,13 +472,13 @@ erts_bs_get_float_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits *sb)
     }
 
     if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-		  fptr, 0, 1,
-		  num_bits);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           fptr, 0,
+                           num_bits);
     } else {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-		  fptr + NBYTES(num_bits) - 1, 0, -1,
-		  num_bits);
+        erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start,
+                           fptr + NBYTES(num_bits) - 1, 0,
+                           num_bits);
     }
     ERTS_FP_CHECK_INIT(p);
     if (num_bits == 16) {
@@ -1104,7 +1104,7 @@ erts_bs_put_utf8(ErlBitsState *EBS, Eterm arg)
     }
 
     if (bin_offset != 0) {
-        erts_copy_bits(dst, 0, 1, EBS->erts_current_bin, bin_offset, 1, num_bits);
+        erts_copy_bits_fwd(dst, 0, EBS->erts_current_bin, bin_offset, num_bits);
     }
 
     EBS->erts_bin_offset += num_bits;
@@ -1169,7 +1169,7 @@ erts_bs_put_utf16(ErlBitsState *EBS, Eterm arg, Uint flags)
     }
 
     if (bin_offset != 0) {
-	erts_copy_bits(dst, 0, 1, EBS->erts_current_bin, bin_offset, 1, num_bits);
+	erts_copy_bits_fwd(dst, 0, EBS->erts_current_bin, bin_offset, num_bits);
     }
 
     EBS->erts_bin_offset += num_bits;
@@ -1511,13 +1511,21 @@ erts_bs_put_float(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits, int
 	    return make_small(num_bits);
 	}
 	if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	    erts_copy_bits(bptr, 0, 1,
-                           EBS->erts_current_bin,
-                           EBS->erts_bin_offset, 1, num_bits);
+            erts_copy_bits_fwd(bptr, 0,
+                               EBS->erts_current_bin,
+                               EBS->erts_bin_offset, num_bits);
 	} else {
-	    erts_copy_bits(bptr+NBYTES(num_bits)-1, 0, -1,
-			   EBS->erts_current_bin, EBS->erts_bin_offset, 1,
-			   num_bits);
+            byte tmp_buf[8];
+            Uint n = BYTE_OFFSET(num_bits);
+            byte *dst = tmp_buf + n;
+
+            do {
+                *--dst = *bptr++;
+            } while (--n != 0);
+
+            erts_copy_bits_fwd(tmp_buf, 0,
+                               EBS->erts_current_bin,
+                               EBS->erts_bin_offset, num_bits);
 	}
     }
     EBS->erts_bin_offset += num_bits;
@@ -1532,9 +1540,9 @@ erts_bs_put_string(ErlBitsState* EBS, byte* iptr, Uint num_bytes)
 
     EBS->erts_bin_offset = dst_offset + num_bytes * 8;
     if (BIT_OFFSET(dst_offset) != 0) {
-        erts_copy_bits(iptr, 0, 1,
-                       dst_bin, dst_offset, 1,
-                       num_bytes*8);
+        erts_copy_bits_fwd(iptr, 0,
+                           dst_bin, dst_offset,
+                           num_bytes * 8);
     } else {
         sys_memcpy(dst_bin + BYTE_OFFSET(dst_offset), iptr, num_bytes);
     }
@@ -1966,7 +1974,7 @@ erts_bs_get_unaligned_uint32(ErlSubBits* sb)
     byte bigbuf[4];
     byte* LSB;
     byte* MSB;
-	
+
     CHECK_MATCH_BUFFER(sb);
     ASSERT((sb->start & 7) != 0);
     ASSERT(sb->end - sb->start >= 32);
@@ -1978,7 +1986,7 @@ erts_bs_get_unaligned_uint32(ErlSubBits* sb)
     MSB = LSB + bytes - 1;
 
     *MSB = 0;
-    erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1, MSB, offs, -1, 32);
+    erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start, MSB, offs, 32);
     return LSB[0] | (LSB[1]<<8) | (LSB[2]<<16) | (LSB[3]<<24);
 }
 
@@ -2001,7 +2009,7 @@ erts_align_utf8_bytes(ErlSubBits *sb, byte* buf)
     } else {
 	bits = 16;
     }
-    erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1, buf, 0, 1, bits);
+    erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start, buf, 0, bits);
 }
 
 Eterm
@@ -2126,8 +2134,8 @@ erts_bs_get_utf16(ErlSubBits *sb, Uint flags)
 	 * get 4 bytes, otherwise two bytes.
 	 */
 	Uint n = num_bits < 32 ? 16 : 32;
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       tmp_buf, 0, 1, n);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           tmp_buf, 0, n);
 	src = tmp_buf;
     }
     
@@ -2261,18 +2269,141 @@ int erts_cmp_bits__(const byte *a_ptr,
 
 /*
  * The basic bit copy operation. Copies n bits from the source buffer to
- * the destination buffer. Depending on the directions, it can reverse the
- * copied bits.
+ * the destination buffer.
  */
 
+void
+erts_copy_bits_fwd(const byte* src, /* Base pointer to source. */
+                   size_t soffs,    /* Bit offset for source relative to src. */
+                   byte* dst,       /* Base pointer to destination. */
+                   size_t doffs,    /* Bit offset for destination relative to dst. */
+                   size_t n)        /* Number of bits to copy. */
+{
+    Uint lmask;
+    Uint rmask;
+    Uint count;
+    Uint deoffs;
+
+    if (n == 0) {
+        return;
+    }
+
+    src += BYTE_OFFSET(soffs);
+    dst += BYTE_OFFSET(doffs);
+    soffs = BIT_OFFSET(soffs);
+    doffs = BIT_OFFSET(doffs);
+    deoffs = BIT_OFFSET(doffs+n);
+    lmask = (doffs) ? MAKE_MASK(8-doffs) : 0;
+    rmask = (deoffs) ? (MAKE_MASK(deoffs)<<(8-deoffs)) : 0;
+
+    /*
+     * Take care of the case that all bits are in the same byte.
+     */
+
+    if (doffs+n < 8) {		/* All bits are in the same byte */
+        lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
+
+        if (soffs == doffs) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+        } else if (soffs > doffs) {
+            Uint bits = (*src << (soffs-doffs));
+            if (soffs+n > 8) {
+                src++;
+                bits |= (*src >> (8-(soffs-doffs)));
+            }
+            *dst = MASK_BITS(bits, *dst, lmask);
+        } else {
+            *dst = MASK_BITS((*src >> (doffs-soffs)), *dst, lmask);
+        }
+        return;			/* We are done! */
+    }
+
+    /*
+     * At this point, we know that the bits are in 2 or more bytes.
+     */
+
+    count = (lmask ? (n - (8 - doffs)) : n) >> 3;
+
+    if (soffs == doffs) {
+        /*
+         * The bits are aligned in the same way. We can just copy the bytes
+         * (except for the first and last bytes).
+         */
+
+        if (lmask) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+            dst++, src++;
+        }
+
+        sys_memcpy(dst, src, count);
+
+        if (rmask) {
+            dst += count;
+            src += count;
+            *dst = MASK_BITS(*src, *dst, rmask);
+        }
+    } else {
+        Uint bits;
+        Uint bits1;
+        Uint rshift;
+        Uint lshift;
+
+        /*
+         * The tricky case. The bits must be shifted into position.
+         */
+
+        if (soffs > doffs) {
+            lshift = soffs - doffs;
+            rshift = 8 - lshift;
+            bits = *src;
+            if (soffs + n > 8) {
+                src++;
+            }
+        } else {
+            rshift = doffs - soffs;
+            lshift = 8 - rshift;
+            bits = 0;
+        }
+
+        if (lmask) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            bits1 |= (bits >> rshift);
+            *dst = MASK_BITS(bits1, *dst, lmask);
+            dst++;
+        }
+
+        while (count--) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            *dst = bits1 | (bits >> rshift);
+            dst++;
+        }
+
+        if (rmask) {
+            bits1 = bits << lshift;
+            if ((rmask << rshift) & 0xff) {
+                bits = *src;
+                bits1 |= (bits >> rshift);
+            }
+            *dst = MASK_BITS(bits1, *dst, rmask);
+        }
+    }
+}
+
+/*
+ * The reverse bit copy operation. Copies n bits from the source
+ * buffer to the destination buffer. The bits are read 8 bits at the
+ * time from the source buffer, while incrementing the source buffer
+ * pointer. The 8 bit groups are stored into the destination buffer,
+ * while decrementing the destination buffer pointer.
+ */
 
-void 
-erts_copy_bits(const byte* src, /* Base pointer to source. */
+void
+erts_copy_bits_rev(const byte* src, /* Base pointer to source. */
                size_t soffs,    /* Bit offset for source relative to src. */
-               int sdir,        /* Direction: 1 (forward) or -1 (backward). */
                byte* dst,       /* Base pointer to destination. */
                size_t doffs,    /* Bit offset for destination relative to dst. */
-               int ddir,        /* Direction: 1 (forward) or -1 (backward). */
                size_t n)        /* Number of bits to copy. */
 {
     Uint lmask;
@@ -2281,11 +2412,11 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
     Uint deoffs;
 
     if (n == 0) {
-	return;
+        return;
     }
 
-    src += sdir*BYTE_OFFSET(soffs);
-    dst += ddir*BYTE_OFFSET(doffs);
+    src += BYTE_OFFSET(soffs);
+    dst -= BYTE_OFFSET(doffs);
     soffs = BIT_OFFSET(soffs);
     doffs = BIT_OFFSET(doffs);
     deoffs = BIT_OFFSET(doffs+n);
@@ -2297,21 +2428,21 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
      */
 
     if (doffs+n < 8) {		/* All bits are in the same byte */
-	lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
-
-	if (soffs == doffs) {
-	    *dst = MASK_BITS(*src,*dst,lmask);
-	} else if (soffs > doffs) {
-	    Uint bits = (*src << (soffs-doffs));
-	    if (soffs+n > 8) {
-		src += sdir;
-		bits |= (*src >> (8-(soffs-doffs)));
-	    }
-	    *dst = MASK_BITS(bits,*dst,lmask);
-	} else {
-	    *dst = MASK_BITS((*src >> (doffs-soffs)),*dst,lmask);
-	}
-	return;			/* We are done! */
+        lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
+
+        if (soffs == doffs) {
+            *dst = MASK_BITS(*src,*dst,lmask);
+        } else if (soffs > doffs) {
+            Uint bits = (*src << (soffs-doffs));
+            if (soffs+n > 8) {
+                src++;
+                bits |= (*src >> (8-(soffs-doffs)));
+            }
+            *dst = MASK_BITS(bits,*dst,lmask);
+        } else {
+            *dst = MASK_BITS((*src >> (doffs-soffs)),*dst,lmask);
+        }
+        return;			/* We are done! */
     }
 
     /*
@@ -2321,75 +2452,70 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
     count = ((lmask) ? (n - (8 - doffs)) : n) >> 3;
 
     if (soffs == doffs) {
-	/*
-	 * The bits are aligned in the same way. We can just copy the bytes
-	 * (except for the first and last bytes). Note that the directions
-	 * might be different, so we can't just use memcpy().
-	 */
+        /*
+         * The bits are aligned in the same way. We can just copy the bytes
+         * (except for the first and last bytes).
+         */
 
-	if (lmask) {
-	    *dst = MASK_BITS(*src, *dst, lmask);
-	    dst += ddir;
-	    src += sdir;
-	}
+        if (lmask) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+            dst--, src++;
+        }
 
-	while (count--) {
-	    *dst = *src;
-	    dst += ddir;
-	    src += sdir;
-	}
+        while (count--) {
+            *dst-- = *src++;
+        }
 
-	if (rmask) {
-	    *dst = MASK_BITS(*src,*dst,rmask);
-	}
+        if (rmask) {
+            *dst = MASK_BITS(*src, *dst, rmask);
+        }
     } else {
-	Uint bits;
-	Uint bits1;
-	Uint rshift;
-	Uint lshift;
+        Uint bits;
+        Uint bits1;
+        Uint rshift;
+        Uint lshift;
 
-	/*
-	 * The tricky case. The bits must be shifted into position.
-	 */
-	
-	if (soffs > doffs) {
-	    lshift = (soffs - doffs);
-	    rshift = 8 - lshift;
-	    bits = *src;
-	    if (soffs + n > 8) {
-		src += sdir;
-	    }
-	} else {
-	    rshift = (doffs - soffs);
-	    lshift = 8 - rshift;
-	    bits = 0;
-	}
-	    
-	if (lmask) {
-	    bits1 = bits << lshift;
-	    bits = *src;
-	    src += sdir;
-	    bits1 |= (bits >> rshift);
-	    *dst = MASK_BITS(bits1,*dst,lmask);
-	    dst += ddir;
-	}
+        /*
+         * The tricky case. The bits must be shifted into position.
+         */
 
-	while (count--) {
-	    bits1 = bits << lshift;
-	    bits = *src;
-	    src += sdir;
-	    *dst = bits1 | (bits >> rshift);
-	    dst += ddir;
-	}
-	
-	if (rmask) {
-	    bits1 = bits << lshift;
-	    if ((rmask << rshift) & 0xff) {
-		bits = *src;
-		bits1 |= (bits >> rshift);
-	    }
-	    *dst = MASK_BITS(bits1,*dst,rmask);
-	}
+        if (soffs > doffs) {
+            lshift = (soffs - doffs);
+            rshift = 8 - lshift;
+            bits = *src;
+            if (soffs + n > 8) {
+                src++;
+            }
+        } else {
+            rshift = doffs - soffs;
+            lshift = 8 - rshift;
+            bits = 0;
+        }
+
+        if (lmask) {
+            bits1 = bits << lshift;
+            bits = *src;
+            src++;
+            bits1 |= (bits >> rshift);
+            *dst = MASK_BITS(bits1, *dst, lmask);
+            dst--;
+        }
+
+        while (count--) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            *dst = bits1 | (bits >> rshift);
+            dst--;
+        }
+
+        if (rmask) {
+            bits1 = bits << lshift;
+            if ((rmask << rshift) & 0xff) {
+                bits = *src;
+                bits1 |= (bits >> rshift);
+            }
+            *dst = MASK_BITS(bits1, *dst, rmask);
+        }
     }
 }
 
diff --git a/erts/emulator/beam/erl_bits.h b/erts/emulator/beam/erl_bits.h
index 72e45143a0bf..6d4200e4a2b1 100644
--- a/erts/emulator/beam/erl_bits.h
+++ b/erts/emulator/beam/erl_bits.h
@@ -245,8 +245,10 @@ copy_binary_to_buffer(byte *dst_base, Uint dst_offset,
                       const byte *src_base, Uint src_offset,
                       Uint size);
 
-void erts_copy_bits(const byte* src, size_t soffs, int sdir,
-                    byte* dst, size_t doffs, int ddir, size_t n);
+void erts_copy_bits_fwd(const byte* src, size_t soffs,
+                        byte* dst, size_t doffs, size_t n);
+void erts_copy_bits_rev(const byte* src, size_t soffs,
+                        byte* dst, size_t doffs, size_t n);
 
 ERTS_GLB_INLINE int erts_cmp_bits(const byte* a_ptr,
                                   Uint a_offs,
@@ -532,9 +534,9 @@ copy_binary_to_buffer(byte *dst_base, Uint dst_offset,
         if (((dst_offset | src_offset | size) & 7) == 0) {
             sys_memcpy(dst_base, src_base, BYTE_SIZE(size));
         } else {
-            erts_copy_bits(src_base, BIT_OFFSET(src_offset), 1,
-                           dst_base, BIT_OFFSET(dst_offset), 1,
-                           size);
+            erts_copy_bits_fwd(src_base, BIT_OFFSET(src_offset),
+                               dst_base, BIT_OFFSET(dst_offset),
+                               size);
         }
     }
 }
@@ -606,7 +608,7 @@ erts_get_aligned_binary_bytes_extra(Eterm bin,
                                                 NBYTES(size) + extra);
                 *base_ptr = bytes;
 
-                erts_copy_bits(base, offset, 1, &bytes[extra], 0, 1, size);
+                erts_copy_bits_fwd(base, offset, &bytes[extra], 0, size);
                 return &bytes[extra];
             }
 
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 2563d40ad563..0e554a336a99 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -1352,7 +1352,7 @@ int enif_inspect_binary(ErlNifEnv* env, Eterm bin_term, ErlNifBinary* bin)
                 env->tmp_obj_list = tmp_obj;
 
                 bin->data = (byte*)&tmp_obj[1];
-                erts_copy_bits(base, offset, 1, bin->data, 0, 1, size);
+                erts_copy_bits_fwd(base, offset, bin->data, 0, size);
             } else {
                 bin->data = &base[BYTE_OFFSET(offset)];
             }
diff --git a/erts/emulator/beam/erl_term_hashing.c b/erts/emulator/beam/erl_term_hashing.c
index 796dd9574b0c..dae3bba60f64 100644
--- a/erts/emulator/beam/erl_term_hashing.c
+++ b/erts/emulator/beam/erl_term_hashing.c
@@ -1276,8 +1276,8 @@ make_hash2_helper(Eterm term_param, const int can_trap, Eterm* state_mref_write_
                     byte *buf = erts_alloc(ERTS_ALC_T_TMP, nr_of_bytes);
                     Uint nr_of_bits_to_copy = ctx.sz*BYTE_BITS+ctx.bitsize;
                     if (can_trap) iterations_until_trap -= iters_for_bin;
-                    erts_copy_bits(ctx.bptr,
-                                   ctx.bitoffs, 1, buf, 0, 1, nr_of_bits_to_copy);
+                    erts_copy_bits_fwd(ctx.bptr, ctx.bitoffs,
+                                       buf, 0, nr_of_bits_to_copy);
                     hash = block_hash(buf, ctx.sz, con);
                     if (ctx.bitsize > 0) {
                         UINT32_HASH_2(ctx.bitsize,
@@ -1312,9 +1312,9 @@ make_hash2_helper(Eterm term_param, const int can_trap, Eterm* state_mref_write_
                         Uint nr_of_bits_to_copy =
                             MIN(nr_of_bits_left, BINARY_BUF_SIZE_BITS);
                         ctx.done = nr_of_bits_left == nr_of_bits_to_copy;
-                        erts_copy_bits(ctx.bptr + ctx.no_bytes_processed,
-                                       ctx.bitoffs, 1, ctx.buf, 0, 1,
-                                       nr_of_bits_to_copy);
+                        erts_copy_bits_fwd(ctx.bptr + ctx.no_bytes_processed,
+                                           ctx.bitoffs, ctx.buf, 0,
+                                           nr_of_bits_to_copy);
                         block_hash_buffer(ctx.buf,
                                           bytes_to_process,
                                           block_hash_ctx);
@@ -1948,7 +1948,7 @@ make_internal_hash(Eterm term, erts_ihash_t salt)
                     if (BIT_OFFSET(offset) != 0) {
                         byte *tmp = (byte*)erts_alloc(ERTS_ALC_T_TMP,
                                                       NBYTES(size));
-                        erts_copy_bits(data, offset, 1, tmp, 0, 1, size);
+                        erts_copy_bits_fwd(data, offset, tmp, 0, size);
                         bytes = tmp;
                     } else {
                         bytes = &data[BYTE_OFFSET(offset)];