diff --git a/erts/emulator/beam/emu/bs_instrs.tab b/erts/emulator/beam/emu/bs_instrs.tab
index 1150b1172781..1c41b5b3ccb7 100644
--- a/erts/emulator/beam/emu/bs_instrs.tab
+++ b/erts/emulator/beam/emu/bs_instrs.tab
@@ -373,7 +373,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
     const BeamInstr* p;
     Uint alloc = $Alloc;
     Eterm new_binary;
-    ERL_BITS_DECLARE_STATEP; /* Has to be last declaration */
+    ErlBitsState* EBS = ERL_BITS_EBS_FROM_REG(reg);
 
     /* We count the total number of bits in an unsigned integer. To avoid
      * having to check for overflow when adding to `num_bits`, we ensure that
@@ -546,7 +546,6 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
     }
 
     /* Allocate binary. */
-    ERL_BITS_RELOAD_STATEP(c_p);
     p = p_start;
     if (p[0] == BSC_APPEND) {
         Uint live = $Live;
@@ -565,15 +564,13 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         }
         p_start += BSC_NUM_ARGS;
     } else if (p[0] == BSC_PRIVATE_APPEND) {
-        Uint unit;
         Eterm Src;
 
         $test_heap(alloc, $Live);
 
-        $BS_LOAD_UNIT(p, unit);
         $BS_LOAD_SRC(p, Src);
 
-        new_binary = erts_bs_private_append_checked(c_p, Src, num_bits, unit);
+        new_binary = erts_bs_private_append_checked(EBS, c_p, Src, num_bits);
 
         if (is_non_value(new_binary)) {
             $BS_FAIL_INFO($Fail, c_p->freason, c_p->fvalue, Src);
@@ -589,7 +586,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         /* num_bits = Number of bits to build
          * alloc = Total number of words to allocate on heap
          */
-        erts_bin_offset = 0;
+        EBS->erts_bin_offset = 0;
         if (num_bits <= ERL_ONHEAP_BITS_LIMIT) {
             ErlHeapBits *hb;
 
@@ -598,7 +595,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             HTOP += heap_bits_size(num_bits);
             hb->thing_word = header_heap_bits(num_bits);
             hb->size = num_bits;
-            erts_current_bin = (byte *) hb->data;
+            EBS->erts_current_bin = (byte *) hb->data;
             new_binary = make_bitstring(hb);
         } else {
             Binary* bptr;
@@ -608,7 +605,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
                             $Live);
 
             bptr = erts_bin_nrml_alloc(NBYTES(num_bits));
-            erts_current_bin = (byte *)bptr->orig_bytes;
+            EBS->erts_current_bin = (byte *)bptr->orig_bytes;
 
             LIGHT_SWAPOUT;
 
@@ -616,7 +613,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
                                                   &MSO(c_p).overhead,
                                                   &HEAP_TOP(c_p),
                                                   bptr,
-                                                  erts_current_bin,
+                                                  EBS->erts_current_bin,
                                                   0,
                                                   num_bits);
 
@@ -640,7 +637,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             byte* string;
             $BS_LOAD_STRING_SRC(p, string);
             $BS_LOAD_FIXED_SIZE(p, Size);
-            erts_new_bs_put_string(ERL_BITS_ARGS_2(string, Size));
+            erts_bs_put_string(EBS, string, Size);
             continue;
         }
 
@@ -649,7 +646,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         switch (p[0]) {
         case BSC_BINARY_ALL:
             $BS_LOAD_UNIT(p, unit);
-            if (!erts_new_bs_put_binary_all(c_p, Src, unit)) {
+            if (!erts_bs_put_binary_all(EBS, c_p, Src, unit)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_unit, Src);
             }
             break;
@@ -658,14 +655,14 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SIZE(p, Size);
             $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-            if (!erts_new_bs_put_binary(c_p, Src, _size)) {
+            if (!erts_bs_put_binary(EBS, c_p, Src, _size)) {
                 Eterm reason = is_bitstring(Src) ? am_short : am_type;
                 $BS_FAIL_INFO($Fail, BADARG, reason, Src);
             }
             break;
         case BSC_BINARY_FIXED_SIZE:
             $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_new_bs_put_binary(c_p, Src, Size)) {
+            if (!erts_bs_put_binary(EBS, c_p, Src, Size)) {
                 Eterm reason = is_bitstring(Src) ? am_short : am_type;
                 $BS_FAIL_INFO($Fail, BADARG, reason, Src);
             }
@@ -675,7 +672,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SIZE(p, Size);
             $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-            Src = erts_new_bs_put_float(c_p, Src, _size, flags);
+            Src = erts_bs_put_float(EBS, c_p, Src, _size, flags);
             if (is_value(Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, c_p->fvalue, Src);
             }
@@ -683,7 +680,7 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_FLOAT_FIXED_SIZE:
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_FIXED_SIZE(p, Size);
-            Src = erts_new_bs_put_float(c_p, Src, Size, flags);
+            Src = erts_bs_put_float(EBS, c_p, Src, Size, flags);
             if (is_value(Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, c_p->fvalue, Src);
             }
@@ -691,33 +688,48 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_INTEGER:
             {
                 Sint _size;
+                int result;
 
                 $BS_LOAD_UNIT(p, unit);
                 $BS_LOAD_FLAGS(p, flags);
                 $BS_LOAD_SIZE(p, Size);
                 $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-                if (!erts_new_bs_put_integer(ERL_BITS_ARGS_3(Src, _size, flags))) {
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, _size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, _size);
+                }
+                if (!result) {
                     $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
                 }
             }
             break;
         case BSC_INTEGER_FIXED_SIZE:
         case BSC_UTF32:
-            $BS_LOAD_FLAGS(p, flags);
-            $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_new_bs_put_integer(ERL_BITS_ARGS_3(Src, Size, flags))) {
-                $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+            {
+                int result;
+
+                $BS_LOAD_FLAGS(p, flags);
+                $BS_LOAD_FIXED_SIZE(p, Size);
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, Size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, Size);
+                }
+                if (!result) {
+                    $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+                }
             }
             break;
         case BSC_UTF8:
-            if (!erts_bs_put_utf8(ERL_BITS_ARGS_1(Src))) {
+            if (!erts_bs_put_utf8(EBS, Src)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
             }
             break;
         case BSC_UTF16:
             $BS_LOAD_FLAGS(p, flags);
             $BS_LOAD_SRC(p, Src);
-            if (!erts_bs_put_utf16(ERL_BITS_ARGS_2(Src, flags))) {
+            if (!erts_bs_put_utf16(EBS, Src, flags)) {
                 $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
             }
             break;
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 75413225b9cf..a3858841283a 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -302,15 +302,15 @@ Process *p, Uint num_bits, unsigned flags, ErlSubBits *sb)
      * Move bits to temporary buffer. We want the buffer to be stored in
      * little-endian order, since bignums are little-endian.
      */
-    
+
     if (flags & BSF_LITTLE) {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       LSB, 0, 1, num_bits);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           LSB, 0, num_bits);
 	*MSB >>= offs;		/* adjust msb */
     } else {
 	*MSB = 0;
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       MSB, offs, -1, num_bits);
+        erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start,
+                           MSB, offs, num_bits);
     }
     sb->start += num_bits;
 
@@ -472,13 +472,13 @@ erts_bs_get_float_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits *sb)
     }
 
     if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-		  fptr, 0, 1,
-		  num_bits);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           fptr, 0,
+                           num_bits);
     } else {
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-		  fptr + NBYTES(num_bits) - 1, 0, -1,
-		  num_bits);
+        erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start,
+                           fptr + NBYTES(num_bits) - 1, 0,
+                           num_bits);
     }
     ERTS_FP_CHECK_INIT(p);
     if (num_bits == 16) {
@@ -532,237 +532,438 @@ erts_bs_get_binary_all_2(Process *p, ErlSubBits *sb)
  ****************************************************************/
 
 
-/* COPY_VAL:
- * copy sz byte from val to dst buffer, 
- * dst, val are updated!!!
+/* FMT_COPY_VAL:
+ * Copy sz bytes from val to dst buffer;
+ * dst and val are updated.
  */
 
-#define COPY_VAL(dst,ddir,val,sz) do { \
-   Uint __sz = (sz); \
-   while(__sz) { \
-     switch(__sz) { \
-     default: \
-     case 4: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 3: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 2: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 1: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     } \
-   } \
+#define FMT_COPY_VAL(dst,ddir,val,sz) do {                      \
+   Uint __sz = (sz);                                            \
+   while (__sz) {                                               \
+     switch(__sz) {                                             \
+     default:                                                   \
+     case 8: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 7: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 6: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 5: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 4: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 3: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 2: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 1: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     }                                                          \
+   }                                                            \
  } while(0)
 
 static void
-fmt_small(byte *buf, Uint num_bytes, Eterm arg, Uint num_bits, Uint flags)
+fmt_small_be(byte *buf, Eterm arg, Uint num_bits)
 {
     Uint bit_offset;
     Sint val;
+    Uint num_bytes;
 
     ASSERT(is_small(arg));
     ASSERT(num_bits != 0);      /* Tested by caller */
 
+    num_bytes = NBYTES(num_bits);
     bit_offset = BIT_OFFSET(num_bits);
     val = signed_val(arg);
 
-    if (flags & BSF_LITTLE) { /* Little endian */
+    buf += num_bytes - 1;
+    if (bit_offset) {
+        *buf-- = val << (8-bit_offset);
         num_bytes--;
-        COPY_VAL(buf, 1, val, num_bytes);
-        *buf = bit_offset ? (val << (8-bit_offset)) : val;
-    } else {		/* Big endian */
-        buf += num_bytes - 1;
-        if (bit_offset) {
-            *buf-- = val << (8-bit_offset);
-            num_bytes--;
-            val >>= bit_offset;
-        }
-        COPY_VAL(buf, -1, val, num_bytes);
+        val >>= bit_offset;
     }
+    FMT_COPY_VAL(buf, -1, val, num_bytes);
 }
 
-/* calculate a - *cp (carry)  (store result in b), *cp is updated! */
-#define SUBc(a, cp, b) do { \
-   byte __x = (a); \
-   byte __y = (__x - (*(cp))); \
-   (*cp) = (__y > __x); \
-   *(b) = ~__y; \
- } while(0)
+static void
+fmt_small_le(byte *buf, Eterm arg, Uint num_bits)
+{
+    Uint bit_offset;
+    Sint val;
+    Uint num_bytes;
+
+    ASSERT(is_small(arg));
+    ASSERT(num_bits != 0);      /* Tested by caller */
+
+    num_bytes = NBYTES(num_bits);
+    bit_offset = BIT_OFFSET(num_bits);
+    val = signed_val(arg);
+    num_bytes--;
+    FMT_COPY_VAL(buf, 1, val, num_bytes);
+    *buf = bit_offset ? (val << (8-bit_offset)) : val;
+}
+
+#undef FMT_COPY_VAL
+
+/*
+ * Calculate 0 - a - c, where c is the incoming borrow, storing the
+ * low byte of the result in *b; set c to the outgoing borrow.
+ */
+#if __has_builtin(__builtin_subc) && !defined(DEBUG)
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       *(b) = __builtin_subc(0, (a), c, &c);    \
+   } while(0)
+#else
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       byte __x = (a);                          \
+       byte __y = (__x - !(c));                 \
+       c = !(__y > __x);                        \
+       *(b) = ~__y;                             \
+   } while(0)
+#endif
 
 static void
-fmt_big(byte *buf, Uint num_bytes, Eterm val, Uint num_bits, Uint flags)
+fmt_big_be(byte *buf, Eterm val, Uint num_bits)
 {
     unsigned long offs;
     int sign;
     Uint ds;
     ErtsDigit* dp;
     int n;
+    ErtsDigit acc = 0;
+    ErtsDigit d;
+    Uint num_bytes;
 
     ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
 
-    if (num_bits == 0) {
-        return;
-    }
-
+    num_bytes = NBYTES(num_bits);
     sign = big_sign(val);
     ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
     dp = big_v(val);
     n = MIN(num_bytes, ds);
 
     offs = BIT_OFFSET(num_bits);
-    if (flags & BSF_LITTLE) {
-        num_bytes -= n;         /* pad with this amount */
-        if (sign) {             /* negative */
-            int c = 1;
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for (i = 0; i < sizeof(ErtsDigit); i++) {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
+
+    buf += num_bytes - 1;       /* end of buffer */
+    num_bytes -= n;             /* pad with this amount */
+    offs = offs ? (8-offs) : 0; /* shift offset */
+
+    if (sign) {             /* negative bignum */
+        unsigned int c = 0;
+
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
             }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                } while (--n > 0);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= ((ErtsDigit)*dp << offs);
+            do {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc >>= 8;
+        }
+    } else {                /* positive bignum */
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            *buf-- = acc;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                *buf-- = acc;
+                acc >>= 8;
             }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(0, &c, buf);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= (*dp << offs);
+            do {
+                *buf-- = acc;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        while (num_bytes--) {
+            *buf-- = acc;
+            acc >>= 8;
+        }
+    }
+}
+
+static void
+fmt_big_le(byte *buf, Eterm val, Uint num_bits)
+{
+    unsigned long offs;
+    int sign;
+    Uint ds;
+    ErtsDigit* dp;
+    int n;
+    Uint num_bytes;
+
+    ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
+
+    num_bytes = NBYTES(num_bits);
+    sign = big_sign(val);
+    ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
+    dp = big_v(val);
+    n = MIN(num_bytes, ds);
+
+    offs = BIT_OFFSET(num_bits);
+    num_bytes -= n;         /* pad with this amount */
+    if (sign) {             /* negative */
+        unsigned int c = 0;
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for (i = 0; i < sizeof(ErtsDigit); i++) {
+                SUBc(d & 0xff, c, buf);
                 buf++;
+                d >>= 8;
             }
-        } else {                /* positive */
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for(i = 0; i < sizeof(ErtsDigit); i++) {
-                    *buf++ = d;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    *buf++ = d;
-                    d >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes) {
-                *buf++ = 0;
-                num_bytes--;
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                SUBc(d & 0xff, c, buf);
+                buf++;
+                d >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(0, c, buf);
+            buf++;
+        }
+    } else {                /* positive */
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for(i = 0; i < sizeof(ErtsDigit); i++) {
+                *buf++ = d;
+                d >>= 8;
             }
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                *buf++ = d;
+                d >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes) {
+            *buf++ = 0;
+            num_bytes--;
         }
+    }
 
-        /* adjust MSB */
-        if (offs) {
-            buf--;
-            *buf <<= (8 - offs);
+    /* adjust MSB */
+    if (offs) {
+        buf--;
+        *buf <<= (8 - offs);
+    }
+}
+
+#undef SUBc
+
+static void
+restore_and_shift(byte *buf, Uint orig_byte, Uint bit_offset, Uint num_bits)
+{
+    Uint rshift = bit_offset;
+    Uint lshift = 8 - bit_offset;
+    Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
+    Uint lmask = MAKE_MASK(8 - bit_offset);
+    Uint count = (num_bits - lshift) / 8;
+    Uint bits, bits1;
+
+    /* count above would wrap around if the field ended in the first byte. */
+    ASSERT(num_bits >= lshift);
+    bits = *buf;
+    bits1 = bits >> rshift;
+    *buf = MASK_BITS(bits1, orig_byte, lmask);
+    buf++;
+
+    while (count--) {
+        bits1 = bits << lshift;
+        bits = *buf;
+        *buf++ = bits1 | (bits >> rshift);
+    }
+
+    if (deoffs) {
+        Uint rmask = MAKE_MASK(deoffs) << (8 - deoffs);
+
+        bits1 = bits << lshift;
+        if ((rmask << rshift) & 0xff) {
+            bits = *buf;
+            bits1 |= (bits >> rshift);
         }
-    } else {   /* BIG ENDIAN */
-        ErtsDigit acc = 0;
-        ErtsDigit d;
+        *buf = MASK_BITS(bits1, *buf, rmask);
+    }
+}
 
-        buf += num_bytes - 1;       /* end of buffer */
-        num_bytes -= n;             /* pad with this amount */
-        offs = offs ? (8-offs) : 0; /* shift offset */
+int
+erts_bs_put_integer_be(ErlBitsState *EBS, Eterm arg, Uint num_bits)
+{
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint bin_offset = EBS->erts_bin_offset;
+    Uint bit_offset;
+    byte b;
+    byte *iptr;
 
-        if (sign) {             /* negative bignum */
-            int c = 1;
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
 
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
+    bit_offset = BIT_OFFSET(bin_offset);
+    if (is_small(arg)) {
+        Uint rbits = 8 - bit_offset;
 
-                d = *dp++;
-                acc |= d << offs;
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= ((ErtsDigit)*dp << offs);
-                do {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc >>= 8;
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_be(iptr, arg, num_bits);
             }
-        } else {                /* positive bignum */
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else {
+            /*
+             * Big-endian, more than one byte, but not aligned on a byte boundary.
+             * Handle the bits up to the next byte boundary specially,
+             * then let fmt_small_be() handle the rest.
+             */
+            Uint shift_count = num_bits - rbits;
+            Sint val = signed_val(arg);
 
-                d = *dp++;
-                acc |= d << offs;
-                *buf-- = acc;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    *buf-- = acc;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= (*dp << offs);
-                do {
-                    *buf-- = acc;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            while (num_bytes--) {
-                *buf-- = acc;
-                acc >>= 8;
+            ASSERT(num_bits > rbits);
+            b = *iptr & (0xff << rbits);
+
+            /*
+             * Shifting with a shift count greater than or equal to the word
+             * size may be a no-op (instead of 0 the result may be the unshifted
+             * value). Therefore, only do the shift and the OR if the shift count
+             * is less than the word size if the number is positive; if negative,
+             * we must simulate the sign extension.
+             */
+            if (shift_count < sizeof(Uint)*8) {
+                b |= (val >> shift_count) & MAKE_MASK(rbits);
+            } else if (val < 0) {
+                /* Simulate sign extension. */
+                b |= (-1) & MAKE_MASK(rbits);
             }
+            *iptr++ = b;
+
+            fmt_small_be(iptr, arg, shift_count);
         }
+    } else if (is_big(arg) && bit_offset == 0) {
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_be(iptr, arg, num_bits);
+    } else if (is_big(arg) && bit_offset + num_bits <= 8) {
+        /*
+         * All bits are in the same byte.
+         */
+        Uint rbits = 8 - bit_offset;
+        Sint sign = big_sign(arg);
+        ErtsDigit* dp = big_v(arg);
+        Uint val = sign ? -*dp : *dp;
+
+        b = *iptr & (0xff << rbits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
+        *iptr = b;
+    } else if (is_big(arg)) {
+        /*
+         * Big number, not aligned on a byte boundary.
+         *
+         * Format the integer byte-aligned using the binary itself as
+         * a temporary buffer.
+         */
+        b = *iptr;
+        fmt_big_be(iptr, arg, num_bits);
+
+        /*
+         * Now restore the overwritten bits of the first byte and
+         * shift everything to the right.
+         */
+        restore_and_shift(iptr, b, bit_offset, num_bits);
+    } else {
+        /* Not an integer. */
+        return 0;
     }
+    EBS->erts_bin_offset = bin_offset + num_bits;
+    return 1;
 }
 
 int
-erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flags))
+erts_bs_put_integer_le(ErlBitsState *EBS, Eterm arg, Uint num_bits)
 {
-    Uint bin_offset = erts_bin_offset;
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
-    Uint b;
+    byte b;
     byte *iptr;
 
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
+
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
     bit_offset = BIT_OFFSET(bin_offset);
     if (is_small(arg)) {
-	Uint rbits = 8 - bit_offset;
-
-	if (num_bits == 0) {
-	    return 1;
-	} else if (bit_offset + num_bits <= 8) {
-	    /*
-	     * All bits are in the same byte.
-	     */
-	    iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-	    b |= (signed_val(arg) & ((1 << num_bits)-1)) << (rbits-num_bits);
-	    *iptr = b;
-	} else if (bit_offset == 0) {
-	    /*
-	     * More than one bit, starting at a byte boundary.
-	     */
-            iptr = erts_current_bin + BYTE_OFFSET(bin_offset);
-            fmt_small(iptr, NBYTES(num_bits), arg, num_bits, flags);
-	} else if (flags & BSF_LITTLE) {
+        Uint rbits = 8 - bit_offset;
+
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_le(iptr, arg, num_bits);
+            }
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else if (BIT_OFFSET(num_bits) == 0) {
             /*
              * Little endian small in more than one byte, not
-             * aligned on a byte boundary.
+             * aligned on a byte boundary. The size is evenly
+             * divisible by 8, which means that there will be
+             * one partial byte, followed by zero or more
+             * complete bytes, followed by a final partial byte.
              */
             Sint val = signed_val(arg);
             Uint rshift = bit_offset;
@@ -771,92 +972,51 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
             Uint count = (num_bits - rbits) / 8;
             Uint bits, bits1;
 
-            iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
-
-            if (BIT_OFFSET(num_bits) == 0) {
-                bits = val;
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
+            /* Handle the first partial byte. */
+            bits = val;
+            bits1 = bits >> rshift;
+            *iptr = MASK_BITS(bits1, *iptr, lmask);
+            iptr++;
+            val >>= 8;
+
+            /* Handle all complete bytes. */
+            while (count--) {
+                bits1 = bits << lshift;
+                bits = val & 0xff;
+                *iptr++ = bits1 | (bits >> rshift);
                 val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    bits = val & 0xff;
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                *iptr = bits << lshift;
-            } else {
-                Sint num_bytes = NBYTES(num_bits) - 1;
-                Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-
-                if (num_bytes-- > 0) {
-                    bits = val;
-                } else {
-                    bits = (val << (8 - BIT_OFFSET(num_bits)));
-                }
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
-                val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    if (num_bytes-- > 0) {
-                        bits = val & 0xff;
-                    } else {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                    }
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                if (deoffs) {
-                    bits1 = bits << lshift;
-                    if (rshift < deoffs) {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                        bits1 |= bits >> rshift;
-                    }
-                    *iptr = bits1;
-                }
             }
-	} else {		/* Big endian */
-	    /*
-	     * Big-endian, more than one byte, but not aligned on a byte boundary.
-	     * Handle the bits up to the next byte boundary specially,
-	     * then let fmt_int() handle the rest.
-	     */
-	    Uint shift_count = num_bits - rbits;
-	    Sint val = signed_val(arg);
-	    iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-
-	    /*
-	     * Shifting with a shift count greater than or equal to the word
-	     * size may be a no-op (instead of 0 the result may be the unshifted
-	     * value). Therefore, only do the shift and the OR if the shift count
-	     * is less than the word size if the number is positive; if negative,
-	     * we must simulate the sign extension.
-	     */
-	    if (shift_count < sizeof(Uint)*8) {
-		b |= (val >> shift_count) & ((1 << rbits) - 1);
-	    } else if (val < 0) {
-		/* Simulate sign extension. */
-		b |= (-1) & ((1 << rbits) - 1);
-	    }
-	    *iptr++ = b;
 
-            fmt_small(iptr, NBYTES(num_bits-rbits), arg, num_bits-rbits, flags);
-	}
+            /* Handle the final partial byte. */
+            *iptr = bits << lshift;
+        } else {
+            /*
+             * Little endian small in more than one byte, not aligned
+             * on a byte boundary, and the size is not evenly
+             * divisible by 8.
+             *
+             * Now this gets complicated. We used to handle this
+             * directly, but since this case is presumably uncommon,
+             * we do this in a simpler way in two steps.
+             *
+             * First format the integer byte-aligned using the binary
+             * itself as a temporary buffer.
+             */
+            b = *iptr;
+            fmt_small_le(iptr, arg, num_bits);
+
+            /*
+             * Now restore the overwritten bits of the first byte and
+             * shift everything to the right.
+             */
+            restore_and_shift(iptr, b, bit_offset, num_bits);
+        }
     } else if (is_big(arg) && bit_offset == 0) {
-	/*
-	 * Big number, aligned on a byte boundary. We can format the
-	 * integer directly into the binary.
-	 */
-	fmt_big(erts_current_bin+BYTE_OFFSET(bin_offset),
-                NBYTES(num_bits), arg, num_bits, flags);
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_le(iptr, arg, num_bits);
     } else if (is_big(arg) && bit_offset + num_bits <= 8) {
         /*
          * All bits are in the same byte.
@@ -866,68 +1026,37 @@ erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm arg, Uint num_bits, unsigned flag
         ErtsDigit* dp = big_v(arg);
         Uint val = sign ? -*dp : *dp;
 
-        iptr = erts_current_bin+BYTE_OFFSET(bin_offset);
         b = *iptr & (0xff << rbits);
-        b |= (val & ((1 << num_bits)-1)) << (rbits-num_bits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
         *iptr = b;
     } else if (is_big(arg)) {
         /*
          * Big number, not aligned on a byte boundary.
-         */
-        Uint rshift = bit_offset;
-        Uint lshift = 8 - bit_offset;
-        Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-        Uint lmask = MAKE_MASK(8 - bit_offset);
-        Uint rmask = (deoffs) ? (MAKE_MASK(deoffs)<<(8-deoffs)) : 0;
-        Uint count = (num_bits - lshift) / 8;
-        Uint bits, bits1;
-
-        ASSERT(num_bits - lshift >= 0);
-
-        /*
+         *
          * Format the integer byte-aligned using the binary itself as
          * a temporary buffer.
          */
-        iptr = erts_current_bin + BYTE_OFFSET(bin_offset);
         b = *iptr;
-        fmt_big(iptr, NBYTES(num_bits), arg, num_bits, flags);
+        fmt_big_le(iptr, arg, num_bits);
 
         /*
          * Now restore the overwritten bits of the first byte and
          * shift everything to the right.
          */
-        bits = *iptr;
-        bits1 = bits >> rshift;
-        *iptr = MASK_BITS(bits1, b, lmask);
-        iptr++;
-
-        while (count--) {
-            bits1 = bits << lshift;
-            bits = *iptr;
-            *iptr++ = bits1 | (bits >> rshift);
-        }
-
-        if (rmask) {
-            bits1 = bits << lshift;
-            if ((rmask << rshift) & 0xff) {
-                bits = *iptr;
-                bits1 |= (bits >> rshift);
-            }
-            *iptr = MASK_BITS(bits1, *iptr, rmask);
-        }
+        restore_and_shift(iptr, b, bit_offset, num_bits);
     } else {
         /* Not an integer. */
         return 0;
     }
-    erts_bin_offset = bin_offset + num_bits;
+    EBS->erts_bin_offset = bin_offset + num_bits;
     return 1;
 }
 
 #if !defined(BEAMASM)
 int
-erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
+erts_bs_put_utf8(ErlBitsState *EBS, Eterm arg)
 {
-    Uint bin_offset = erts_bin_offset;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
     Uint num_bits;
     byte tmp_buf[4];
@@ -944,7 +1073,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
 
     if ((bit_offset = BIT_OFFSET(bin_offset)) == 0) {
 	/* We can write directly into the destination binary. */
-	dst = erts_current_bin+BYTE_OFFSET(bin_offset);
+        dst = EBS->erts_current_bin + BYTE_OFFSET(bin_offset);
     } else {
 	/* Unaligned destination binary. Must use a temporary buffer. */
 	dst = tmp_buf;
@@ -975,19 +1104,19 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
     }
 
     if (bin_offset != 0) {
-	erts_copy_bits(dst, 0, 1, erts_current_bin, bin_offset, 1, num_bits);
+        erts_copy_bits_fwd(dst, 0, EBS->erts_current_bin, bin_offset, num_bits);
     }
 
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
 
     return 1;
 }
 #endif
 
 int
-erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
+erts_bs_put_utf16(ErlBitsState *EBS, Eterm arg, Uint flags)
 {
-    Uint bin_offset = erts_bin_offset;
+    Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
     Uint num_bits;
     byte tmp_buf[4];
@@ -1004,7 +1133,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
 
     if ((bit_offset = BIT_OFFSET(bin_offset)) == 0) {
 	/* We can write directly into the destination binary. */
-	dst = erts_current_bin+BYTE_OFFSET(bin_offset);
+	dst = EBS->erts_current_bin + BYTE_OFFSET(bin_offset);
     } else {
 	/* Unaligned destination binary. Must use a temporary buffer. */
 	dst = tmp_buf;
@@ -1040,17 +1169,16 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
     }
 
     if (bin_offset != 0) {
-	erts_copy_bits(dst, 0, 1, erts_current_bin, bin_offset, 1, num_bits);
+	erts_copy_bits_fwd(dst, 0, EBS->erts_current_bin, bin_offset, num_bits);
     }
 
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
     return 1;
 }
 
 int
-erts_new_bs_put_binary(Process *c_p, Eterm arg, Uint num_bits)
+erts_bs_put_binary(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
     Uint offset, size;
     byte *base;
 
@@ -1066,18 +1194,17 @@ erts_new_bs_put_binary(Process *c_p, Eterm arg, Uint num_bits)
         return 0;
     }
 
-    copy_binary_to_buffer(erts_current_bin, erts_bin_offset,
+    copy_binary_to_buffer(EBS->erts_current_bin, EBS->erts_bin_offset,
                           base, offset, num_bits);
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
 
     BUMP_REDS(c_p, num_bits / BITS_PER_REDUCTION);
     return 1;
 }
 
 int
-erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
+erts_bs_put_binary_all(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint unit)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
     Uint offset, size;
     byte *base;
 
@@ -1091,9 +1218,9 @@ erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
         return 0;
     }
 
-    copy_binary_to_buffer(erts_current_bin, erts_bin_offset,
+    copy_binary_to_buffer(EBS->erts_current_bin, EBS->erts_bin_offset,
                           base, offset, size);
-    erts_bin_offset += size;
+    EBS->erts_bin_offset += size;
 
     BUMP_REDS(c_p, size / BITS_PER_REDUCTION);
     return 1;
@@ -1106,11 +1233,9 @@ erts_new_bs_put_binary_all(Process *c_p, Eterm arg, Uint unit)
  * and sets c_p-fvalue to 'type', 'no_float', or 'invalid'.
  */
 Eterm
-erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
+erts_bs_put_float(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits, int flags)
 {
-    ERL_BITS_DEFINE_STATEP(c_p);
-
-    if (BIT_OFFSET(erts_bin_offset) == 0) {
+    if (BIT_OFFSET(EBS->erts_bin_offset) == 0) {
 	Uint32 a;
 	Uint32 b;
 	
@@ -1222,7 +1347,7 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	}
 
 	if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	    byte* t = erts_current_bin+BYTE_OFFSET(erts_bin_offset);
+	    byte* t = EBS->erts_current_bin + BYTE_OFFSET(EBS->erts_bin_offset);
 #ifdef WORDS_BIGENDIAN
 	    if (num_bits == 16) {
 		t[0] = a >> 8;
@@ -1255,7 +1380,9 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	    }
 #endif
 	} else {
-	    byte* t = erts_current_bin+BYTE_OFFSET(erts_bin_offset) + NBYTES(num_bits);
+	    byte* t = EBS->erts_current_bin +
+                BYTE_OFFSET(EBS->erts_bin_offset) +
+                NBYTES(num_bits);
 #ifdef WORDS_BIGENDIAN
 	    if (num_bits == 16) {
 		t[-1] = a >> 8;
@@ -1384,28 +1511,41 @@ erts_new_bs_put_float(Process *c_p, Eterm arg, Uint num_bits, int flags)
 	    return make_small(num_bits);
 	}
 	if (BIT_IS_MACHINE_ENDIAN(flags)) {
-	    erts_copy_bits(bptr, 0, 1,
-		      erts_current_bin,
-		      erts_bin_offset, 1, num_bits);
+            erts_copy_bits_fwd(bptr, 0,
+                               EBS->erts_current_bin,
+                               EBS->erts_bin_offset, num_bits);
 	} else {
-	    erts_copy_bits(bptr+NBYTES(num_bits)-1, 0, -1,
-			   erts_current_bin, erts_bin_offset, 1,
-			   num_bits);
+            byte tmp_buf[8];
+            Uint n = BYTE_OFFSET(num_bits);
+            byte *dst = tmp_buf + n;
+
+            do {
+                *--dst = *bptr++;
+            } while (--n != 0);
+
+            erts_copy_bits_fwd(tmp_buf, 0,
+                               EBS->erts_current_bin,
+                               EBS->erts_bin_offset, num_bits);
 	}
     }
-    erts_bin_offset += num_bits;
+    EBS->erts_bin_offset += num_bits;
     return THE_NON_VALUE;
 }
 
-void 
-erts_new_bs_put_string(ERL_BITS_PROTO_2(byte* iptr, Uint num_bytes))
+void
+erts_bs_put_string(ErlBitsState* EBS, byte* iptr, Uint num_bytes)
 {
-    if (BIT_OFFSET(erts_bin_offset) != 0) {
-	erts_copy_bits(iptr, 0, 1, erts_current_bin, erts_bin_offset, 1, num_bytes*8);
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint dst_offset = EBS->erts_bin_offset;
+
+    EBS->erts_bin_offset = dst_offset + num_bytes * 8;
+    if (BIT_OFFSET(dst_offset) != 0) {
+        erts_copy_bits_fwd(iptr, 0,
+                           dst_bin, dst_offset,
+                           num_bytes * 8);
     } else {
-	sys_memcpy(erts_current_bin+BYTE_OFFSET(erts_bin_offset), iptr, num_bytes);
+        sys_memcpy(dst_bin + BYTE_OFFSET(dst_offset), iptr, num_bytes);
     }
-    erts_bin_offset += num_bytes*8;
 }
 
 static ERTS_INLINE
@@ -1488,8 +1628,9 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
     BinRef* br;
     Binary* binp;
     Uint heap_need;
+    Uint position;
     Uint used_size_in_bits;
-    ERL_BITS_DEFINE_STATEP(c_p);
+    ErlBitsState* EBS = ERL_BITS_EBS_FROM_REG(reg);
 
     /*
      * Check the binary argument.
@@ -1528,10 +1669,10 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
      * OK, the binary is writable.
      */
     ASSERT(sb->start == 0);
-    erts_bin_offset = sb->end;
+    EBS->erts_bin_offset = position = sb->end;
     if (unit > 1) {
-	if ((unit == 8 && (erts_bin_offset & 7) != 0) ||
-	    (unit != 8 && (erts_bin_offset % unit) != 0)) {
+	if ((unit == 8 && (position & 7) != 0) ||
+	    (unit != 8 && (position % unit) != 0)) {
             c_p->fvalue = am_unit;
 	    goto badarg;
 	}
@@ -1548,13 +1689,13 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
 	return bin;
     }
 
-    if((ERTS_UINT_MAX - build_size_in_bits) < erts_bin_offset) {
+    if ((ERTS_UINT_MAX - build_size_in_bits) < position) {
         c_p->fvalue = am_size;
         c_p->freason = SYSTEM_LIMIT;
         return THE_NON_VALUE;
     }
 
-    used_size_in_bits = erts_bin_offset + build_size_in_bits;
+    used_size_in_bits = position + build_size_in_bits;
 
     /* Make sure that no one else can append to the incoming bitstring. */
     erl_sub_bits_clear_writable(sb);
@@ -1569,11 +1710,11 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
         binp = erts_bin_realloc(binp, new_size);
         br->val = binp;
 
-        BUMP_REDS(c_p, erts_bin_offset / BITS_PER_REDUCTION);
+        BUMP_REDS(c_p, position / BITS_PER_REDUCTION);
     }
 
     binp->intern.apparent_size = NBYTES(used_size_in_bits);
-    erts_current_bin = (byte*)binp->orig_bytes;
+    EBS->erts_current_bin = (byte*)binp->orig_bytes;
 
     /* Allocate heap space and build a new sub binary. */
     reg[live] = sb->orig;
@@ -1592,7 +1733,7 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
     erl_sub_bits_init(sb,
                       ERL_SUB_BITS_FLAGS_WRITABLE,
                       reg[live],
-                      erts_current_bin,
+                      EBS->erts_current_bin,
                       0,
                       used_size_in_bits);
 
@@ -1656,10 +1797,10 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
                                  &br,
                                  &sb);
 
-        erts_current_bin = (byte*)(br->val)->orig_bytes;
-        erts_bin_offset = src_size;
+        EBS->erts_current_bin = (byte*)(br->val)->orig_bytes;
+        EBS->erts_bin_offset = src_size;
 
-        copy_binary_to_buffer(erts_current_bin,
+        copy_binary_to_buffer(EBS->erts_current_bin,
                               0,
                               src_bytes,
                               src_offset,
@@ -1671,15 +1812,14 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
 }
 
 Eterm
-erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, Uint unit)
+erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
+                               Eterm bin, Uint build_size_in_bits)
 {
-    Uint new_position, new_size, used_size;
+    Uint old_position, new_position, used_size;
     Binary *refc_binary;
     ErlSubBits *sb;
     BinRef *br;
 
-    ERL_BITS_DEFINE_STATEP(p);
-
     sb = (ErlSubBits*)bitstring_val(bin);
     ASSERT(sb->thing_word == HEADER_SUB_BITS);
 
@@ -1688,21 +1828,24 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
 
     /* Calculate new size in bits. */
     ASSERT(sb->start == 0);
-    erts_bin_offset = sb->end;
+    EBS->erts_bin_offset = old_position = sb->end;
 
-    if((ERTS_UINT_MAX - build_size_in_bits) < erts_bin_offset) {
+#ifdef BEAMASM
+    ASSERT(ERTS_UINT_MAX - build_size_in_bits >= old_position);
+#else
+    if (ERTS_UINT_MAX - build_size_in_bits < old_position) {
         p->fvalue = am_size;
         p->freason = SYSTEM_LIMIT;
         return THE_NON_VALUE;
     }
+#endif
 
     refc_binary = br->val;
 
-    new_position = erts_bin_offset + build_size_in_bits;
-    update_wb_overhead(p, br, sb->end, new_position);
-
+    new_position = old_position + build_size_in_bits;
     used_size = NBYTES(new_position);
-    new_size = GROW_PROC_BIN_SIZE(used_size);
+
+    update_wb_overhead(p, br, old_position, new_position);
 
     if (refc_binary->intern.flags & BIN_FLAG_WRITABLE) {
         /* This is the normal case - the binary is writable. There are no other
@@ -1711,10 +1854,11 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
         ASSERT(erl_sub_bits_is_writable(sb));
         ASSERT(erts_refc_read(&refc_binary->intern.refc, 1) == 1);
         if (refc_binary->orig_size < used_size) {
+            Uint new_size = GROW_PROC_BIN_SIZE(used_size);
             refc_binary = erts_bin_realloc(refc_binary, new_size);
             br->val = refc_binary;
 
-            BUMP_REDS(p, erts_bin_offset / BITS_PER_REDUCTION);
+            BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
         }
 
         ASSERT(sb->start == 0);
@@ -1730,10 +1874,11 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
          * binary and make a copy of the data.
          *
          * We'll also make a new BinRef as the old one may have been moved from
-         * the `wrt_bins` list to the regular `off_heap` list by the GC. To
+         * the `wrt_bins` list to the regular `off_heap` list by the GC.
          * To move it back would mean traversing the `off_heap` list from the
          * start, so we'll create a new BinRef instead for this (hopefully)
          * rare case. */
+        Uint new_size = GROW_PROC_BIN_SIZE(used_size);
         Binary *new_binary = erts_bin_nrml_alloc(new_size);
         Eterm *hp = HeapFragOnlyAlloc(p, ERL_REFC_BITS_SIZE);
 
@@ -1749,12 +1894,12 @@ erts_bs_private_append_checked(Process* p, Eterm bin, Uint build_size_in_bits, U
                    refc_binary->orig_bytes,
                    MIN(refc_binary->orig_size, new_size));
 
-        BUMP_REDS(p, erts_bin_offset / BITS_PER_REDUCTION);
+        BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
         refc_binary = new_binary;
     }
 
     ASSERT(refc_binary->intern.flags & BIN_FLAG_WRITABLE);
-    erts_current_bin = (byte*)&refc_binary->orig_bytes[0];
+    EBS->erts_current_bin = (byte*)&refc_binary->orig_bytes[0];
 
     return make_bitstring(sb);
 }
@@ -1829,7 +1974,7 @@ erts_bs_get_unaligned_uint32(ErlSubBits* sb)
     byte bigbuf[4];
     byte* LSB;
     byte* MSB;
-	
+
     CHECK_MATCH_BUFFER(sb);
     ASSERT((sb->start & 7) != 0);
     ASSERT(sb->end - sb->start >= 32);
@@ -1841,7 +1986,7 @@ erts_bs_get_unaligned_uint32(ErlSubBits* sb)
     MSB = LSB + bytes - 1;
 
     *MSB = 0;
-    erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1, MSB, offs, -1, 32);
+    erts_copy_bits_rev(erl_sub_bits_get_base(sb), sb->start, MSB, offs, 32);
     return LSB[0] | (LSB[1]<<8) | (LSB[2]<<16) | (LSB[3]<<24);
 }
 
@@ -1864,7 +2009,7 @@ erts_align_utf8_bytes(ErlSubBits *sb, byte* buf)
     } else {
 	bits = 16;
     }
-    erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1, buf, 0, 1, bits);
+    erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start, buf, 0, bits);
 }
 
 Eterm
@@ -1989,8 +2134,8 @@ erts_bs_get_utf16(ErlSubBits *sb, Uint flags)
 	 * get 4 bytes, otherwise two bytes.
 	 */
 	Uint n = num_bits < 32 ? 16 : 32;
-	erts_copy_bits(erl_sub_bits_get_base(sb), sb->start, 1,
-                       tmp_buf, 0, 1, n);
+        erts_copy_bits_fwd(erl_sub_bits_get_base(sb), sb->start,
+                           tmp_buf, 0, n);
 	src = tmp_buf;
     }
     
@@ -2124,18 +2269,141 @@ int erts_cmp_bits__(const byte *a_ptr,
 
 /*
  * The basic bit copy operation. Copies n bits from the source buffer to
- * the destination buffer. Depending on the directions, it can reverse the
- * copied bits.
+ * the destination buffer.
  */
 
+void
+erts_copy_bits_fwd(const byte* src, /* Base pointer to source. */
+                   size_t soffs,    /* Bit offset for source relative to src. */
+                   byte* dst,       /* Base pointer to destination. */
+                   size_t doffs,    /* Bit offset for destination relative to dst. */
+                   size_t n)        /* Number of bits to copy. */
+{
+    Uint lmask;
+    Uint rmask;
+    Uint count;
+    Uint deoffs;
+
+    if (n == 0) {
+        return;
+    }
+
+    src += BYTE_OFFSET(soffs);
+    dst += BYTE_OFFSET(doffs);
+    soffs = BIT_OFFSET(soffs);
+    doffs = BIT_OFFSET(doffs);
+    deoffs = BIT_OFFSET(doffs+n);
+    lmask = (doffs) ? MAKE_MASK(8-doffs) : 0;
+    rmask = (deoffs) ? (MAKE_MASK(deoffs)<<(8-deoffs)) : 0;
+
+    /*
+     * Take care of the case that all bits are in the same byte.
+     */
+
+    if (doffs+n < 8) {		/* All bits are in the same byte */
+        lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
+
+        if (soffs == doffs) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+        } else if (soffs > doffs) {
+            Uint bits = (*src << (soffs-doffs));
+            if (soffs+n > 8) {
+                src++;
+                bits |= (*src >> (8-(soffs-doffs)));
+            }
+            *dst = MASK_BITS(bits, *dst, lmask);
+        } else {
+            *dst = MASK_BITS((*src >> (doffs-soffs)), *dst, lmask);
+        }
+        return;			/* We are done! */
+    }
+
+    /*
+     * At this point, we know that the bits are in 2 or more bytes.
+     */
+
+    count = (lmask ? (n - (8 - doffs)) : n) >> 3;
+
+    if (soffs == doffs) {
+        /*
+         * The bits are aligned in the same way. We can just copy the bytes
+         * (except for the first and last bytes).
+         */
+
+        if (lmask) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+            dst++, src++;
+        }
+
+        sys_memcpy(dst, src, count);
 
-void 
-erts_copy_bits(const byte* src, /* Base pointer to source. */
+        if (rmask) {
+            dst += count;
+            src += count;
+            *dst = MASK_BITS(*src, *dst, rmask);
+        }
+    } else {
+        Uint bits;
+        Uint bits1;
+        Uint rshift;
+        Uint lshift;
+
+        /*
+         * The tricky case. The bits must be shifted into position.
+         */
+
+        if (soffs > doffs) {
+            lshift = soffs - doffs;
+            rshift = 8 - lshift;
+            bits = *src;
+            if (soffs + n > 8) {
+                src++;
+            }
+        } else {
+            rshift = doffs - soffs;
+            lshift = 8 - rshift;
+            bits = 0;
+        }
+
+        if (lmask) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            bits1 |= (bits >> rshift);
+            *dst = MASK_BITS(bits1, *dst, lmask);
+            dst++;
+        }
+
+        while (count--) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            *dst = bits1 | (bits >> rshift);
+            dst++;
+        }
+
+        if (rmask) {
+            bits1 = bits << lshift;
+            if ((rmask << rshift) & 0xff) {
+                bits = *src;
+                bits1 |= (bits >> rshift);
+            }
+            *dst = MASK_BITS(bits1, *dst, rmask);
+        }
+    }
+}
+
+/*
+ * The reverse bit copy operation. Copies n bits from the source
+ * buffer to the destination buffer. The bits are read 8 bits at a
+ * time from the source buffer, while incrementing the source buffer
+ * pointer. The 8 bit groups are stored into the destination buffer,
+ * while decrementing the destination buffer pointer.
+ */
+
+void
+erts_copy_bits_rev(const byte* src, /* Base pointer to source. */
                size_t soffs,    /* Bit offset for source relative to src. */
-               int sdir,        /* Direction: 1 (forward) or -1 (backward). */
                byte* dst,       /* Base pointer to destination. */
                size_t doffs,    /* Bit offset for destination relative to dst. */
-               int ddir,        /* Direction: 1 (forward) or -1 (backward). */
                size_t n)        /* Number of bits to copy. */
 {
     Uint lmask;
@@ -2144,11 +2412,11 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
     Uint deoffs;
 
     if (n == 0) {
-	return;
+        return;
     }
 
-    src += sdir*BYTE_OFFSET(soffs);
-    dst += ddir*BYTE_OFFSET(doffs);
+    src += BYTE_OFFSET(soffs);
+    dst -= BYTE_OFFSET(doffs);
     soffs = BIT_OFFSET(soffs);
     doffs = BIT_OFFSET(doffs);
     deoffs = BIT_OFFSET(doffs+n);
@@ -2160,21 +2428,21 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
      */
 
     if (doffs+n < 8) {		/* All bits are in the same byte */
-	lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
-
-	if (soffs == doffs) {
-	    *dst = MASK_BITS(*src,*dst,lmask);
-	} else if (soffs > doffs) {
-	    Uint bits = (*src << (soffs-doffs));
-	    if (soffs+n > 8) {
-		src += sdir;
-		bits |= (*src >> (8-(soffs-doffs)));
-	    }
-	    *dst = MASK_BITS(bits,*dst,lmask);
-	} else {
-	    *dst = MASK_BITS((*src >> (doffs-soffs)),*dst,lmask);
-	}
-	return;			/* We are done! */
+        lmask = (lmask & rmask) ? (lmask & rmask) : (lmask | rmask);
+
+        if (soffs == doffs) {
+            *dst = MASK_BITS(*src,*dst,lmask);
+        } else if (soffs > doffs) {
+            Uint bits = (*src << (soffs-doffs));
+            if (soffs+n > 8) {
+                src++;
+                bits |= (*src >> (8-(soffs-doffs)));
+            }
+            *dst = MASK_BITS(bits,*dst,lmask);
+        } else {
+            *dst = MASK_BITS((*src >> (doffs-soffs)),*dst,lmask);
+        }
+        return;			/* We are done! */
     }
 
     /*
@@ -2184,75 +2452,70 @@ erts_copy_bits(const byte* src, /* Base pointer to source. */
     count = ((lmask) ? (n - (8 - doffs)) : n) >> 3;
 
     if (soffs == doffs) {
-	/*
-	 * The bits are aligned in the same way. We can just copy the bytes
-	 * (except for the first and last bytes). Note that the directions
-	 * might be different, so we can't just use memcpy().
-	 */
+        /*
+         * The bits are aligned in the same way. We can just copy the bytes
+         * (except for the first and last bytes).
+         */
 
-	if (lmask) {
-	    *dst = MASK_BITS(*src, *dst, lmask);
-	    dst += ddir;
-	    src += sdir;
-	}
+        if (lmask) {
+            *dst = MASK_BITS(*src, *dst, lmask);
+            dst--, src++;
+        }
 
-	while (count--) {
-	    *dst = *src;
-	    dst += ddir;
-	    src += sdir;
-	}
+        while (count--) {
+            *dst-- = *src++;
+        }
 
-	if (rmask) {
-	    *dst = MASK_BITS(*src,*dst,rmask);
-	}
+        if (rmask) {
+            *dst = MASK_BITS(*src, *dst, rmask);
+        }
     } else {
-	Uint bits;
-	Uint bits1;
-	Uint rshift;
-	Uint lshift;
+        Uint bits;
+        Uint bits1;
+        Uint rshift;
+        Uint lshift;
 
-	/*
-	 * The tricky case. The bits must be shifted into position.
-	 */
-	
-	if (soffs > doffs) {
-	    lshift = (soffs - doffs);
-	    rshift = 8 - lshift;
-	    bits = *src;
-	    if (soffs + n > 8) {
-		src += sdir;
-	    }
-	} else {
-	    rshift = (doffs - soffs);
-	    lshift = 8 - rshift;
-	    bits = 0;
-	}
-	    
-	if (lmask) {
-	    bits1 = bits << lshift;
-	    bits = *src;
-	    src += sdir;
-	    bits1 |= (bits >> rshift);
-	    *dst = MASK_BITS(bits1,*dst,lmask);
-	    dst += ddir;
-	}
+        /*
+         * The tricky case. The bits must be shifted into position.
+         */
 
-	while (count--) {
-	    bits1 = bits << lshift;
-	    bits = *src;
-	    src += sdir;
-	    *dst = bits1 | (bits >> rshift);
-	    dst += ddir;
-	}
-	
-	if (rmask) {
-	    bits1 = bits << lshift;
-	    if ((rmask << rshift) & 0xff) {
-		bits = *src;
-		bits1 |= (bits >> rshift);
-	    }
-	    *dst = MASK_BITS(bits1,*dst,rmask);
-	}
+        if (soffs > doffs) {
+            lshift = (soffs - doffs);
+            rshift = 8 - lshift;
+            bits = *src;
+            if (soffs + n > 8) {
+                src++;
+            }
+        } else {
+            rshift = doffs - soffs;
+            lshift = 8 - rshift;
+            bits = 0;
+        }
+
+        if (lmask) {
+            bits1 = bits << lshift;
+            bits = *src;
+            src++;
+            bits1 |= (bits >> rshift);
+            *dst = MASK_BITS(bits1, *dst, lmask);
+            dst--;
+        }
+
+        while (count--) {
+            bits1 = bits << lshift;
+            bits = *src++;
+            *dst = bits1 | (bits >> rshift);
+            dst--;
+        }
+
+        if (rmask) {
+            bits1 = bits << lshift;
+            if ((rmask << rshift) & 0xff) {
+                bits = *src;
+                bits1 |= (bits >> rshift);
+            }
+            *dst = MASK_BITS(bits1, *dst, rmask);
+        }
     }
 }
 
diff --git a/erts/emulator/beam/erl_bits.h b/erts/emulator/beam/erl_bits.h
index 619f6157e4ec..6d4200e4a2b1 100644
--- a/erts/emulator/beam/erl_bits.h
+++ b/erts/emulator/beam/erl_bits.h
@@ -168,43 +168,28 @@ struct erl_bits_state {
     /*
      * Pointer to the beginning of the current binary.
      */
-    byte* erts_current_bin_;
+    byte* erts_current_bin;
 
     /*
      * Offset in bits into the current binary.
      */
-    Uint erts_bin_offset_;
+    Uint erts_bin_offset;
 };
 
+typedef struct erl_bits_state ErlBitsState;
+
 /*
- * Reentrant API with the state passed as a parameter.
- * (Except when the current Process* already is a parameter.)
+ * The bit syntax construction state resides in the current process's
+ * scheduler data. The following macro retrieves the pointer to that
+ * state given a pointer to the X register array.
  */
-/* the state resides in the current process' scheduler data */
-#define ERL_BITS_DECLARE_STATEP struct erl_bits_state *EBS
-
-#define ERL_BITS_RELOAD_STATEP(P)                                              \
-    do {                                                                       \
-        EBS = &erts_proc_sched_data((P))->registers->aux_regs.d.erl_bits_state;  \
-    } while(0)
-
-#define ERL_BITS_DEFINE_STATEP(P) \
-    struct erl_bits_state *EBS = \
-        &erts_proc_sched_data((P))->registers->aux_regs.d.erl_bits_state
 
-#define ErlBitsState				(*EBS)
-
-#define ERL_BITS_PROTO_0			struct erl_bits_state *EBS
-#define ERL_BITS_PROTO_1(PARM1)			struct erl_bits_state *EBS, PARM1
-#define ERL_BITS_PROTO_2(PARM1,PARM2)		struct erl_bits_state *EBS, PARM1, PARM2
-#define ERL_BITS_PROTO_3(PARM1,PARM2,PARM3)	struct erl_bits_state *EBS, PARM1, PARM2, PARM3
-#define ERL_BITS_ARGS_0				EBS
-#define ERL_BITS_ARGS_1(ARG1)			EBS, ARG1
-#define ERL_BITS_ARGS_2(ARG1,ARG2)		EBS, ARG1, ARG2
-#define ERL_BITS_ARGS_3(ARG1,ARG2,ARG3)		EBS, ARG1, ARG2, ARG3
-
-#define erts_bin_offset		(ErlBitsState.erts_bin_offset_)
-#define erts_current_bin	(ErlBitsState.erts_current_bin_)
+#define ERL_BITS_EBS_FROM_REG(Reg)                              \
+    ((ErlBitsState *) ((char *)(Reg) +                          \
+                       (offsetof(ErtsSchedulerRegisters,        \
+                                 aux_regs.d.erl_bits_state) -   \
+                        offsetof(ErtsSchedulerRegisters,        \
+                                 x_reg_array.d))))
 
 /*
  * Return number of Eterm words needed for allocation with HAlloc(),
@@ -231,22 +216,25 @@ Eterm erts_bs_get_binary_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits
 Eterm erts_bs_get_binary_all_2(Process *p, ErlSubBits* sb);
 
 /* Binary construction, new instruction set. */
-int erts_new_bs_put_integer(ERL_BITS_PROTO_3(Eterm Integer, Uint num_bits, unsigned flags));
+int erts_bs_put_integer_be(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
+int erts_bs_put_integer_le(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
 #if !defined(BEAMASM)
-int erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm Integer));
+int erts_bs_put_utf8(ErlBitsState *EBS, Eterm Integer);
 #endif
-int erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm Integer, Uint flags));
-int erts_new_bs_put_binary(Process *c_p, Eterm Bin, Uint num_bits);
-int erts_new_bs_put_binary_all(Process *c_p, Eterm Bin, Uint unit);
-Eterm erts_new_bs_put_float(Process *c_p, Eterm Float, Uint num_bits, int flags);
-void erts_new_bs_put_string(ERL_BITS_PROTO_2(byte* iptr, Uint num_bytes));
+int erts_bs_put_utf16(ErlBitsState *EBS, Eterm Integer, Uint flags);
+int erts_bs_put_binary(ErlBitsState *EBS, Process *c_p, Eterm Bin, Uint num_bits);
+int erts_bs_put_binary_all(ErlBitsState* EBS, Process *c_p, Eterm Bin, Uint unit);
+Eterm erts_bs_put_float(ErlBitsState *EBS, Process *c_p, Eterm Float,
+                        Uint num_bits, int flags);
+void erts_bs_put_string(ErlBitsState *EBS, byte* iptr, Uint num_bytes);
 
 Uint32 erts_bs_get_unaligned_uint32(ErlSubBits* sb);
 Eterm erts_bs_get_utf8(ErlSubBits* sb);
 Eterm erts_bs_get_utf16(ErlSubBits* sb, Uint flags);
 Eterm erts_bs_append_checked(Process* p, Eterm* reg, Uint live, Uint size,
                              Uint extra_words, Uint unit);
-Eterm erts_bs_private_append_checked(Process* p, Eterm bin, Uint size, Uint unit);
+Eterm erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
+                                     Eterm bin, Uint size);
 Eterm erts_bs_init_writable(Process* p, Eterm sz);
 
 /* ************************************************************************* */
@@ -257,8 +245,10 @@ copy_binary_to_buffer(byte *dst_base, Uint dst_offset,
                       const byte *src_base, Uint src_offset,
                       Uint size);
 
-void erts_copy_bits(const byte* src, size_t soffs, int sdir,
-                    byte* dst, size_t doffs, int ddir, size_t n);
+void erts_copy_bits_fwd(const byte* src, size_t soffs,
+                        byte* dst, size_t doffs, size_t n);
+void erts_copy_bits_rev(const byte* src, size_t soffs,
+                        byte* dst, size_t doffs, size_t n);
 
 ERTS_GLB_INLINE int erts_cmp_bits(const byte* a_ptr,
                                   Uint a_offs,
@@ -544,9 +534,9 @@ copy_binary_to_buffer(byte *dst_base, Uint dst_offset,
         if (((dst_offset | src_offset | size) & 7) == 0) {
             sys_memcpy(dst_base, src_base, BYTE_SIZE(size));
         } else {
-            erts_copy_bits(src_base, BIT_OFFSET(src_offset), 1,
-                           dst_base, BIT_OFFSET(dst_offset), 1,
-                           size);
+            erts_copy_bits_fwd(src_base, BIT_OFFSET(src_offset),
+                               dst_base, BIT_OFFSET(dst_offset),
+                               size);
         }
     }
 }
@@ -618,7 +608,7 @@ erts_get_aligned_binary_bytes_extra(Eterm bin,
                                                 NBYTES(size) + extra);
                 *base_ptr = bytes;
 
-                erts_copy_bits(base, offset, 1, &bytes[extra], 0, 1, size);
+                erts_copy_bits_fwd(base, offset, &bytes[extra], 0, size);
                 return &bytes[extra];
             }
 
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 2563d40ad563..0e554a336a99 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -1352,7 +1352,7 @@ int enif_inspect_binary(ErlNifEnv* env, Eterm bin_term, ErlNifBinary* bin)
                 env->tmp_obj_list = tmp_obj;
 
                 bin->data = (byte*)&tmp_obj[1];
-                erts_copy_bits(base, offset, 1, bin->data, 0, 1, size);
+                erts_copy_bits_fwd(base, offset, bin->data, 0, size);
             } else {
                 bin->data = &base[BYTE_OFFSET(offset)];
             }
diff --git a/erts/emulator/beam/erl_term_hashing.c b/erts/emulator/beam/erl_term_hashing.c
index 796dd9574b0c..dae3bba60f64 100644
--- a/erts/emulator/beam/erl_term_hashing.c
+++ b/erts/emulator/beam/erl_term_hashing.c
@@ -1276,8 +1276,8 @@ make_hash2_helper(Eterm term_param, const int can_trap, Eterm* state_mref_write_
                     byte *buf = erts_alloc(ERTS_ALC_T_TMP, nr_of_bytes);
                     Uint nr_of_bits_to_copy = ctx.sz*BYTE_BITS+ctx.bitsize;
                     if (can_trap) iterations_until_trap -= iters_for_bin;
-                    erts_copy_bits(ctx.bptr,
-                                   ctx.bitoffs, 1, buf, 0, 1, nr_of_bits_to_copy);
+                    erts_copy_bits_fwd(ctx.bptr, ctx.bitoffs,
+                                       buf, 0, nr_of_bits_to_copy);
                     hash = block_hash(buf, ctx.sz, con);
                     if (ctx.bitsize > 0) {
                         UINT32_HASH_2(ctx.bitsize,
@@ -1312,9 +1312,9 @@ make_hash2_helper(Eterm term_param, const int can_trap, Eterm* state_mref_write_
                         Uint nr_of_bits_to_copy =
                             MIN(nr_of_bits_left, BINARY_BUF_SIZE_BITS);
                         ctx.done = nr_of_bits_left == nr_of_bits_to_copy;
-                        erts_copy_bits(ctx.bptr + ctx.no_bytes_processed,
-                                       ctx.bitoffs, 1, ctx.buf, 0, 1,
-                                       nr_of_bits_to_copy);
+                        erts_copy_bits_fwd(ctx.bptr + ctx.no_bytes_processed,
+                                           ctx.bitoffs, ctx.buf, 0,
+                                           nr_of_bits_to_copy);
                         block_hash_buffer(ctx.buf,
                                           bytes_to_process,
                                           block_hash_ctx);
@@ -1948,7 +1948,7 @@ make_internal_hash(Eterm term, erts_ihash_t salt)
                     if (BIT_OFFSET(offset) != 0) {
                         byte *tmp = (byte*)erts_alloc(ERTS_ALC_T_TMP,
                                                       NBYTES(size));
-                        erts_copy_bits(data, offset, 1, tmp, 0, 1, size);
+                        erts_copy_bits_fwd(data, offset, tmp, 0, size);
                         bytes = tmp;
                     } else {
                         bytes = &data[BYTE_OFFSET(offset)];
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp
index f6a84591c998..d28c40321cce 100644
--- a/erts/emulator/beam/jit/arm/instr_bs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_bs.cpp
@@ -1199,7 +1199,7 @@ void BeamModuleAssembler::update_bin_state(a64::Gp bin_offset,
                                            Sint size,
                                            a64::Gp size_reg) {
     int cur_bin_offset = offsetof(ErtsSchedulerRegisters,
-                                  aux_regs.d.erl_bits_state.erts_current_bin_);
+                                  aux_regs.d.erl_bits_state.erts_current_bin);
     arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
     arm::Mem mem_bin_offset =
             arm::Mem(scheduler_registers, cur_bin_offset + sizeof(Eterm));
@@ -1207,8 +1207,8 @@ void BeamModuleAssembler::update_bin_state(a64::Gp bin_offset,
     if (bit_offset % 8 != 0) {
         /* The bit offset is unknown or not byte-aligned. */
         ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
-                                  erts_current_bin_,
-                                  erts_bin_offset_);
+                                  erts_current_bin,
+                                  erts_bin_offset);
         a.ldp(TMP2, bin_offset, mem_bin_base);
 
         if (size_reg.isValid()) {
@@ -2021,14 +2021,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         BscSegment seg = segments[0];
         comment("private append to binary");
         ASSERT(Alloc.get() == 0);
-        mov_arg(ARG2, seg.src);
+        load_erl_bits_state(ARG1);
+        a.mov(ARG2, c_p);
+        mov_arg(ARG3, seg.src);
         if (sizeReg.isValid()) {
-            a.mov(ARG3, sizeReg);
+            a.mov(ARG4, sizeReg);
         } else {
-            mov_imm(ARG3, num_bits);
+            mov_imm(ARG4, num_bits);
         }
-        a.mov(ARG4, seg.unit);
-        a.mov(ARG1, c_p);
         emit_enter_runtime(Live.get());
         runtime_call<4>(erts_bs_private_append_checked);
         emit_leave_runtime(Live.get());
@@ -2036,7 +2036,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
     } else if (estimated_num_bits <= ERL_ONHEAP_BITS_LIMIT) {
         static constexpr auto cur_bin_offset =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_current_bin_);
+                offsetof(struct erl_bits_state, erts_current_bin);
         Uint need;
 
         arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
@@ -2104,8 +2104,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
 
             /* Initialize the erl_bin_state struct. */
             ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
-                                      erts_current_bin_,
-                                      erts_bin_offset_);
+                                      erts_current_bin,
+                                      erts_bin_offset);
             a.stp(HTOP, ZERO, mem_bin_base);
 
             /* Update HTOP. */
@@ -2158,11 +2158,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             comment("construct a binary segment");
             if (seg.effectiveSize >= 0) {
                 /* The segment has a literal size. */
-                mov_imm(ARG3, seg.effectiveSize);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
+                mov_imm(ARG4, seg.effectiveSize);
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary);
+                runtime_call<4>(erts_bs_put_binary);
                 emit_leave_runtime<Update::eReductions>(Live.get());
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
@@ -2172,12 +2173,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                        seg.size.as<ArgAtom>().get() == am_all) {
                 /* Include the entire binary/bitstring in the
                  * resulting binary. */
-                a.mov(ARG3, seg.unit);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
+                mov_imm(ARG4, seg.unit);
 
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary_all);
+                runtime_call<4>(erts_bs_put_binary_all);
                 emit_leave_runtime<Update::eReductions>(Live.get());
 
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2195,17 +2197,18 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                  * the value is a non-negative small in the
                  * appropriate range. Multiply the size with the
                  * unit. */
-                auto r = load_source(seg.size, ARG3);
-                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG4);
+                a.asr(ARG4, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
-                    a.mul(ARG3, ARG3, TMP1);
+                    a.mul(ARG4, ARG4, TMP1);
                 }
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
+                load_erl_bits_state(ARG1);
+                a.mov(ARG2, c_p);
+                mov_arg(ARG3, seg.src);
 
                 emit_enter_runtime<Update::eReductions>(Live.get());
-                runtime_call<3>(erts_new_bs_put_binary);
+                runtime_call<4>(erts_bs_put_binary);
                 emit_leave_runtime<Update::eReductions>(Live.get());
 
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2225,21 +2228,22 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         case am_float:
             comment("construct float segment");
             if (seg.effectiveSize >= 0) {
-                mov_imm(ARG3, seg.effectiveSize);
+                mov_imm(ARG4, seg.effectiveSize);
             } else {
-                auto r = load_source(seg.size, ARG3);
-                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG4);
+                a.asr(ARG4, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
-                    a.mul(ARG3, ARG3, TMP1);
+                    a.mul(ARG4, ARG4, TMP1);
                 }
             }
-            mov_arg(ARG2, seg.src);
-            mov_imm(ARG4, seg.flags);
-            a.mov(ARG1, c_p);
+            load_erl_bits_state(ARG1);
+            a.mov(ARG2, c_p);
+            mov_arg(ARG3, seg.src);
+            mov_imm(ARG5, seg.flags);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_new_bs_put_float);
+            runtime_call<5>(erts_bs_put_float);
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
@@ -2490,12 +2494,15 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                 } else {
                     /* Call the helper function to fetch and store the
                      * integer into the binary. */
-                    mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
                     load_erl_bits_state(ARG1);
+                    mov_arg(ARG2, seg.src);
 
                     emit_enter_runtime(Live.get());
-                    runtime_call<4>(erts_new_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     emit_leave_runtime(Live.get());
 
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
@@ -2522,12 +2529,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
 
             comment("insert string");
             ASSERT(seg.effectiveSize >= 0);
-            mov_imm(ARG3, seg.effectiveSize / 8);
-            mov_arg(ARG2, string_ptr);
             load_erl_bits_state(ARG1);
+            mov_arg(ARG2, string_ptr);
+            mov_imm(ARG3, seg.effectiveSize / 8);
 
             emit_enter_runtime(Live.get());
-            runtime_call<3>(erts_new_bs_put_string);
+            runtime_call<3>(erts_bs_put_string);
             emit_leave_runtime(Live.get());
             break;
         }
@@ -2556,13 +2563,16 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             a.cbz(ARG1, resolve_label(error, disp1MB));
             break;
         case am_utf32:
+            load_erl_bits_state(ARG1);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
-            load_erl_bits_state(ARG1);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_new_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
diff --git a/erts/emulator/beam/jit/beam_jit_common.cpp b/erts/emulator/beam/jit/beam_jit_common.cpp
index fde678606ffa..3b348db4b9ad 100644
--- a/erts/emulator/beam/jit/beam_jit_common.cpp
+++ b/erts/emulator/beam/jit/beam_jit_common.cpp
@@ -808,7 +808,7 @@ void beam_jit_bs_add_argument_error(Process *c_p, Eterm A, Eterm B) {
 
 Eterm beam_jit_bs_init_bits(Process *c_p,
                             Eterm *reg,
-                            ERL_BITS_DECLARE_STATEP,
+                            ErlBitsState *EBS,
                             Uint num_bits,
                             Uint alloc,
                             unsigned Live) {
@@ -818,7 +818,7 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         alloc += ERL_REFC_BITS_SIZE;
     }
 
-    erts_bin_offset = 0;
+    EBS->erts_bin_offset = 0;
 
     if (num_bits <= ERL_ONHEAP_BITS_LIMIT) {
         ErlHeapBits *hb;
@@ -830,7 +830,7 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         hb->thing_word = header_heap_bits(num_bits);
         hb->size = num_bits;
 
-        erts_current_bin = (byte *)hb->data;
+        EBS->erts_current_bin = (byte *)hb->data;
         return make_bitstring(hb);
     } else {
         const Uint num_bytes = NBYTES(num_bits);
@@ -839,13 +839,13 @@ Eterm beam_jit_bs_init_bits(Process *c_p,
         test_bin_vheap(c_p, reg, num_bytes / sizeof(Eterm), alloc, Live);
 
         new_binary = erts_bin_nrml_alloc(num_bytes);
-        erts_current_bin = (byte *)new_binary->orig_bytes;
+        EBS->erts_current_bin = (byte *)new_binary->orig_bytes;
 
         return erts_wrap_refc_bitstring(&MSO(c_p).first,
                                         &MSO(c_p).overhead,
                                         &HEAP_TOP(c_p),
                                         new_binary,
-                                        erts_current_bin,
+                                        EBS->erts_current_bin,
                                         0,
                                         num_bits);
     }
diff --git a/erts/emulator/beam/jit/beam_jit_common.hpp b/erts/emulator/beam/jit/beam_jit_common.hpp
index 934821d8f532..3d34674153e0 100644
--- a/erts/emulator/beam/jit/beam_jit_common.hpp
+++ b/erts/emulator/beam/jit/beam_jit_common.hpp
@@ -600,7 +600,7 @@ void beam_jit_bs_field_size_argument_error(Process *c_p, Eterm size);
 void beam_jit_bs_add_argument_error(Process *c_p, Eterm A, Eterm B);
 Eterm beam_jit_bs_init_bits(Process *c_p,
                             Eterm *reg,
-                            ERL_BITS_DECLARE_STATEP,
+                            ErlBitsState *EBS,
                             Uint num_bits,
                             Uint alloc,
                             unsigned Live);
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp
index d3c9e1c050d6..4a7f54771750 100644
--- a/erts/emulator/beam/jit/x86/instr_bs.cpp
+++ b/erts/emulator/beam/jit/x86/instr_bs.cpp
@@ -1262,10 +1262,10 @@ void BeamModuleAssembler::update_bin_state(x86::Gp bin_offset,
     const int x_reg_offset = offsetof(ErtsSchedulerRegisters, x_reg_array.d);
     const int cur_bin_base =
             offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-            offsetof(struct erl_bits_state, erts_current_bin_);
+            offsetof(struct erl_bits_state, erts_current_bin);
     const int cur_bin_offset =
             offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-            offsetof(struct erl_bits_state, erts_bin_offset_);
+            offsetof(struct erl_bits_state, erts_bin_offset);
 
     x86::Mem mem_bin_base =
             x86::Mem(registers, cur_bin_base - x_reg_offset, sizeof(UWord));
@@ -1822,10 +1822,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                 offsetof(ErtsSchedulerRegisters, x_reg_array.d);
         const int cur_bin_base =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_current_bin_);
+                offsetof(struct erl_bits_state, erts_current_bin);
         const int cur_bin_offset =
                 offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
-                offsetof(struct erl_bits_state, erts_bin_offset_);
+                offsetof(struct erl_bits_state, erts_bin_offset);
         x86::Mem mem_bin_base =
                 x86::qword_ptr(registers, cur_bin_base - x_reg_offset);
         x86::Mem mem_bin_offset =
@@ -2195,14 +2195,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         runtime_entered = bs_maybe_enter_runtime(runtime_entered);
         comment("private append to binary");
         ASSERT(Alloc.get() == 0);
-        mov_arg(ARG2, seg.src);
+        mov_arg(ARG3, seg.src);
         if (sizeReg.isValid()) {
-            a.mov(ARG3, sizeReg);
+            a.mov(ARG4, sizeReg);
         } else {
-            mov_imm(ARG3, num_bits);
+            mov_imm(ARG4, num_bits);
         }
-        a.mov(ARG4, seg.unit);
-        a.mov(ARG1, c_p);
+        a.mov(ARG2, c_p);
+        load_erl_bits_state(ARG1);
         runtime_call<4>(erts_bs_private_append_checked);
         /* There is no way the call can fail on a 64-bit architecture. */
         a.mov(TMP_MEM1q, RET);
@@ -2251,10 +2251,11 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             comment("construct a binary segment");
             if (seg.effectiveSize >= 0) {
                 /* The segment has a literal size. */
-                mov_imm(ARG3, seg.effectiveSize);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary);
+                mov_imm(ARG4, seg.effectiveSize);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_DEPENDS,
@@ -2263,10 +2264,11 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                        seg.size.as<ArgAtom>().get() == am_all) {
                 /* Include the entire binary/bitstring in the
                  * resulting binary. */
-                a.mov(ARG3, seg.unit);
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary_all);
+                mov_imm(ARG4, seg.unit);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary_all);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_UNIT,
@@ -2282,16 +2284,17 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                  * the value is a non-negative small in the
                  * appropriate range. Multiply the size with the
                  * unit. */
-                mov_arg(ARG3, seg.size);
-                a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+                mov_arg(ARG4, seg.size);
+                a.sar(ARG4, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(RET, seg.unit);
-                    a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
-                    a.mov(ARG3, RET);
+                    a.mul(ARG4); /* CLOBBERS RDX = ARG3! */
+                    a.mov(ARG4, RET);
                 }
-                mov_arg(ARG2, seg.src);
-                a.mov(ARG1, c_p);
-                runtime_call<3>(erts_new_bs_put_binary);
+                mov_arg(ARG3, seg.src);
+                a.mov(ARG2, c_p);
+                load_erl_bits_state(ARG1);
+                runtime_call<4>(erts_bs_put_binary);
                 error_info = beam_jit_update_bsc_reason_info(seg.error_info,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_DEPENDS,
@@ -2311,20 +2314,21 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
             comment("construct float segment");
             if (seg.effectiveSize >= 0) {
-                mov_imm(ARG3, seg.effectiveSize);
+                mov_imm(ARG4, seg.effectiveSize);
             } else {
-                mov_arg(ARG3, seg.size);
-                a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+                mov_arg(ARG4, seg.size);
+                a.sar(ARG4, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(RET, seg.unit);
-                    a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
-                    a.mov(ARG3, RET);
+                    a.mul(ARG4); /* CLOBBERS RDX = ARG3! */
+                    a.mov(ARG4, RET);
                 }
             }
-            mov_arg(ARG2, seg.src);
-            mov_imm(ARG4, seg.flags);
-            a.mov(ARG1, c_p);
-            runtime_call<4>(erts_new_bs_put_float);
+            mov_arg(ARG3, seg.src);
+            mov_imm(ARG5, seg.flags);
+            a.mov(ARG2, c_p);
+            load_erl_bits_state(ARG1);
+            runtime_call<5>(erts_bs_put_float);
             if (Fail.get() == 0) {
                 mov_imm(ARG4,
                         beam_jit_update_bsc_reason_info(seg.error_info,
@@ -2595,9 +2599,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                      * integer into the binary. */
                     runtime_entered = bs_maybe_enter_runtime(runtime_entered);
                     mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
                     load_erl_bits_state(ARG1);
-                    runtime_call<4>(erts_new_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
                         comment("skipped test for success because construction "
                                 "can't fail");
@@ -2628,7 +2635,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             mov_imm(ARG3, seg.effectiveSize / 8);
             mov_arg(ARG2, string_ptr);
             load_erl_bits_state(ARG1);
-            runtime_call<3>(erts_new_bs_put_string);
+            runtime_call<3>(erts_bs_put_string);
         } break;
         case am_utf8: {
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
@@ -2656,9 +2663,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
             load_erl_bits_state(ARG1);
-            runtime_call<4>(erts_new_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             if (Fail.get() == 0) {
                 mov_arg(ARG1, seg.src);
                 mov_imm(ARG4,
diff --git a/erts/emulator/test/bs_construct_SUITE.erl b/erts/emulator/test/bs_construct_SUITE.erl
index 42de143791a5..8fbada37e6ac 100644
--- a/erts/emulator/test/bs_construct_SUITE.erl
+++ b/erts/emulator/test/bs_construct_SUITE.erl
@@ -30,7 +30,7 @@
 	 otp_7422/1, zero_width/1, bad_append/1, bs_append_overflow/1,
          bs_append_offheap/1,
          reductions/1, fp16/1, zero_init/1, error_info/1, little/1,
-         heap_binary_unit/1
+         heap_binary_unit/1, floats/1
         ]).
 
 -include_lib("common_test/include/ct.hrl").
@@ -46,7 +46,7 @@ all() ->
      copy_writable_binary, kostis, dynamic, otp_7422, zero_width,
      bad_append, bs_append_overflow, bs_append_offheap,
      reductions, fp16, zero_init,
-     error_info, little, heap_binary_unit].
+     error_info, little, heap_binary_unit, floats].
 
 init_per_suite(Config) ->
     Config.
@@ -1634,6 +1634,254 @@ heap_binary_unit_2(Variant, Rest) ->
             {error2, Bin2}
     end.
 
+floats(_Config) ->  %Test case: construct float segments from many numbers
+    _ = rand:uniform(),				%Seed generator before exporting the seed
+    io:format("Seed: ~p", [rand:export_seed()]),  %Presumably to reproduce failures
+
+    %% 20 random floats: a uniform value scaled by 10^K, K in 1..20.
+    _ = [do_float(rand:uniform() * math:pow(10.0, rand:uniform(20))) ||
+            _ <- lists:seq(1, 20)],
+
+    %% 10 random floats scaled by 10^301..10^307, i.e. near the upper
+    %% limit representable as a 64-bit float.
+    _ = [do_float(rand:uniform() * math:pow(10.0, 300 + rand:uniform(7))) ||
+            _ <- lists:seq(1, 10)],
+
+    %% 10 random small integers in 1..1_000_000.
+    _ = [do_float(rand:uniform(1_000_000)) || _ <- lists:seq(1, 10)],
+
+    %% 10 random big integers: the same range shifted left by 64 bits.
+    _ = [do_float(rand:uniform(1_000_000) bsl 64) || _ <- lists:seq(1, 10)],
+
+    do_float(-0.0),  %Negative zero
+    do_float(+0.0),  %Positive zero
+
+    ok.
+
+do_float(F) ->  %Run every check for F, starting with a 0-bit pad
+    do_float(F, 0).
+
+do_float(_F, 32) ->  %Done: pad widths 0..31 have been covered
+    ok;
+do_float(F, N) ->
+    Pad = rand:uniform(1 bsl N) - 1,  %Random pad value in 0..(1 bsl N)-1
+    true = is_integer(Pad),  %Presumably a type hint for the compiler; confirm
+
+    do_float_be_16(F, N, Pad),
+    do_float_be_32(F, N, Pad),
+    do_float_be_64(F, N, Pad),
+
+    do_float_le_16(F, N, Pad),
+    do_float_le_32(F, N, Pad),
+    do_float_le_64(F, N, Pad),
+
+    do_float(F, N + 1).  %Continue with the next pad width
+
+do_float_be_16(F, N, Pad) ->  %16-bit big-endian float after an N-bit pad
+    FloatBin = id(<<F:16/big-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:16/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):16/big-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(16))/big-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(16))/big-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:16/big-float>>),
+            Bin = <<Pad:N, F:(id(16))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      15 -> <<Pad:N, F:16/big-float>>;
+                      21 -> <<Pad:N, F:16/big-float>>;
+                      _ -> <<Pad:N, F:16/big-float>>
+                  end;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:16/big-float>>),
+            Bin = <<Pad:N, F:(id(16))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      1 -> <<Pad:N, F:16/big-float>>;
+                      19 -> <<Pad:N, F:16/big-float>>;
+                      _ -> <<Pad:N, F:16/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_be_32(F, N, Pad) ->  %32-bit big-endian float after an N-bit pad
+    FloatBin = id(<<F:32/big-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:32/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):32/big-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(32))/big-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(32))/big-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:32/big-float>>),
+            Bin = <<Pad:N, F:(id(32))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      1 -> <<Pad:N, F:32/big-float>>;
+                      6 -> <<Pad:N, F:32/big-float>>;
+                      _ -> <<Pad:N, F:32/big-float>>
+                  end;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:32/big-float>>),
+            Bin = <<Pad:N, F:(id(32))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      8 -> <<Pad:N, F:32/big-float>>;
+                      12 -> <<Pad:N, F:32/big-float>>;
+                      _ -> <<Pad:N, F:32/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_be_64(F, N, Pad) ->  %64-bit big-endian float after an N-bit pad
+    FloatBin = id(<<F:64/big-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:64/big-float>>),
+    Bin = id(<<Pad:N, (id(F)):64/big-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(64))/big-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(64))/big-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:64/big-float>>),
+            Bin = <<Pad:N, F:(id(64))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      7 -> <<Pad:N, F:64/big-float>>;
+                      13 -> <<Pad:N, F:64/big-float>>;
+                      _ -> <<Pad:N, F:64/big-float>>
+                  end,
+
+            %% The 64-bit segment must match back out as the original F.
+            <<Pad:N, F:64/big-float>> = Bin;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:64/big-float>>),
+            Bin = <<Pad:N, F:(id(64))/big-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      7 -> <<Pad:N, F:64/big-float>>;
+                      13 -> <<Pad:N, F:64/big-float>>;
+                      _ -> <<Pad:N, F:64/big-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_16(F, N, Pad) ->  %16-bit little-endian float after an N-bit pad
+    FloatBin = id(<<F:16/little-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:16/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):16/little-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(16))/little-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(16))/little-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:16/little-float>>),
+            Bin = <<Pad:N, F:(id(16))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      11 -> <<Pad:N, F:16/little-float>>;
+                      27 -> <<Pad:N, F:16/little-float>>;
+                      _ -> <<Pad:N, F:16/little-float>>
+                  end;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:16/little-float>>),
+            Bin = <<Pad:N, F:(id(16))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      7 -> <<Pad:N, F:16/little-float>>;
+                      13 -> <<Pad:N, F:16/little-float>>;
+                      _ -> <<Pad:N, F:16/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_32(F, N, Pad) ->  %32-bit little-endian float after an N-bit pad
+    FloatBin = id(<<F:32/little-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:32/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):32/little-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(32))/little-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(32))/little-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:32/little-float>>),
+            Bin = <<Pad:N, F:(id(32))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      9 -> <<Pad:N, F:32/little-float>>;
+                      29 -> <<Pad:N, F:32/little-float>>;
+                      _ -> <<Pad:N, F:32/little-float>>
+                  end;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:32/little-float>>),
+            Bin = <<Pad:N, F:(id(32))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      7 -> <<Pad:N, F:32/little-float>>;
+                      13 -> <<Pad:N, F:32/little-float>>;
+                      _ -> <<Pad:N, F:32/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+do_float_le_64(F, N, Pad) ->  %64-bit little-endian float after an N-bit pad
+    FloatBin = id(<<F:64/little-float>>),  %The float segment on its own
+    Bin = id(<<Pad:N, F:64/little-float>>),
+    Bin = id(<<Pad:N, (id(F)):64/little-float>>),  %Value via id/1
+    Bin = <<Pad:N, F:(id(64))/little-float>>,  %Size via id/1
+    Bin = <<Pad:N, (id(F)):(id(64))/little-float>>,  %Value and size via id/1
+    <<Pad:N, FloatBin/binary>> = Bin,  %Pad, then the bare float segment
+
+    if
+        is_float(F) ->
+            %% F is a known float here; two case clauses also fix N.
+            FloatBin = id(<<F:64/little-float>>),
+            Bin = <<Pad:N, F:(id(64))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      9 -> <<Pad:N, F:64/little-float>>;
+                      29 -> <<Pad:N, F:64/little-float>>;
+                      _ -> <<Pad:N, F:64/little-float>>
+                  end,
+
+            %% The 64-bit segment must match back out as the original F.
+            <<Pad:N, F:64/little-float>> = Bin;
+        is_integer(F) ->
+            %% F is a known integer here; two case clauses also fix N.
+            FloatBin = id(<<F:64/little-float>>),
+            Bin = <<Pad:N, F:(id(64))/little-float>>,
+            Bin = case N of  %Every clause builds the same segment
+                      7 -> <<Pad:N, F:64/little-float>>;
+                      13 -> <<Pad:N, F:64/little-float>>;
+                      _ -> <<Pad:N, F:64/little-float>>
+                  end;
+        true ->
+            ok
+    end,
+
+    ok.
+
+
 %%%
 %%% Common utilities.
 %%%