From ff2ecfd258fac9d3e04377a6c4703806deb0a0db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 11 May 2024 08:26:18 +0200
Subject: [PATCH] Optimize construction of dynamically sized integer segments

The main optimization is breaking apart the `erts_bs_put_integer()`
function into the two functions `erts_bs_put_integer_be()` and
`erts_bs_put_integer_le()` to avoid runtime checks of the endianness.
---
 erts/emulator/beam/emu/bs_instrs.tab    |  25 +-
 erts/emulator/beam/erl_bits.c           | 694 ++++++++++++++----------
 erts/emulator/beam/erl_bits.h           |   4 +-
 erts/emulator/beam/jit/arm/instr_bs.cpp |  14 +-
 erts/emulator/beam/jit/x86/instr_bs.cpp |  14 +-
 5 files changed, 453 insertions(+), 298 deletions(-)

diff --git a/erts/emulator/beam/emu/bs_instrs.tab b/erts/emulator/beam/emu/bs_instrs.tab
index 0024bae6e34f..1c41b5b3ccb7 100644
--- a/erts/emulator/beam/emu/bs_instrs.tab
+++ b/erts/emulator/beam/emu/bs_instrs.tab
@@ -688,22 +688,37 @@ i_bs_create_bin(Fail, Alloc, Live, Dst, N) {
         case BSC_INTEGER:
             {
                 Sint _size;
+                int result;
 
                 $BS_LOAD_UNIT(p, unit);
                 $BS_LOAD_FLAGS(p, flags);
                 $BS_LOAD_SIZE(p, Size);
                 $BS_GET_UNCHECKED_FIELD_SIZE(Size, unit, $BADARG($Fail), _size);
-                if (!erts_bs_put_integer(EBS, Src, _size, flags)) {
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, _size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, _size);
+                }
+                if (!result) {
                     $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
                 }
             }
             break;
         case BSC_INTEGER_FIXED_SIZE:
         case BSC_UTF32:
-            $BS_LOAD_FLAGS(p, flags);
-            $BS_LOAD_FIXED_SIZE(p, Size);
-            if (!erts_bs_put_integer(EBS, Src, Size, flags)) {
-                $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+            {
+                int result;
+
+                $BS_LOAD_FLAGS(p, flags);
+                $BS_LOAD_FIXED_SIZE(p, Size);
+                if (flags & BSF_LITTLE) {
+                    result = erts_bs_put_integer_le(EBS, Src, Size);
+                } else {
+                    result = erts_bs_put_integer_be(EBS, Src, Size);
+                }
+                if (!result) {
+                    $BS_FAIL_INFO($Fail, BADARG, am_type, Src);
+                }
             }
             break;
         case BSC_UTF8:
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 0d7e6b1f5cf3..b489c74c6f05 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -532,238 +532,438 @@ erts_bs_get_binary_all_2(Process *p, ErlSubBits *sb)
  ****************************************************************/
 
 
-/* COPY_VAL:
- * copy sz byte from val to dst buffer, 
- * dst, val are updated!!!
+/* FMT_COPY_VAL:
+ * Copy sz bytes from val to dst buffer;
+ * dst and val are updated.
  */
 
-#define COPY_VAL(dst,ddir,val,sz) do { \
-   Uint __sz = (sz); \
-   while(__sz) { \
-     switch(__sz) { \
-     default: \
-     case 4: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 3: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 2: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     case 1: *dst = (val&0xff); dst += ddir; val >>= 8; __sz--; \
-     } \
-   } \
+#define FMT_COPY_VAL(dst,ddir,val,sz) do {                      \
+   Uint __sz = (sz);                                            \
+   while (__sz) {                                               \
+     switch(__sz) {                                             \
+     default:                                                   \
+     case 8: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 7: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 6: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 5: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 4: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 3: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 2: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     case 1: *dst = val; dst += ddir; val >>= 8; __sz--;        \
+     }                                                          \
+   }                                                            \
  } while(0)
 
 static void
-fmt_small(byte *buf, Uint num_bytes, Eterm arg, Uint num_bits, Uint flags)
+fmt_small_be(byte *buf, Eterm arg, Uint num_bits)
 {
     Uint bit_offset;
     Sint val;
+    Uint num_bytes;
 
     ASSERT(is_small(arg));
     ASSERT(num_bits != 0);      /* Tested by caller */
 
+    num_bytes = NBYTES(num_bits);
     bit_offset = BIT_OFFSET(num_bits);
     val = signed_val(arg);
 
-    if (flags & BSF_LITTLE) { /* Little endian */
+    buf += num_bytes - 1;
+    if (bit_offset) {
+        *buf-- = val << (8-bit_offset);
         num_bytes--;
-        COPY_VAL(buf, 1, val, num_bytes);
-        *buf = bit_offset ? (val << (8-bit_offset)) : val;
-    } else {		/* Big endian */
-        buf += num_bytes - 1;
-        if (bit_offset) {
-            *buf-- = val << (8-bit_offset);
-            num_bytes--;
-            val >>= bit_offset;
-        }
-        COPY_VAL(buf, -1, val, num_bytes);
+        val >>= bit_offset;
     }
+    FMT_COPY_VAL(buf, -1, val, num_bytes);
 }
 
-/* calculate a - *cp (carry)  (store result in b), *cp is updated! */
-#define SUBc(a, cp, b) do { \
-   byte __x = (a); \
-   byte __y = (__x - (*(cp))); \
-   (*cp) = (__y > __x); \
-   *(b) = ~__y; \
- } while(0)
+static void
+fmt_small_le(byte *buf, Eterm arg, Uint num_bits)
+{
+    Uint bit_offset;
+    Sint val;
+    Uint num_bytes;
+
+    ASSERT(is_small(arg));
+    ASSERT(num_bits != 0);      /* Tested by caller */
+
+    num_bytes = NBYTES(num_bits);
+    bit_offset = BIT_OFFSET(num_bits);
+    val = signed_val(arg);
+    num_bytes--;
+    FMT_COPY_VAL(buf, 1, val, num_bytes);
+    *buf = bit_offset ? (val << (8-bit_offset)) : val;
+}
+
+#undef FMT_COPY_VAL
+
+/*
+ * Calculate a - c (carry), storing the result in b; set c
+ * to the new carry.
+ */
+#if __has_builtin(__builtin_subc) && !defined(DEBUG)
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       *(b) = __builtin_subc(0, (a), c, &c);    \
+   } while(0)
+#else
+#define SUBc(a, c, b)                           \
+   do {                                         \
+       byte __x = (a);                          \
+       byte __y = (__x - !(c));                 \
+       c = !(__y > __x);                        \
+       *(b) = ~__y;                             \
+   } while(0)
+#endif
 
 static void
-fmt_big(byte *buf, Uint num_bytes, Eterm val, Uint num_bits, Uint flags)
+fmt_big_be(byte *buf, Eterm val, Uint num_bits)
 {
     unsigned long offs;
     int sign;
     Uint ds;
     ErtsDigit* dp;
     int n;
+    ErtsDigit acc = 0;
+    ErtsDigit d;
+    Uint num_bytes;
 
     ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
 
-    if (num_bits == 0) {
-        return;
-    }
-
+    num_bytes = NBYTES(num_bits);
     sign = big_sign(val);
     ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
     dp = big_v(val);
     n = MIN(num_bytes, ds);
 
     offs = BIT_OFFSET(num_bits);
-    if (flags & BSF_LITTLE) {
-        num_bytes -= n;         /* pad with this amount */
-        if (sign) {             /* negative */
-            int c = 1;
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for (i = 0; i < sizeof(ErtsDigit); i++) {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
+
+    buf += num_bytes - 1;       /* end of buffer */
+    num_bytes -= n;             /* pad with this amount */
+    offs = offs ? (8-offs) : 0; /* shift offset */
+
+    if (sign) {             /* negative bignum */
+        unsigned int c = 0;
+
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
             }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    SUBc(d & 0xff, &c, buf);
-                    buf++;
-                    d >>= 8;
-                } while (--n > 0);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= ((ErtsDigit)*dp << offs);
+            do {
+                SUBc(acc & 0xff, c, buf);
+                buf--;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(acc & 0xff, c, buf);
+            buf--;
+            acc >>= 8;
+        }
+    } else {                /* positive bignum */
+        while (n >= sizeof(ErtsDigit)) {
+            int i;
+
+            d = *dp++;
+            acc |= d << offs;
+            *buf-- = acc;
+            acc = d >> (8-offs);
+            for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
+                *buf-- = acc;
+                acc >>= 8;
             }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(0, &c, buf);
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            acc |= (*dp << offs);
+            do {
+                *buf-- = acc;
+                acc >>= 8;
+            } while (--n > 0);
+        }
+        while (num_bytes--) {
+            *buf-- = acc;
+            acc >>= 8;
+        }
+    }
+}
+
+static void
+fmt_big_le(byte *buf, Eterm val, Uint num_bits)
+{
+    unsigned long offs;
+    int sign;
+    Uint ds;
+    ErtsDigit* dp;
+    int n;
+    Uint num_bytes;
+
+    ASSERT(is_big(val));
+    ASSERT(num_bits != 0);
+
+    num_bytes = NBYTES(num_bits);
+    sign = big_sign(val);
+    ds = big_size(val)*sizeof(ErtsDigit); /* number of digits bytes */
+    dp = big_v(val);
+    n = MIN(num_bytes, ds);
+
+    offs = BIT_OFFSET(num_bits);
+    num_bytes -= n;         /* pad with this amount */
+    if (sign) {             /* negative */
+        unsigned int c = 0;
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for (i = 0; i < sizeof(ErtsDigit); i++) {
+                SUBc(d & 0xff, c, buf);
                 buf++;
+                d >>= 8;
             }
-        } else {                /* positive */
-            while (n >= sizeof(ErtsDigit)) {
-                ErtsDigit d = *dp++;
-                int i;
-                for(i = 0; i < sizeof(ErtsDigit); i++) {
-                    *buf++ = d;
-                    d >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                ErtsDigit d = *dp;
-                do {
-                    *buf++ = d;
-                    d >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes) {
-                *buf++ = 0;
-                num_bytes--;
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                SUBc(d & 0xff, c, buf);
+                buf++;
+                d >>= 8;
+            } while (--n > 0);
+        }
+        /* pad */
+        while (num_bytes--) {
+            SUBc(0, c, buf);
+            buf++;
+        }
+    } else {                /* positive */
+        while (n >= sizeof(ErtsDigit)) {
+            ErtsDigit d = *dp++;
+            int i;
+            for(i = 0; i < sizeof(ErtsDigit); i++) {
+                *buf++ = d;
+                d >>= 8;
             }
+            n -= sizeof(ErtsDigit);
+        }
+        if (n) {
+            ErtsDigit d = *dp;
+            do {
+                *buf++ = d;
+                d >>= 8;
+            } while (--n > 0);
         }
+        /* pad */
+        while (num_bytes) {
+            *buf++ = 0;
+            num_bytes--;
+        }
+    }
 
-        /* adjust MSB */
-        if (offs) {
-            buf--;
-            *buf <<= (8 - offs);
+    /* adjust MSB */
+    if (offs) {
+        buf--;
+        *buf <<= (8 - offs);
+    }
+}
+
+#undef SUBc
+
+static void
+restore_and_shift(byte *buf, Uint orig_byte, Uint bit_offset, Uint num_bits)
+{
+    Uint rshift = bit_offset;
+    Uint lshift = 8 - bit_offset;
+    Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
+    Uint lmask = MAKE_MASK(8 - bit_offset);
+    Uint count = (num_bits - lshift) / 8;
+    Uint bits, bits1;
+
+    ASSERT(num_bits - lshift >= 0);
+
+    bits = *buf;
+    bits1 = bits >> rshift;
+    *buf = MASK_BITS(bits1, orig_byte, lmask);
+    buf++;
+
+    while (count--) {
+        bits1 = bits << lshift;
+        bits = *buf;
+        *buf++ = bits1 | (bits >> rshift);
+    }
+
+    if (deoffs) {
+        Uint rmask = MAKE_MASK(deoffs) << (8 - deoffs);
+
+        bits1 = bits << lshift;
+        if ((rmask << rshift) & 0xff) {
+            bits = *buf;
+            bits1 |= (bits >> rshift);
         }
-    } else {   /* BIG ENDIAN */
-        ErtsDigit acc = 0;
-        ErtsDigit d;
+        *buf = MASK_BITS(bits1, *buf, rmask);
+    }
+}
 
-        buf += num_bytes - 1;       /* end of buffer */
-        num_bytes -= n;             /* pad with this amount */
-        offs = offs ? (8-offs) : 0; /* shift offset */
+int
+erts_bs_put_integer_be(ErlBitsState *EBS, Eterm arg, Uint num_bits)
+{
+    byte* dst_bin = EBS->erts_current_bin;
+    Uint bin_offset = EBS->erts_bin_offset;
+    Uint bit_offset;
+    byte b;
+    byte *iptr;
 
-        if (sign) {             /* negative bignum */
-            int c = 1;
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
 
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
+    bit_offset = BIT_OFFSET(bin_offset);
+    if (is_small(arg)) {
+        Uint rbits = 8 - bit_offset;
 
-                d = *dp++;
-                acc |= d << offs;
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= ((ErtsDigit)*dp << offs);
-                do {
-                    SUBc(acc & 0xff, &c, buf);
-                    buf--;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            /* pad */
-            while (num_bytes--) {
-                SUBc(acc & 0xff, &c, buf);
-                buf--;
-                acc >>= 8;
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_be(iptr, arg, num_bits);
             }
-        } else {                /* positive bignum */
-            while (n >= sizeof(ErtsDigit)) {
-                int i;
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else {		/* Big endian */
+            /*
+             * Big-endian, more than one byte, but not aligned on a byte boundary.
+             * Handle the bits up to the next byte boundary specially,
+             * then let fmt_small_be() handle the rest.
+             */
+            Uint shift_count = num_bits - rbits;
+            Sint val = signed_val(arg);
 
-                d = *dp++;
-                acc |= d << offs;
-                *buf-- = acc;
-                acc = d >> (8-offs);
-                for (i = 0; i < sizeof(ErtsDigit)-1; i++) {
-                    *buf-- = acc;
-                    acc >>= 8;
-                }
-                n -= sizeof(ErtsDigit);
-            }
-            if (n) {
-                acc |= (*dp << offs);
-                do {
-                    *buf-- = acc;
-                    acc >>= 8;
-                } while (--n > 0);
-            }
-            while (num_bytes--) {
-                *buf-- = acc;
-                acc >>= 8;
+            ASSERT(num_bits > rbits);
+            b = *iptr & (0xff << rbits);
+
+            /*
+             * Shifting with a shift count greater than or equal to the word
+             * size may be a no-op (instead of 0 the result may be the unshifted
+             * value). Therefore, only do the shift and the OR if the shift count
+             * is less than the word size if the number is positive; if negative,
+             * we must simulate the sign extension.
+             */
+            if (shift_count < sizeof(Uint)*8) {
+                b |= (val >> shift_count) & MAKE_MASK(rbits);
+            } else if (val < 0) {
+                /* Simulate sign extension. */
+                b |= (-1) & MAKE_MASK(rbits);
             }
+            *iptr++ = b;
+
+            fmt_small_be(iptr, arg, shift_count);
         }
+    } else if (is_big(arg) && bit_offset == 0) {
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_be(iptr, arg, num_bits);
+    } else if (is_big(arg) && bit_offset + num_bits <= 8) {
+        /*
+         * All bits are in the same byte.
+         */
+        Uint rbits = 8 - bit_offset;
+        Sint sign = big_sign(arg);
+        ErtsDigit* dp = big_v(arg);
+        Uint val = sign ? -*dp : *dp;
+
+        b = *iptr & (0xff << rbits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
+        *iptr = b;
+    } else if (is_big(arg)) {
+        /*
+         * Big number, not aligned on a byte boundary.
+         *
+         * Format the integer byte-aligned using the binary itself as
+         * a temporary buffer.
+         */
+        b = *iptr;
+        fmt_big_be(iptr, arg, num_bits);
+
+        /*
+         * Now restore the overwritten bits of the first byte and
+         * shift everything to the right.
+         */
+        restore_and_shift(iptr, b, bit_offset, num_bits);
+    } else {
+        /* Not an integer. */
+        return 0;
     }
+    EBS->erts_bin_offset = bin_offset + num_bits;
+    return 1;
 }
 
 int
-erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
+erts_bs_put_integer_le(ErlBitsState *EBS, Eterm arg, Uint num_bits)
 {
     byte* dst_bin = EBS->erts_current_bin;
     Uint bin_offset = EBS->erts_bin_offset;
     Uint bit_offset;
-    Uint b;
+    byte b;
     byte *iptr;
 
+    if (ERTS_UNLIKELY(num_bits == 0)) {
+        return is_small(arg) || is_big(arg);
+    }
+
+    iptr = dst_bin + BYTE_OFFSET(bin_offset);
     bit_offset = BIT_OFFSET(bin_offset);
     if (is_small(arg)) {
-	Uint rbits = 8 - bit_offset;
-
-	if (num_bits == 0) {
-	    return 1;
-	} else if (bit_offset + num_bits <= 8) {
-	    /*
-	     * All bits are in the same byte.
-	     */
-	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-	    b |= (signed_val(arg) & ((1 << num_bits)-1)) << (rbits-num_bits);
-	    *iptr = b;
-	} else if (bit_offset == 0) {
-	    /*
-	     * More than one bit, starting at a byte boundary.
-	     */
-            iptr = dst_bin + BYTE_OFFSET(bin_offset);
-            fmt_small(iptr, NBYTES(num_bits), arg, num_bits, flags);
-	} else if (flags & BSF_LITTLE) {
+        Uint rbits = 8 - bit_offset;
+
+        if (bit_offset == 0) {
+            /* Aligned on a byte boundary. */
+            if (num_bits <= 8) {
+                /* All bits are in the same byte. */
+                b = (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+                *iptr = b;
+            } else {
+                /* More than one byte. */
+                fmt_small_le(iptr, arg, num_bits);
+            }
+        } else if (bit_offset + num_bits <= 8) {
+            /*
+             * All bits are in the same byte.
+             */
+            b = *iptr & (0xff << rbits);
+            b |= (signed_val(arg) & MAKE_MASK(num_bits)) << (rbits-num_bits);
+            *iptr = b;
+        } else if (BIT_OFFSET(num_bits) == 0) {
             /*
              * Little endian small in more than one byte, not
-             * aligned on a byte boundary.
+             * aligned on a byte boundary. The size is evenly
+             * divisible by 8, which means that there will be
+             * one partial byte, followed by zero or more
+             * complete bytes, followed by a final partial byte.
              */
             Sint val = signed_val(arg);
             Uint rshift = bit_offset;
@@ -772,92 +972,51 @@ erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
             Uint count = (num_bits - rbits) / 8;
             Uint bits, bits1;
 
-            iptr = dst_bin + BYTE_OFFSET(bin_offset);
-
-            if (BIT_OFFSET(num_bits) == 0) {
-                bits = val;
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
-                val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    bits = val & 0xff;
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                *iptr = bits << lshift;
-            } else {
-                Sint num_bytes = NBYTES(num_bits) - 1;
-                Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-
-                if (num_bytes-- > 0) {
-                    bits = val;
-                } else {
-                    bits = (val << (8 - BIT_OFFSET(num_bits)));
-                }
-                bits1 = bits >> rshift;
-                *iptr = MASK_BITS(bits1, *iptr, lmask);
-                iptr++;
+            /* Handle the first partial byte. */
+            bits = val;
+            bits1 = bits >> rshift;
+            *iptr = MASK_BITS(bits1, *iptr, lmask);
+            iptr++;
+            val >>= 8;
+
+            /* Handle all complete bytes. */
+            while (count--) {
+                bits1 = bits << lshift;
+                bits = val & 0xff;
+                *iptr++ = bits1 | (bits >> rshift);
                 val >>= 8;
-
-                while (count--) {
-                    bits1 = bits << lshift;
-                    if (num_bytes-- > 0) {
-                        bits = val & 0xff;
-                    } else {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                    }
-                    *iptr++ = bits1 | (bits >> rshift);
-                    val >>= 8;
-                }
-
-                if (deoffs) {
-                    bits1 = bits << lshift;
-                    if (rshift < deoffs) {
-                        bits = (val << (8 - BIT_OFFSET(num_bits))) & 0xff;
-                        bits1 |= bits >> rshift;
-                    }
-                    *iptr = bits1;
-                }
             }
-	} else {		/* Big endian */
-	    /*
-	     * Big-endian, more than one byte, but not aligned on a byte boundary.
-	     * Handle the bits up to the next byte boundary specially,
-	     * then let fmt_int() handle the rest.
-	     */
-	    Uint shift_count = num_bits - rbits;
-	    Sint val = signed_val(arg);
-	    iptr = dst_bin + BYTE_OFFSET(bin_offset);
-	    b = *iptr & (0xff << rbits);
-
-	    /*
-	     * Shifting with a shift count greater than or equal to the word
-	     * size may be a no-op (instead of 0 the result may be the unshifted
-	     * value). Therefore, only do the shift and the OR if the shift count
-	     * is less than the word size if the number is positive; if negative,
-	     * we must simulate the sign extension.
-	     */
-	    if (shift_count < sizeof(Uint)*8) {
-		b |= (val >> shift_count) & ((1 << rbits) - 1);
-	    } else if (val < 0) {
-		/* Simulate sign extension. */
-		b |= (-1) & ((1 << rbits) - 1);
-	    }
-	    *iptr++ = b;
 
-            fmt_small(iptr, NBYTES(num_bits-rbits), arg, num_bits-rbits, flags);
-	}
+            /* Handle the final partial byte. */
+            *iptr = bits << lshift;
+        } else {
+            /*
+             * Little endian small in more than one byte, not aligned
+             * on a byte boundary, and the size is not evenly
+             * divisible by 8.
+             *
+             * Now this gets complicated. We used to handle this
+             * directly, but since this case is presumably uncommon,
+             * we do this in a simpler way in two steps.
+             *
+             * First format the integer byte-aligned using the binary
+             * itself as a temporary buffer.
+             */
+            b = *iptr;
+            fmt_small_le(iptr, arg, num_bits);
+
+            /*
+             * Now restore the overwritten bits of the first byte and
+             * shift everything to the right.
+             */
+            restore_and_shift(iptr, b, bit_offset, num_bits);
+        }
     } else if (is_big(arg) && bit_offset == 0) {
-	/*
-	 * Big number, aligned on a byte boundary. We can format the
-	 * integer directly into the binary.
-	 */
-	fmt_big(dst_bin + BYTE_OFFSET(bin_offset),
-                NBYTES(num_bits), arg, num_bits, flags);
+        /*
+         * Big number, aligned on a byte boundary. We can format the
+         * integer directly into the binary.
+         */
+        fmt_big_le(iptr, arg, num_bits);
     } else if (is_big(arg) && bit_offset + num_bits <= 8) {
         /*
          * All bits are in the same byte.
@@ -867,55 +1026,24 @@ erts_bs_put_integer(ErlBitsState *EBS, Eterm arg, Uint num_bits, unsigned flags)
         ErtsDigit* dp = big_v(arg);
         Uint val = sign ? -*dp : *dp;
 
-        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr & (0xff << rbits);
-        b |= (val & ((1 << num_bits)-1)) << (rbits-num_bits);
+        b |= (val & MAKE_MASK(num_bits)) << (rbits-num_bits);
         *iptr = b;
     } else if (is_big(arg)) {
         /*
          * Big number, not aligned on a byte boundary.
-         */
-        Uint rshift = bit_offset;
-        Uint lshift = 8 - bit_offset;
-        Uint deoffs = BIT_OFFSET(bit_offset + num_bits);
-        Uint lmask = MAKE_MASK(8 - bit_offset);
-        Uint rmask = (deoffs) ? (MAKE_MASK(deoffs)<<(8-deoffs)) : 0;
-        Uint count = (num_bits - lshift) / 8;
-        Uint bits, bits1;
-
-        ASSERT(num_bits - lshift >= 0);
-
-        /*
+         *
          * Format the integer byte-aligned using the binary itself as
          * a temporary buffer.
          */
-        iptr = dst_bin + BYTE_OFFSET(bin_offset);
         b = *iptr;
-        fmt_big(iptr, NBYTES(num_bits), arg, num_bits, flags);
+        fmt_big_le(iptr, arg, num_bits);
 
         /*
          * Now restore the overwritten bits of the first byte and
          * shift everything to the right.
          */
-        bits = *iptr;
-        bits1 = bits >> rshift;
-        *iptr = MASK_BITS(bits1, b, lmask);
-        iptr++;
-
-        while (count--) {
-            bits1 = bits << lshift;
-            bits = *iptr;
-            *iptr++ = bits1 | (bits >> rshift);
-        }
-
-        if (rmask) {
-            bits1 = bits << lshift;
-            if ((rmask << rshift) & 0xff) {
-                bits = *iptr;
-                bits1 |= (bits >> rshift);
-            }
-            *iptr = MASK_BITS(bits1, *iptr, rmask);
-        }
+        restore_and_shift(iptr, b, bit_offset, num_bits);
     } else {
         /* Not an integer. */
         return 0;
diff --git a/erts/emulator/beam/erl_bits.h b/erts/emulator/beam/erl_bits.h
index 0700fb01f053..72e45143a0bf 100644
--- a/erts/emulator/beam/erl_bits.h
+++ b/erts/emulator/beam/erl_bits.h
@@ -216,8 +216,8 @@ Eterm erts_bs_get_binary_2(Process *p, Uint num_bits, unsigned flags, ErlSubBits
 Eterm erts_bs_get_binary_all_2(Process *p, ErlSubBits* sb);
 
 /* Binary construction, new instruction set. */
-int erts_bs_put_integer(ErlBitsState *EBS, Eterm Integer, Uint num_bits,
-                        unsigned flags);
+int erts_bs_put_integer_be(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
+int erts_bs_put_integer_le(ErlBitsState *EBS, Eterm Integer, Uint num_bits);
 #if !defined(BEAMASM)
 int erts_bs_put_utf8(ErlBitsState *EBS, Eterm Integer);
 #endif
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp
index e0080466c47f..d28c40321cce 100644
--- a/erts/emulator/beam/jit/arm/instr_bs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_bs.cpp
@@ -2496,10 +2496,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                      * integer into the binary. */
                     load_erl_bits_state(ARG1);
                     mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
 
                     emit_enter_runtime(Live.get());
-                    runtime_call<4>(erts_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     emit_leave_runtime(Live.get());
 
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
@@ -2563,10 +2566,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             load_erl_bits_state(ARG1);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
 
             emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             emit_leave_runtime(Live.get());
 
             if (Fail.get() == 0) {
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp
index b43351b871ba..4a7f54771750 100644
--- a/erts/emulator/beam/jit/x86/instr_bs.cpp
+++ b/erts/emulator/beam/jit/x86/instr_bs.cpp
@@ -2599,9 +2599,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                      * integer into the binary. */
                     runtime_entered = bs_maybe_enter_runtime(runtime_entered);
                     mov_arg(ARG2, seg.src);
-                    mov_imm(ARG4, seg.flags);
                     load_erl_bits_state(ARG1);
-                    runtime_call<4>(erts_bs_put_integer);
+                    if (seg.flags & BSF_LITTLE) {
+                        runtime_call<3>(erts_bs_put_integer_le);
+                    } else {
+                        runtime_call<3>(erts_bs_put_integer_be);
+                    }
                     if (exact_type<BeamTypeId::Integer>(seg.src)) {
                         comment("skipped test for success because construction "
                                 "can't fail");
@@ -2660,9 +2663,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             runtime_entered = bs_maybe_enter_runtime(runtime_entered);
             mov_arg(ARG2, seg.src);
             mov_imm(ARG3, 4 * 8);
-            a.mov(ARG4, seg.flags);
             load_erl_bits_state(ARG1);
-            runtime_call<4>(erts_bs_put_integer);
+            if (seg.flags & BSF_LITTLE) {
+                runtime_call<3>(erts_bs_put_integer_le);
+            } else {
+                runtime_call<3>(erts_bs_put_integer_be);
+            }
             if (Fail.get() == 0) {
                 mov_arg(ARG1, seg.src);
                 mov_imm(ARG4,