commit ca6d2c9ff1f93bcc469176c75a85dc3918bcd3c6
parent 58dfb7922401a60d5de76825fcd5f6ecbcd7afe0
Author: Jared Tobin <jared@jtobin.io>
Date: Sat, 16 May 2026 12:02:24 -0230
Merge branch 'perf-refactor'
Refactor Data.ByteString.Base16 away from bytestring builders to
direct buffer allocation, with both lookup tables in static rodata.
Public API unchanged (encode, decode); existing tasty suite (5000 QC
cases x 3 properties + uppercase HUnit) passes throughout.
Commits:
* 172805a lib: drop bytestring builder, use unsafeCreate + lookup
tables
encode now allocates output via 'BI.unsafeCreate (2*l)' and writes
via direct 'Ptr' pokes, indexed through a 512-byte 'enc_tab' CAF
(all-ASCII, so the bytestring 'IsString' rewrite rule places it in
static rodata). decode uses 'mallocByteString (l/2)' +
'unsafeDupablePerformIO', a 256-byte 'dec_tab' CAF, and a branchless
OR-accumulator for validity (checked once at the end). Resulting
Core is a 'joinrec' with 'Int#'/'Word8#'/'State#' args calling
'readWord8OffAddr#' and 'writeWord8OffAddr#' directly — no
allocation in the hot path.
* a19b50d lib: put dec_tab in static rodata too
'dec_tab' was being built once at first use via 'BI.unsafeCreate 256'
because the original 0x80 invalid-byte sentinel was non-ASCII (so a
literal would have gone through 'unpackCStringUtf8#' and missed the
static-rodata rewrite rule). Choosing a sentinel that stays ASCII
and avoids embedded NUL — valid nibbles 0..15 -> 0x10..0x1f,
invalid -> 0x20 — lets the literal compile via 'unpackCString#'
and land in rodata next to 'enc_tab'. Validity becomes
'acc .&. 0x20 == 0'; the decode-byte math is unchanged because
'(n0 `shiftL` 4)' in 'Word8' naturally drops bit 4.
* 53523b5 lib: rewrite dec_tab comment to stand on its own
Previous wording read as a diff against an earlier version of the
table rather than a description of the code as it stands.
* cf4585f lib: simplify encode inner loop with Word16 view
The two-byte block at 'enc_tab[2*b]' and the two-byte block at
'dst[2*i]' share the same byte layout in memory, so we can read
'enc_tab' as 'Ptr Word16' and write 'dst' as 'Ptr Word16',
shuffling 16 bits between the two locations without ever
inspecting their numerical value — endianness-safe by construction.
Drops the explicit 'j'/'o' offset arithmetic and 'hi'/'lo'
interleaving from the hot path. No measurable wall-time change
on Apple Silicon; kept for readability.
* dd0e35d readme: perf update
Performance section rewritten with current numbers from an M4
MacBook Air (GHC 9.10.3, LLVM 19, -fllvm).
On 1 KiB inputs, M4 MacBook Air, GHC 9.10.3 + LLVM 19, -fllvm:
encode time: 3032 ns -> 296 ns (~10.2x)
decode time: 596 ns -> 271 ns (~ 2.2x)
encode alloc: 49,696 B -> 3,872 B
decode alloc: 17,960 B -> 3,872 B
Allocation now matches the C-backed 'base16-bytestring' (3,824 B
encode / 3,848 B decode). Both functions run at roughly one cycle
per input byte — near the practical floor for a scalar loop with a
dependent table load.
Diffstat:
2 files changed, 123 insertions(+), 276 deletions(-)
diff --git a/README.md b/README.md
@@ -31,26 +31,23 @@ Haddocks (API documentation, etc.) are hosted at
## Performance
-The aim is best-in-class performance for pure, highly-auditable Haskell
-code. We could go slightly faster by using direct allocation and writes,
-but we get pretty close to the best impure versions with only builders.
+The aim is best-in-class performance.
-Current benchmark figures on 1kb inputs on a relatively-beefy NixOS VPS look
-like (use `cabal bench` to run the benchmark suite):
+Current benchmark figures on 1kb inputs on my M4 MacBook Air look like
+(use `cabal bench -fllvm` to run the benchmark suite):
```
- benchmarking ppad-base16
- time 2.997 μs (2.988 μs .. 3.009 μs)
+ benchmarking ppad-base16/encode
+ time 295.9 ns (295.4 ns .. 296.4 ns)
+ 1.000 R² (0.999 R² .. 1.000 R²)
+ mean 296.8 ns (296.4 ns .. 297.2 ns)
+ std dev 1.367 ns (1.181 ns .. 1.619 ns)
+
+ benchmarking ppad-base16/decode
+ time 270.8 ns (270.6 ns .. 271.1 ns)
1.000 R² (1.000 R² .. 1.000 R²)
- mean 3.024 μs (3.012 μs .. 3.035 μs)
- std dev 39.14 ns (36.12 ns .. 42.87 ns)
- variance introduced by outliers: 11% (moderately inflated)
-
- benchmarking ppad-base16
- time 599.0 ns (597.3 ns .. 601.3 ns)
- 1.000 R² (1.000 R² .. 1.000 R²)
- mean 597.3 ns (596.3 ns .. 598.5 ns)
- std dev 3.493 ns (2.623 ns .. 5.111 ns)
+ mean 270.9 ns (270.7 ns .. 271.1 ns)
+ std dev 627.8 ps (515.3 ps .. 766.1 ps)
```
## Security
diff --git a/lib/Data/ByteString/Base16.hs b/lib/Data/ByteString/Base16.hs
@@ -1,7 +1,5 @@
{-# OPTIONS_HADDOCK prune #-}
-{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE OverloadedStrings #-}
-- |
@@ -20,116 +18,103 @@ module Data.ByteString.Base16 (
import qualified Data.Bits as B
import Data.Bits ((.&.), (.|.))
import qualified Data.ByteString as BS
-import qualified Data.ByteString.Builder as BSB
-import qualified Data.ByteString.Builder.Extra as BE
import qualified Data.ByteString.Internal as BI
-import qualified Data.ByteString.Unsafe as BU
import Data.Word (Word8, Word16)
-
-to_strict :: BSB.Builder -> BS.ByteString
-to_strict = BS.toStrict . BSB.toLazyByteString
-{-# INLINE to_strict #-}
-
-to_strict_small :: BSB.Builder -> BS.ByteString
-to_strict_small = BS.toStrict
- . BE.toLazyByteStringWith (BE.safeStrategy 128 BE.smallChunkSize) mempty
-{-# INLINE to_strict_small #-}
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (Ptr, castPtr, plusPtr)
+import Foreign.Storable (peekElemOff, pokeElemOff)
+import System.IO.Unsafe (unsafeDupablePerformIO)
fi :: (Num a, Integral b) => b -> a
fi = fromIntegral
{-# INLINE fi #-}
-hex_charset :: BS.ByteString
-hex_charset = "0123456789abcdef"
-
-expand_w8 :: Word8 -> Word16
-expand_w8 b =
- let !hi = BU.unsafeIndex hex_charset (fi b `B.shiftR` 4)
- !lo = BU.unsafeIndex hex_charset (fi b .&. 0b00001111)
- in fi hi `B.shiftL` 8
- .|. fi lo
-{-# INLINE expand_w8 #-}
+-- 512-byte table. Bytes [2k] and [2k+1] are the two lowercase ASCII
+-- hex characters representing the value k. All-ASCII content means
+-- the bytestring 'IsString' rule rewrites this to 'unsafePackAddress'
+-- and the bytes live in static rodata.
+enc_tab :: BS.ByteString
+enc_tab =
+ "000102030405060708090a0b0c0d0e0f\
+ \101112131415161718191a1b1c1d1e1f\
+ \202122232425262728292a2b2c2d2e2f\
+ \303132333435363738393a3b3c3d3e3f\
+ \404142434445464748494a4b4c4d4e4f\
+ \505152535455565758595a5b5c5d5e5f\
+ \606162636465666768696a6b6c6d6e6f\
+ \707172737475767778797a7b7c7d7e7f\
+ \808182838485868788898a8b8c8d8e8f\
+ \909192939495969798999a9b9c9d9e9f\
+ \a0a1a2a3a4a5a6a7a8a9aaabacadaeaf\
+ \b0b1b2b3b4b5b6b7b8b9babbbcbdbebf\
+ \c0c1c2c3c4c5c6c7c8c9cacbcccdcecf\
+ \d0d1d2d3d4d5d6d7d8d9dadbdcdddedf\
+ \e0e1e2e3e4e5e6e7e8e9eaebecedeeef\
+ \f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"
+{-# NOINLINE enc_tab #-}
+
+-- 256-byte table. Index by an ASCII byte to obtain its nibble; valid
+-- hex chars ('0'..'9', 'a'..'f', 'A'..'F') map to 0x10..0x1f, every
+-- other byte maps to 0x20.
+--
+-- The encoding is chosen so the literal is strictly ASCII and contains
+-- no embedded NUL, which is what the bytestring 'IsString' rule needs
+-- to rewrite it into 'unsafePackAddress' (cf. 'enc_tab') — the bytes
+-- end up in static rodata, with no CAF allocation.
+--
+-- The 0x20 sentinel is distinguished by bit 5; no value 0x10..0x1f
+-- carries that bit, so 'decode' OR-folds every lookup into an
+-- accumulator and tests 'acc .&. 0x20 == 0' once at the end. The
+-- output byte is '(n0 `shiftL` 4) .|. (n1 .&. 0x0f)': in 'Word8' the
+-- shift naturally drops bit 4, and the mask isolates the low nibble.
+dec_tab :: BS.ByteString
+dec_tab =
+ "\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x20\x20\x20\x20\x20\x20\
+ \\x20\x1a\x1b\x1c\x1d\x1e\x1f\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x1a\x1b\x1c\x1d\x1e\x1f\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+ \\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20"
+{-# NOINLINE dec_tab #-}
-- | Encode a base256 'ByteString' as base16.
--
-- >>> encode "hello world"
-- "68656c6c6f20776f726c64"
encode :: BS.ByteString -> BS.ByteString
-encode bs@(BI.PS _ _ l)
- | l < 64 = to_strict_small loop
- | otherwise = to_strict loop
- where
- -- writing as few words as possible requires performing some length
- -- checks up front
- loop
- | l `rem` 4 == 0 =
- go64 bs
- | (l - 3) `rem` 4 == 0 = case BS.splitAt (l - 3) bs of
- (chunk, etc) ->
- go64 chunk
- <> go32 (BU.unsafeTake 2 etc)
- <> go16 (BU.unsafeDrop 2 etc)
- | (l - 2) `rem` 4 == 0 = case BS.splitAt (l - 2) bs of
- (chunk, etc) ->
- go64 chunk
- <> go32 etc
- | (l - 1) `rem` 4 == 0 = case BS.splitAt (l - 1) bs of
- (chunk, etc) ->
- go64 chunk
- <> go16 etc
-
- | l `rem` 2 == 0 =
- go32 bs
- | (l - 1) `rem` 2 == 0 = case BS.splitAt (l - 1) bs of
- (chunk, etc) ->
- go32 chunk
- <> go16 etc
-
- | otherwise =
- go16 bs
-
- go64 b = case BS.splitAt 4 b of
- (chunk, etc)
- | BS.null chunk -> mempty
- | otherwise ->
- let !w16_0 = expand_w8 (BU.unsafeIndex chunk 0)
- !w16_1 = expand_w8 (BU.unsafeIndex chunk 1)
- !w16_2 = expand_w8 (BU.unsafeIndex chunk 2)
- !w16_3 = expand_w8 (BU.unsafeIndex chunk 3)
-
- !w64 = fi w16_0 `B.shiftL` 48
- .|. fi w16_1 `B.shiftL` 32
- .|. fi w16_2 `B.shiftL` 16
- .|. fi w16_3
-
- in BSB.word64BE w64 <> go64 etc
-
- go32 b = case BS.splitAt 2 b of
- (chunk, etc)
- | BS.null chunk -> mempty
- | otherwise ->
- let !w16_0 = expand_w8 (BU.unsafeIndex chunk 0)
- !w16_1 = expand_w8 (BU.unsafeIndex chunk 1)
-
- !w32 = fi w16_0 `B.shiftL` 16
- .|. fi w16_1
-
- in BSB.word32BE w32 <> go32 etc
-
- go16 b = case BS.uncons b of
- Nothing -> mempty
- Just (h, t) ->
- let !w16 = expand_w8 h
- in BSB.word16BE w16 <> go16 t
-
--- word8 hex character to word4
-word4 :: Word8 -> Maybe Word8
-word4 c
- | c > 47 && c < 58 = pure $! c - 48 -- 0-9
- | c > 64 && c < 71 = pure $! c - 55 -- A-F
- | c > 96 && c < 103 = pure $! c - 87 -- a-f
- | otherwise = Nothing
-{-# INLINE word4 #-}
+encode (BI.PS sfp soff l) =
+ case enc_tab of
+ BI.PS tfp toff _ ->
+ BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
+ withForeignPtr sfp $ \sp0 ->
+ withForeignPtr tfp $ \tp0 -> do
+ -- read 'enc_tab' and write 'dst' as 'Word16' pairs. The
+ -- two-byte block at 'enc_tab[2*b]' and the two-byte block
+ -- at 'dst[2*i]' share the same byte layout in memory, so
+ -- this is endianness-safe: we never inspect the numerical
+ -- value of the 'Word16', we just shuffle 16 bits between
+ -- two locations.
+ let !sp = sp0 `plusPtr` soff :: Ptr Word8
+ !tp = tp0 `plusPtr` toff :: Ptr Word16
+ !dp = castPtr dst :: Ptr Word16
+ loop !i
+ | i == l = pure ()
+ | otherwise = do
+ b <- peekElemOff sp i
+ w <- peekElemOff tp (fi b)
+ pokeElemOff dp i (w :: Word16)
+ loop (i + 1)
+ loop 0
-- | Decode a base16 'ByteString' to base256.
--
@@ -141,164 +126,29 @@ word4 c
-- >>> decode "068656c6c6f20776f726c64" -- odd-length
-- Nothing
decode :: BS.ByteString -> Maybe BS.ByteString
-decode bs@(BI.PS _ _ l)
- | B.testBit l 0 = Nothing
- | l `quot` 2 < 128 = fmap to_strict_small loop
- | otherwise = fmap to_strict loop
- where
- -- same story, but we need more checks
- loop
- | l `rem` 16 == 0 =
- go64 mempty bs
- | (l - 2) `rem` 16 == 0 = case BS.splitAt (l - 2) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- go8 b0 etc
- | (l - 4) `rem` 16 == 0 = case BS.splitAt (l - 4) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- go16 b0 etc
- | (l - 6) `rem` 16 == 0 = case BS.splitAt (l - 6) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- b1 <- go16 b0 (BU.unsafeTake 4 etc)
- go8 b1 (BU.unsafeDrop 4 etc)
- | (l - 8) `rem` 16 == 0 = case BS.splitAt (l - 8) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- go32 b0 etc
- | (l - 10) `rem` 16 == 0 = case BS.splitAt (l - 10) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- b1 <- go32 b0 (BU.unsafeTake 8 etc)
- go8 b1 (BU.unsafeDrop 8 etc)
- | (l - 12) `rem` 16 == 0 = case BS.splitAt (l - 12) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- b1 <- go32 b0 (BU.unsafeTake 8 etc)
- go16 b1 (BU.unsafeDrop 8 etc)
- | (l - 14) `rem` 16 == 0 = case BS.splitAt (l - 14) bs of
- (chunk, etc) -> do
- b0 <- go64 mempty chunk
- b1 <- go32 b0 (BU.unsafeTake 8 etc)
- b2 <- go16 b1 (BU.unsafeTake 4 (BU.unsafeDrop 8 etc))
- go8 b2 (BU.unsafeDrop 12 etc)
-
- | l `rem` 8 == 0 =
- go32 mempty bs
- | (l - 2) `rem` 8 == 0 = case BS.splitAt (l - 2) bs of
- (chunk, etc) -> do
- b0 <- go32 mempty chunk
- go8 b0 etc
- | (l - 4) `rem` 8 == 0 = case BS.splitAt (l - 4) bs of
- (chunk, etc) -> do
- b0 <- go32 mempty chunk
- go16 b0 etc
- | (l - 6) `rem` 8 == 0 = case BS.splitAt (l - 6) bs of
- (chunk, etc) -> do
- b0 <- go32 mempty chunk
- b1 <- go16 b0 (BU.unsafeTake 4 etc)
- go8 b1 (BU.unsafeDrop 4 etc)
-
- | l `rem` 4 == 0 =
- go16 mempty bs
- | (l - 2) `rem` 4 == 0 = case BS.splitAt (l - 2) bs of
- (chunk, etc) -> do
- b0 <- go16 mempty chunk
- go8 b0 etc
-
- | otherwise =
- go8 mempty bs
-
- go64 acc b = case BS.splitAt 16 b of
- (chunk, etc)
- | BS.null chunk -> pure acc
- | otherwise -> do
- !w4_00 <- word4 (BU.unsafeIndex chunk 00)
- !w4_01 <- word4 (BU.unsafeIndex chunk 01)
- !w4_02 <- word4 (BU.unsafeIndex chunk 02)
- !w4_03 <- word4 (BU.unsafeIndex chunk 03)
- !w4_04 <- word4 (BU.unsafeIndex chunk 04)
- !w4_05 <- word4 (BU.unsafeIndex chunk 05)
- !w4_06 <- word4 (BU.unsafeIndex chunk 06)
- !w4_07 <- word4 (BU.unsafeIndex chunk 07)
- !w4_08 <- word4 (BU.unsafeIndex chunk 08)
- !w4_09 <- word4 (BU.unsafeIndex chunk 09)
- !w4_10 <- word4 (BU.unsafeIndex chunk 10)
- !w4_11 <- word4 (BU.unsafeIndex chunk 11)
- !w4_12 <- word4 (BU.unsafeIndex chunk 12)
- !w4_13 <- word4 (BU.unsafeIndex chunk 13)
- !w4_14 <- word4 (BU.unsafeIndex chunk 14)
- !w4_15 <- word4 (BU.unsafeIndex chunk 15)
-
- let !w64 = fi w4_00 `B.shiftL` 60
- .|. fi w4_01 `B.shiftL` 56
- .|. fi w4_02 `B.shiftL` 52
- .|. fi w4_03 `B.shiftL` 48
- .|. fi w4_04 `B.shiftL` 44
- .|. fi w4_05 `B.shiftL` 40
- .|. fi w4_06 `B.shiftL` 36
- .|. fi w4_07 `B.shiftL` 32
- .|. fi w4_08 `B.shiftL` 28
- .|. fi w4_09 `B.shiftL` 24
- .|. fi w4_10 `B.shiftL` 20
- .|. fi w4_11 `B.shiftL` 16
- .|. fi w4_12 `B.shiftL` 12
- .|. fi w4_13 `B.shiftL` 08
- .|. fi w4_14 `B.shiftL` 04
- .|. fi w4_15
-
- go64 (acc <> BSB.word64BE w64) etc
-
- go32 acc b = case BS.splitAt 8 b of
- (chunk, etc)
- | BS.null chunk -> pure acc
- | otherwise -> do
- !w4_00 <- word4 (BU.unsafeIndex chunk 00)
- !w4_01 <- word4 (BU.unsafeIndex chunk 01)
- !w4_02 <- word4 (BU.unsafeIndex chunk 02)
- !w4_03 <- word4 (BU.unsafeIndex chunk 03)
- !w4_04 <- word4 (BU.unsafeIndex chunk 04)
- !w4_05 <- word4 (BU.unsafeIndex chunk 05)
- !w4_06 <- word4 (BU.unsafeIndex chunk 06)
- !w4_07 <- word4 (BU.unsafeIndex chunk 07)
-
- let !w32 = fi w4_00 `B.shiftL` 28
- .|. fi w4_01 `B.shiftL` 24
- .|. fi w4_02 `B.shiftL` 20
- .|. fi w4_03 `B.shiftL` 16
- .|. fi w4_04 `B.shiftL` 12
- .|. fi w4_05 `B.shiftL` 08
- .|. fi w4_06 `B.shiftL` 04
- .|. fi w4_07
-
- go32 (acc <> BSB.word32BE w32) etc
-
- go16 acc b = case BS.splitAt 4 b of
- (chunk, etc)
- | BS.null chunk -> pure acc
- | otherwise -> do
- !w4_00 <- word4 (BU.unsafeIndex chunk 00)
- !w4_01 <- word4 (BU.unsafeIndex chunk 01)
- !w4_02 <- word4 (BU.unsafeIndex chunk 02)
- !w4_03 <- word4 (BU.unsafeIndex chunk 03)
-
- let !w16 = fi w4_00 `B.shiftL` 12
- .|. fi w4_01 `B.shiftL` 08
- .|. fi w4_02 `B.shiftL` 04
- .|. fi w4_03
-
- go16 (acc <> BSB.word16BE w16) etc
-
- go8 acc b = case BS.splitAt 2 b of
- (chunk, etc)
- | BS.null chunk -> pure acc
- | otherwise -> do
- !w4_00 <- word4 (BU.unsafeIndex chunk 00)
- !w4_01 <- word4 (BU.unsafeIndex chunk 01)
-
- let !w8 = fi w4_00 `B.shiftL` 04
- .|. fi w4_01
-
- go8 (acc <> BSB.word8 w8) etc
-
+decode (BI.PS sfp soff l)
+ | B.testBit l 0 = Nothing
+ | otherwise = case dec_tab of
+ BI.PS tfp toff _ -> unsafeDupablePerformIO $ do
+ let !n = l `B.shiftR` 1
+ fp <- BI.mallocByteString n
+ ok <- withForeignPtr fp $ \dst ->
+ withForeignPtr sfp $ \sp0 ->
+ withForeignPtr tfp $ \tp0 -> do
+ let !sp = sp0 `plusPtr` soff :: Ptr Word8
+ !tp = tp0 `plusPtr` toff :: Ptr Word8
+ loop !i !acc
+ | i == n =
+ pure $! acc .&. 0x20 == 0
+ | otherwise = do
+ let !o = i `B.shiftL` 1
+ c0 <- peekElemOff sp o
+ c1 <- peekElemOff sp (o + 1)
+ n0 <- peekElemOff tp (fi c0)
+ n1 <- peekElemOff tp (fi c1)
+ let !b = (n0 `B.shiftL` 4)
+ .|. (n1 .&. 0x0f)
+ pokeElemOff dst i b
+ loop (i + 1) (acc .|. n0 .|. n1)
+ loop 0 0
+ pure $! if ok then Just (BI.PS fp 0 n) else Nothing