lib: drop bytestring builder, use unsafeCreate + lookup tables - base16 - Pure Haskell base16 encoding/decoding (docs.ppad.tech/base16).

commit 172805a1500ccf81594d82cd189dfc81a562d43a
parent 58dfb7922401a60d5de76825fcd5f6ecbcd7afe0
Author: Jared Tobin <jared@jtobin.io>
Date:   Sat, 16 May 2026 11:27:12 -0230

lib: drop bytestring builder, use unsafeCreate + lookup tables

encode and decode now allocate the output buffer once with the exact
final size and fill it via direct Ptr writes, instead of accumulating
a Builder and collapsing it with toLazyByteString/toStrict.

- 512-byte enc_tab CAF, all-ASCII so the bytestring IsString rule
  rewrites it to unsafePackAddress and the bytes live in rodata
- 256-byte dec_tab CAF mapping byte -> nibble, with 0x80 as the
  invalid sentinel; OR-accumulator gives branchless validity, checked
  once after the loop
- encode body is unsafeCreate (2*l) + a per-byte Ptr loop
- decode body is mallocByteString (l/2) + unsafeDupablePerformIO

Resulting Core is a joinrec with Int#/Word8#/State# args calling
readWord8OffAddr# and writeWord8OffAddr# directly; no allocation in
the hot path.

On 1 KiB inputs (aarch64, GHC 9.10.3, LLVM 19, -f+llvm):
- encode: 3032 ns -> 296 ns   (10.2x)
- decode:  596 ns -> 394 ns   ( 1.5x)
- encode alloc: 49,696 B ->  3,872 B
- decode alloc: 17,960 B ->  3,992 B

Allocation now matches the C-backed base16-bytestring on both paths.
Existing tasty suite (5000 QC cases x 3 properties + uppercase HUnit)
passes unchanged.

Diffstat:
M lib/Data/ByteString/Base16.hs  | 353 +++++++++++++++++++++----------------------------------------------------------

1 file changed, 92 insertions(+), 261 deletions(-)
diff --git a/lib/Data/ByteString/Base16.hs b/lib/Data/ByteString/Base16.hs
@@ -1,7 +1,5 @@
 {-# OPTIONS_HADDOCK prune #-}
-{-# LANGUAGE ApplicativeDo #-}
 {-# LANGUAGE BangPatterns #-}
-{-# LANGUAGE BinaryLiterals #-}
 {-# LANGUAGE OverloadedStrings #-}
 
 -- |
@@ -20,116 +18,84 @@ module Data.ByteString.Base16 (
 import qualified Data.Bits as B
 import Data.Bits ((.&.), (.|.))
 import qualified Data.ByteString as BS
-import qualified Data.ByteString.Builder as BSB
-import qualified Data.ByteString.Builder.Extra as BE
 import qualified Data.ByteString.Internal as BI
-import qualified Data.ByteString.Unsafe as BU
-import Data.Word (Word8, Word16)
-
-to_strict :: BSB.Builder -> BS.ByteString
-to_strict = BS.toStrict . BSB.toLazyByteString
-{-# INLINE to_strict #-}
-
-to_strict_small :: BSB.Builder -> BS.ByteString
-to_strict_small = BS.toStrict
-  . BE.toLazyByteStringWith (BE.safeStrategy 128 BE.smallChunkSize) mempty
-{-# INLINE to_strict_small #-}
+import Data.Word (Word8)
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (Ptr, plusPtr)
+import Foreign.Storable (peekElemOff, pokeElemOff)
+import System.IO.Unsafe (unsafeDupablePerformIO)
 
 fi :: (Num a, Integral b) => b -> a
 fi = fromIntegral
 {-# INLINE fi #-}
 
-hex_charset :: BS.ByteString
-hex_charset = "0123456789abcdef"
-
-expand_w8 :: Word8 -> Word16
-expand_w8 b =
-  let !hi = BU.unsafeIndex hex_charset (fi b `B.shiftR` 4)
-      !lo = BU.unsafeIndex hex_charset (fi b .&. 0b00001111)
-  in      fi hi `B.shiftL` 8
-      .|. fi lo
-{-# INLINE expand_w8 #-}
+-- 512-byte table.  Bytes [2k] and [2k+1] are the two lowercase ASCII
+-- hex characters representing the value k.  All-ASCII content means
+-- the bytestring 'IsString' rule rewrites this to 'unsafePackAddress'
+-- and the bytes live in static rodata.
+enc_tab :: BS.ByteString
+enc_tab =
+  "000102030405060708090a0b0c0d0e0f\
+  \101112131415161718191a1b1c1d1e1f\
+  \202122232425262728292a2b2c2d2e2f\
+  \303132333435363738393a3b3c3d3e3f\
+  \404142434445464748494a4b4c4d4e4f\
+  \505152535455565758595a5b5c5d5e5f\
+  \606162636465666768696a6b6c6d6e6f\
+  \707172737475767778797a7b7c7d7e7f\
+  \808182838485868788898a8b8c8d8e8f\
+  \909192939495969798999a9b9c9d9e9f\
+  \a0a1a2a3a4a5a6a7a8a9aaabacadaeaf\
+  \b0b1b2b3b4b5b6b7b8b9babbbcbdbebf\
+  \c0c1c2c3c4c5c6c7c8c9cacbcccdcecf\
+  \d0d1d2d3d4d5d6d7d8d9dadbdcdddedf\
+  \e0e1e2e3e4e5e6e7e8e9eaebecedeeef\
+  \f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"
+{-# NOINLINE enc_tab #-}
+
+-- 256-byte table.  Index by an ASCII byte to obtain its nibble value
+-- (0..15); invalid bytes map to 0x80.  Built once at first use, with
+-- no intermediate list (which would balloon CAF allocation).
+dec_tab :: BS.ByteString
+dec_tab = BI.unsafeCreate 256 $ \p ->
+  let go !i
+        | i == 256  = pure ()
+        | otherwise = pokeElemOff p i (nib i) >> go (i + 1)
+  in  go 0
+  where
+    nib :: Int -> Word8
+    nib c
+      | c >= 0x30 && c <= 0x39 = fi (c - 0x30) -- '0'..'9'
+      | c >= 0x41 && c <= 0x46 = fi (c - 0x37) -- 'A'..'F'
+      | c >= 0x61 && c <= 0x66 = fi (c - 0x57) -- 'a'..'f'
+      | otherwise              = 0x80
+{-# NOINLINE dec_tab #-}
 
 -- | Encode a base256 'ByteString' as base16.
 --
 --   >>> encode "hello world"
 --   "68656c6c6f20776f726c64"
 encode :: BS.ByteString -> BS.ByteString
-encode bs@(BI.PS _ _ l)
-    | l < 64    = to_strict_small loop
-    | otherwise = to_strict loop
-  where
-    -- writing as few words as possible requires performing some length
-    -- checks up front
-    loop
-      | l `rem` 4 == 0 =
-               go64 bs
-      | (l - 3) `rem` 4 == 0 = case BS.splitAt (l - 3) bs of
-          (chunk, etc) ->
-               go64 chunk
-            <> go32 (BU.unsafeTake 2 etc)
-            <> go16 (BU.unsafeDrop 2 etc)
-      | (l - 2) `rem` 4 == 0 = case BS.splitAt (l - 2) bs of
-          (chunk, etc) ->
-               go64 chunk
-            <> go32 etc
-      | (l - 1) `rem` 4 == 0 = case BS.splitAt (l - 1) bs of
-          (chunk, etc) ->
-               go64 chunk
-            <> go16 etc
-
-      | l `rem` 2 == 0 =
-               go32 bs
-      | (l - 1) `rem` 2 == 0 = case BS.splitAt (l - 1) bs of
-          (chunk, etc) ->
-               go32 chunk
-            <> go16 etc
-
-      | otherwise =
-               go16 bs
-
-    go64 b = case BS.splitAt 4 b of
-      (chunk, etc)
-        | BS.null chunk -> mempty
-        | otherwise ->
-            let !w16_0 = expand_w8 (BU.unsafeIndex chunk 0)
-                !w16_1 = expand_w8 (BU.unsafeIndex chunk 1)
-                !w16_2 = expand_w8 (BU.unsafeIndex chunk 2)
-                !w16_3 = expand_w8 (BU.unsafeIndex chunk 3)
-
-                !w64 = fi w16_0 `B.shiftL` 48
-                   .|. fi w16_1 `B.shiftL` 32
-                   .|. fi w16_2 `B.shiftL` 16
-                   .|. fi w16_3
-
-            in  BSB.word64BE w64 <> go64 etc
-
-    go32 b = case BS.splitAt 2 b of
-      (chunk, etc)
-        | BS.null chunk -> mempty
-        | otherwise ->
-            let !w16_0 = expand_w8 (BU.unsafeIndex chunk 0)
-                !w16_1 = expand_w8 (BU.unsafeIndex chunk 1)
-
-                !w32 = fi w16_0 `B.shiftL` 16
-                   .|. fi w16_1
-
-            in  BSB.word32BE w32 <> go32 etc
-
-    go16 b = case BS.uncons b of
-      Nothing -> mempty
-      Just (h, t) ->
-        let !w16 = expand_w8 h
-        in  BSB.word16BE w16 <> go16 t
-
--- word8 hex character to word4
-word4 :: Word8 -> Maybe Word8
-word4 c
-  | c > 47 && c < 58  = pure $! c - 48 -- 0-9
-  | c > 64 && c < 71  = pure $! c - 55 -- A-F
-  | c > 96 && c < 103 = pure $! c - 87 -- a-f
-  | otherwise         = Nothing
-{-# INLINE word4 #-}
+encode (BI.PS sfp soff l) =
+  case enc_tab of
+    BI.PS tfp toff _ ->
+      BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
+        withForeignPtr sfp $ \sp0 ->
+        withForeignPtr tfp $ \tp0 -> do
+          let !sp = sp0 `plusPtr` soff :: Ptr Word8
+              !tp = tp0 `plusPtr` toff :: Ptr Word8
+              loop !i
+                | i == l    = pure ()
+                | otherwise = do
+                    b <- peekElemOff sp i
+                    let !j = fi b `B.shiftL` 1
+                        !o = i `B.shiftL` 1
+                    hi <- peekElemOff tp j
+                    lo <- peekElemOff tp (j + 1)
+                    pokeElemOff dst  o      hi
+                    pokeElemOff dst (o + 1) lo
+                    loop (i + 1)
+          loop 0
 
 -- | Decode a base16 'ByteString' to base256.
 --
@@ -141,164 +107,29 @@ word4 c
 --   >>> decode "068656c6c6f20776f726c64" -- odd-length
 --   Nothing
 decode :: BS.ByteString -> Maybe BS.ByteString
-decode bs@(BI.PS _ _ l)
-    | B.testBit l 0    = Nothing
-    | l `quot` 2 < 128 = fmap to_strict_small loop
-    | otherwise        = fmap to_strict loop
-  where
-    -- same story, but we need more checks
-    loop
-      | l `rem` 16 == 0 =
-            go64 mempty bs
-      | (l - 2) `rem` 16 == 0 = case BS.splitAt (l - 2) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            go8 b0 etc
-      | (l - 4) `rem` 16 == 0 = case BS.splitAt (l - 4) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            go16 b0 etc
-      | (l - 6) `rem` 16 == 0 = case BS.splitAt (l - 6) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            b1 <- go16 b0 (BU.unsafeTake 4 etc)
-            go8 b1 (BU.unsafeDrop 4 etc)
-      | (l - 8) `rem` 16 == 0 = case BS.splitAt (l - 8) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            go32 b0 etc
-      | (l - 10) `rem` 16 == 0 = case BS.splitAt (l - 10) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            b1 <- go32 b0 (BU.unsafeTake 8 etc)
-            go8 b1 (BU.unsafeDrop 8 etc)
-      | (l - 12) `rem` 16 == 0 = case BS.splitAt (l - 12) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            b1 <- go32 b0 (BU.unsafeTake 8 etc)
-            go16 b1 (BU.unsafeDrop 8 etc)
-      | (l - 14) `rem` 16 == 0 = case BS.splitAt (l - 14) bs of
-          (chunk, etc) -> do
-            b0 <- go64 mempty chunk
-            b1 <- go32 b0 (BU.unsafeTake 8 etc)
-            b2 <- go16 b1 (BU.unsafeTake 4 (BU.unsafeDrop 8 etc))
-            go8 b2 (BU.unsafeDrop 12 etc)
-
-      | l `rem` 8 == 0 =
-            go32 mempty bs
-      | (l - 2) `rem` 8 == 0 = case BS.splitAt (l - 2) bs of
-          (chunk, etc) -> do
-            b0 <- go32 mempty chunk
-            go8 b0 etc
-      | (l - 4) `rem` 8 == 0 = case BS.splitAt (l - 4) bs of
-          (chunk, etc) -> do
-            b0 <- go32 mempty chunk
-            go16 b0 etc
-      | (l - 6) `rem` 8 == 0 = case BS.splitAt (l - 6) bs of
-          (chunk, etc) -> do
-            b0 <- go32 mempty chunk
-            b1 <- go16 b0 (BU.unsafeTake 4 etc)
-            go8 b1  (BU.unsafeDrop 4 etc)
-
-      | l `rem` 4 == 0 =
-            go16 mempty bs
-      | (l - 2) `rem` 4 == 0 = case BS.splitAt (l - 2) bs of
-          (chunk, etc) -> do
-            b0 <- go16 mempty chunk
-            go8 b0 etc
-
-      | otherwise =
-            go8 mempty bs
-
-    go64 acc b = case BS.splitAt 16 b of
-      (chunk, etc)
-        | BS.null chunk -> pure acc
-        | otherwise -> do
-            !w4_00 <- word4 (BU.unsafeIndex chunk 00)
-            !w4_01 <- word4 (BU.unsafeIndex chunk 01)
-            !w4_02 <- word4 (BU.unsafeIndex chunk 02)
-            !w4_03 <- word4 (BU.unsafeIndex chunk 03)
-            !w4_04 <- word4 (BU.unsafeIndex chunk 04)
-            !w4_05 <- word4 (BU.unsafeIndex chunk 05)
-            !w4_06 <- word4 (BU.unsafeIndex chunk 06)
-            !w4_07 <- word4 (BU.unsafeIndex chunk 07)
-            !w4_08 <- word4 (BU.unsafeIndex chunk 08)
-            !w4_09 <- word4 (BU.unsafeIndex chunk 09)
-            !w4_10 <- word4 (BU.unsafeIndex chunk 10)
-            !w4_11 <- word4 (BU.unsafeIndex chunk 11)
-            !w4_12 <- word4 (BU.unsafeIndex chunk 12)
-            !w4_13 <- word4 (BU.unsafeIndex chunk 13)
-            !w4_14 <- word4 (BU.unsafeIndex chunk 14)
-            !w4_15 <- word4 (BU.unsafeIndex chunk 15)
-
-            let !w64 = fi w4_00 `B.shiftL` 60
-                   .|. fi w4_01 `B.shiftL` 56
-                   .|. fi w4_02 `B.shiftL` 52
-                   .|. fi w4_03 `B.shiftL` 48
-                   .|. fi w4_04 `B.shiftL` 44
-                   .|. fi w4_05 `B.shiftL` 40
-                   .|. fi w4_06 `B.shiftL` 36
-                   .|. fi w4_07 `B.shiftL` 32
-                   .|. fi w4_08 `B.shiftL` 28
-                   .|. fi w4_09 `B.shiftL` 24
-                   .|. fi w4_10 `B.shiftL` 20
-                   .|. fi w4_11 `B.shiftL` 16
-                   .|. fi w4_12 `B.shiftL` 12
-                   .|. fi w4_13 `B.shiftL` 08
-                   .|. fi w4_14 `B.shiftL` 04
-                   .|. fi w4_15
-
-            go64 (acc <> BSB.word64BE w64) etc
-
-    go32 acc b = case BS.splitAt 8 b of
-      (chunk, etc)
-        | BS.null chunk -> pure acc
-        | otherwise -> do
-            !w4_00 <- word4 (BU.unsafeIndex chunk 00)
-            !w4_01 <- word4 (BU.unsafeIndex chunk 01)
-            !w4_02 <- word4 (BU.unsafeIndex chunk 02)
-            !w4_03 <- word4 (BU.unsafeIndex chunk 03)
-            !w4_04 <- word4 (BU.unsafeIndex chunk 04)
-            !w4_05 <- word4 (BU.unsafeIndex chunk 05)
-            !w4_06 <- word4 (BU.unsafeIndex chunk 06)
-            !w4_07 <- word4 (BU.unsafeIndex chunk 07)
-
-            let !w32 = fi w4_00 `B.shiftL` 28
-                   .|. fi w4_01 `B.shiftL` 24
-                   .|. fi w4_02 `B.shiftL` 20
-                   .|. fi w4_03 `B.shiftL` 16
-                   .|. fi w4_04 `B.shiftL` 12
-                   .|. fi w4_05 `B.shiftL` 08
-                   .|. fi w4_06 `B.shiftL` 04
-                   .|. fi w4_07
-
-            go32 (acc <> BSB.word32BE w32) etc
-
-    go16 acc b = case BS.splitAt 4 b of
-      (chunk, etc)
-        | BS.null chunk -> pure acc
-        | otherwise -> do
-            !w4_00 <- word4 (BU.unsafeIndex chunk 00)
-            !w4_01 <- word4 (BU.unsafeIndex chunk 01)
-            !w4_02 <- word4 (BU.unsafeIndex chunk 02)
-            !w4_03 <- word4 (BU.unsafeIndex chunk 03)
-
-            let !w16 = fi w4_00 `B.shiftL` 12
-                   .|. fi w4_01 `B.shiftL` 08
-                   .|. fi w4_02 `B.shiftL` 04
-                   .|. fi w4_03
-
-            go16 (acc <> BSB.word16BE w16) etc
-
-    go8 acc b  = case BS.splitAt 2 b of
-      (chunk, etc)
-        | BS.null chunk -> pure acc
-        | otherwise -> do
-            !w4_00 <- word4 (BU.unsafeIndex chunk 00)
-            !w4_01 <- word4 (BU.unsafeIndex chunk 01)
-
-            let !w8 = fi w4_00 `B.shiftL` 04
-                  .|. fi w4_01
-
-            go8 (acc <> BSB.word8 w8) etc
-
+decode (BI.PS sfp soff l)
+  | B.testBit l 0 = Nothing
+  | otherwise = case dec_tab of
+      BI.PS tfp toff _ -> unsafeDupablePerformIO $ do
+        let !n = l `B.shiftR` 1
+        fp <- BI.mallocByteString n
+        ok <- withForeignPtr fp  $ \dst ->
+              withForeignPtr sfp $ \sp0 ->
+              withForeignPtr tfp $ \tp0 -> do
+                let !sp = sp0 `plusPtr` soff :: Ptr Word8
+                    !tp = tp0 `plusPtr` toff :: Ptr Word8
+                    loop !i !acc
+                      | i == n    =
+                          pure $! acc .&. 0x80 == 0
+                      | otherwise = do
+                          let !o = i `B.shiftL` 1
+                          c0 <- peekElemOff sp  o
+                          c1 <- peekElemOff sp (o + 1)
+                          n0 <- peekElemOff tp (fi c0)
+                          n1 <- peekElemOff tp (fi c1)
+                          let !b = (n0 `B.shiftL` 4)
+                               .|. (n1 .&. 0x0f)
+                          pokeElemOff dst i b
+                          loop (i + 1) (acc .|. n0 .|. n1)
+                loop 0 0
+        pure $! if ok then Just (BI.PS fp 0 n) else Nothing

	base16 Pure Haskell base16 encoding/decoding (docs.ppad.tech/base16).
	git clone git://git.ppad.tech/base16.git
	Log \| Files \| Refs \| README \| LICENSE