commit 517c87e92b32302e2c705ba1b90141a36953f725
parent 17f5dd241bc4ad200676b2b30b72cd6718cedb3b
Author: Jared Tobin <jared@jtobin.io>
Date: Sat, 16 May 2026 12:19:24 -0230
lib: dispatch encode/decode to ARM NEON when available
Wire 'Data.ByteString.Base16.encode' and 'decode' to the NEON
implementation added in the previous commit, with the pure Haskell
scalar loop kept as a fallback.
Mirrors the dispatch pattern used in the 'Crypto.Hash.SHA256' module:

    encode bs
      | Arm.base16_arm_available = Arm.encode bs
      | otherwise                = encode_scalar bs

The previous 'encode' / 'decode' bodies are renamed in place to
'encode_scalar' / 'decode_scalar'. No behavioural change beyond
dispatch: on aarch64 the NEON path is taken, on every other arch
the C stubs return availability = 0 and the scalar bodies run.
Existing tasty suite (5000 QuickCheck cases x 3 properties +
uppercase HUnit) passes through the dispatched path, including
under 'cabal test -fllvm -fsanitize' which exercises the C kernel
under AddressSanitizer + UndefinedBehaviorSanitizer.
Performance on 1 KiB inputs, M4 MacBook Air, GHC 9.10.3 + LLVM 19,
-fllvm:
encode time: 296 ns -> 60.45 ns (~4.9x)
decode time: 271 ns -> 76.03 ns (~3.6x)
encode alloc: 3,872 B -> 3,840 B
decode alloc: 3,872 B -> 3,856 B
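Roughly 16.9 GB/s encode and 13.5 GB/s decode throughput, measured
against the 1 KiB input size (1024 B / 60.45 ns and 1024 B / 76.03 ns);
allocation per call is essentially unchanged from the scalar path
(32 B lower for encode, 16 B lower for decode).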
Diffstat:
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/lib/Data/ByteString/Base16.hs b/lib/Data/ByteString/Base16.hs
@@ -18,6 +18,7 @@ module Data.ByteString.Base16 (
import qualified Data.Bits as B
import Data.Bits ((.&.), (.|.))
import qualified Data.ByteString as BS
+import qualified Data.ByteString.Base16.Arm as Arm
import qualified Data.ByteString.Internal as BI
import Data.Word (Word8, Word16)
import Foreign.ForeignPtr (withForeignPtr)
@@ -89,10 +90,35 @@ dec_tab =
-- | Encode a base256 'ByteString' as base16.
--
+-- Uses ARM NEON extensions when available, otherwise a pure
+-- Haskell scalar loop.
+--
-- >>> encode "hello world"
-- "68656c6c6f20776f726c64"
encode :: BS.ByteString -> BS.ByteString
-encode (BI.PS sfp soff l) =
+encode bs
+ | Arm.base16_arm_available = Arm.encode bs
+ | otherwise = encode_scalar bs
+{-# INLINABLE encode #-}
+
+-- | Decode a base16 'ByteString' to base256.
+--
+-- Uses ARM NEON extensions when available, otherwise a pure
+-- Haskell scalar loop. Invalid inputs (including odd-length
+-- inputs) will produce 'Nothing'.
+--
+-- >>> decode "68656c6c6f20776f726c64"
+-- Just "hello world"
+-- >>> decode "068656c6c6f20776f726c64" -- odd-length
+-- Nothing
+decode :: BS.ByteString -> Maybe BS.ByteString
+decode bs
+ | Arm.base16_arm_available = Arm.decode bs
+ | otherwise = decode_scalar bs
+{-# INLINABLE decode #-}
+
+encode_scalar :: BS.ByteString -> BS.ByteString
+encode_scalar (BI.PS sfp soff l) =
case enc_tab of
BI.PS tfp toff _ ->
BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
@@ -116,17 +142,8 @@ encode (BI.PS sfp soff l) =
loop (i + 1)
loop 0
--- | Decode a base16 'ByteString' to base256.
---
--- Invalid inputs (including odd-length inputs) will produce
--- 'Nothing'.
---
--- >>> decode "68656c6c6f20776f726c64"
--- Just "hello world"
--- >>> decode "068656c6c6f20776f726c64" -- odd-length
--- Nothing
-decode :: BS.ByteString -> Maybe BS.ByteString
-decode (BI.PS sfp soff l)
+decode_scalar :: BS.ByteString -> Maybe BS.ByteString
+decode_scalar (BI.PS sfp soff l)
| B.testBit l 0 = Nothing
| otherwise = case dec_tab of
BI.PS tfp toff _ -> unsafeDupablePerformIO $ do