commit 39d2d2c8eb4292a8a176367b2d72f4dee9f78153
parent ca6d2c9ff1f93bcc469176c75a85dc3918bcd3c6
Author: Jared Tobin <jared@jtobin.io>
Date: Sat, 16 May 2026 12:25:10 -0230
Merge branch 'arm-neon'
Add aarch64 NEON acceleration for base16 encode and decode, following
the integration pattern used by ppad-sha256: a small C kernel under
'cbits/' with NEON intrinsics, a thin Haskell FFI module, runtime
detection via a CAF, and dispatch in the top-level module that falls
back to the existing pure Haskell scalar loop on non-aarch64.
Single package, single Hackage release.
Changes:
* New 'cbits/base16_arm.c' containing the NEON kernels and a
'base16_arm_available' probe. Encode processes 16 input bytes per
iteration ('vqtbl1q_u8' table lookup + 'vst2q_u8' interleaved
store). Decode processes 32 input chars per iteration ('vld2q_u8'
deinterleave + branchless byte-range validation + OR-accumulated
bad mask reduced once with 'vmaxvq_u8'). Body is gated by
'#if defined(__aarch64__)' with stubs in the '#else' branch.
* New 'lib/Data/ByteString/Base16/Arm.hs' wraps the C kernel via
'foreign import ccall unsafe'. 'base16_arm_available' is a
NOINLINE CAF computed once. Module is exposed (matching sha256's
Arm module exposure) but hidden from haddock.
* 'lib/Data/ByteString/Base16.hs' now dispatches to the NEON path
when 'base16_arm_available' is True; the previous scalar bodies
are kept as 'encode_scalar' / 'decode_scalar' and serve as the
fallback on every other architecture.
* 'ppad-base16.cabal' adds 'c-sources', 'arch(aarch64)' cc-options
('-march=armv8-a' — NEON is baseline of armv8 so no extension flag
is needed), and a new 'sanitize' flag that turns on
AddressSanitizer + UndefinedBehaviorSanitizer in both the C source
and the test-suite link, mirroring 'ppad-sha256'.
* README performance section updated with the new criterion figures
and a note that the library now avails of hardware acceleration
on aarch64.
Verification (M4 MacBook Air, GHC 9.10.3 + LLVM 19, -fllvm):
* 'cabal test -fllvm' — 4/4 tests pass through the ARM
path (5000 QC cases x 3
properties + uppercase HUnit).
* 'cabal test -fllvm -fsanitize' — same suite, instrumented with
ASan + UBSan over the C kernel.
No diagnostics.
* 'cabal bench -fllvm':
encode 1 KiB: 296 ns -> 60.45 ns (~4.9x)
decode 1 KiB: 271 ns -> 76.03 ns (~3.6x)
* 'cabal bench -fllvm base16-weigh' — allocation per call matches
the scalar path.
Roughly 16.9 GB/s encode and 13.5 GB/s decode output on Apple
Silicon; faster than the C-backed 'base16-bytestring'. Non-aarch64
builds are unchanged (C stubs return availability = 0, dispatch
falls through to the scalar path).
Diffstat:
5 files changed, 271 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# base16
+# ppad-base16
[](https://hackage.haskell.org/package/ppad-base16)

@@ -31,25 +31,27 @@ Haddocks (API documentation, etc.) are hosted at
## Performance
-The aim is best-in-class performance.
-
-Current benchmark figures on 1kb inputs on my M4 MacBook Air look like
-(use `cabal bench -fllvm` to run the benchmark suite):
+The aim is best-in-class performance. Current benchmark figures on 1kb
+inputs on an M4 Silicon MacBook Air, where we avail of hardware
+acceleration via ARM NEON intrinsics, look like (use `cabal bench` to
+run the benchmark suite):
```
benchmarking ppad-base16/encode
- time 295.9 ns (295.4 ns .. 296.4 ns)
- 1.000 R² (0.999 R² .. 1.000 R²)
- mean 296.8 ns (296.4 ns .. 297.2 ns)
- std dev 1.367 ns (1.181 ns .. 1.619 ns)
+ time 60.45 ns (60.29 ns .. 60.65 ns)
+ 1.000 R² (1.000 R² .. 1.000 R²)
+ mean 60.59 ns (60.42 ns .. 60.78 ns)
+ std dev 587.6 ps (465.2 ps .. 776.7 ps)
benchmarking ppad-base16/decode
- time 270.8 ns (270.6 ns .. 271.1 ns)
+ time 76.03 ns (75.93 ns .. 76.15 ns)
1.000 R² (1.000 R² .. 1.000 R²)
- mean 270.9 ns (270.7 ns .. 271.1 ns)
- std dev 627.8 ps (515.3 ps .. 766.1 ps)
+ mean 76.04 ns (75.94 ns .. 76.16 ns)
+ std dev 373.0 ps (285.3 ps .. 555.3 ps)
```
+You should compile with the 'llvm' flag for maximum performance.
+
## Security
This library aims at the maximum security achievable in a
diff --git a/cbits/base16_arm.c b/cbits/base16_arm.c
@@ -0,0 +1,143 @@
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__aarch64__)
+
+#include <arm_neon.h>
+
+/* lowercase ASCII hex character for each nibble value 0..15 */
+static const uint8_t hex_lut[16] = {
+ '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
+};
+
+/*
+ * Encode 'l' input bytes at 'src' into 2*l ASCII hex bytes at 'dst'.
+ *
+ * NEON kernel processes 16 input bytes per iteration:
+ * - vshrq_n_u8 / vandq_u8 split the high and low nibbles
+ * - vqtbl1q_u8 looks up each nibble in 'hex_lut' (the tbl instr)
+ * - vst2q_u8 stores the two char vectors interleaved, producing
+ * [h0 l0 h1 l1 ... h15 l15] which is exactly the hex output
+ *
+ * A scalar tail finishes the final (l mod 16) bytes.
+ */
+void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
+ uint8x16_t lut = vld1q_u8(hex_lut);
+ uint8x16_t mask_lo = vdupq_n_u8(0x0f);
+ size_t i = 0;
+
+ for (; i + 16 <= l; i += 16) {
+ uint8x16_t in = vld1q_u8(src + i);
+ uint8x16_t hi = vshrq_n_u8(in, 4);
+ uint8x16_t lo = vandq_u8(in, mask_lo);
+ uint8x16x2_t pair = { { vqtbl1q_u8(lut, hi),
+ vqtbl1q_u8(lut, lo) } };
+ vst2q_u8(dst + 2 * i, pair);
+ }
+
+ for (; i < l; i++) {
+ uint8_t b = src[i];
+ dst[2 * i] = hex_lut[b >> 4];
+ dst[2 * i + 1] = hex_lut[b & 0x0f];
+ }
+}
+
+/*
+ * Convert 16 ASCII hex chars to nibble values (0..15) in 'nib'.
+ * Each lane of 'bad' is set to 0xff if the corresponding input is
+ * not a valid hex digit ('0'..'9', 'a'..'f', 'A'..'F'), 0x00 if it
+ * is. Case-insensitive.
+ */
+static inline void ascii_to_nibble(uint8x16_t c,
+ uint8x16_t *nib,
+ uint8x16_t *bad) {
+ uint8x16_t is_digit = vandq_u8(vcgeq_u8(c, vdupq_n_u8('0')),
+ vcleq_u8(c, vdupq_n_u8('9')));
+ uint8x16_t is_lower = vandq_u8(vcgeq_u8(c, vdupq_n_u8('a')),
+ vcleq_u8(c, vdupq_n_u8('f')));
+ uint8x16_t is_upper = vandq_u8(vcgeq_u8(c, vdupq_n_u8('A')),
+ vcleq_u8(c, vdupq_n_u8('F')));
+
+ /* offset to subtract from c: '0' (0x30), 'a'-10 (0x57), 'A'-10
+ * (0x37); zero in lanes that aren't valid hex (the resulting
+ * nibble in those lanes is garbage but 'bad' flags them). */
+ uint8x16_t off = vorrq_u8(
+ vandq_u8(is_digit, vdupq_n_u8(0x30)),
+ vorrq_u8(
+ vandq_u8(is_lower, vdupq_n_u8(0x57)),
+ vandq_u8(is_upper, vdupq_n_u8(0x37))));
+
+ *nib = vsubq_u8(c, off);
+ *bad = vmvnq_u8(vorrq_u8(is_digit, vorrq_u8(is_lower, is_upper)));
+}
+
+static inline uint8_t scalar_nib(uint8_t c) {
+ if (c >= '0' && c <= '9') return (uint8_t)(c - '0');
+ if (c >= 'a' && c <= 'f') return (uint8_t)(c - 'a' + 10);
+ if (c >= 'A' && c <= 'F') return (uint8_t)(c - 'A' + 10);
+ return 0x80; /* invalid sentinel */
+}
+
+/*
+ * Decode 2*outlen hex chars at 'src' into 'outlen' bytes at 'dst'.
+ * Returns 1 on success, 0 if any invalid hex char was seen (in which
+ * case the contents of 'dst' are unspecified).
+ *
+ * NEON kernel processes 32 input chars per iteration:
+ * - vld2q_u8 deinterleaves into a vector of high-nibble chars and a
+ * vector of low-nibble chars
+ * - ascii_to_nibble validates and converts each char to its nibble
+ * - vshlq_n_u8 + vorrq_u8 packs nibble pairs into output bytes
+ * - the per-iteration 'bad' masks are OR-accumulated; we reduce
+ * once at the end with vmaxvq_u8
+ *
+ * A scalar tail finishes the final (outlen mod 16) output bytes.
+ */
+int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
+ uint8x16_t bad = vdupq_n_u8(0);
+ size_t i = 0;
+
+ for (; i + 16 <= outlen; i += 16) {
+ uint8x16x2_t pair = vld2q_u8(src + 2 * i);
+ uint8x16_t nib_hi, nib_lo, bad_hi, bad_lo;
+ ascii_to_nibble(pair.val[0], &nib_hi, &bad_hi);
+ ascii_to_nibble(pair.val[1], &nib_lo, &bad_lo);
+ uint8x16_t byte = vorrq_u8(vshlq_n_u8(nib_hi, 4), nib_lo);
+ vst1q_u8(dst + i, byte);
+ bad = vorrq_u8(bad, vorrq_u8(bad_hi, bad_lo));
+ }
+
+ uint8_t tail_bad = 0;
+ for (; i < outlen; i++) {
+ uint8_t n0 = scalar_nib(src[2 * i]);
+ uint8_t n1 = scalar_nib(src[2 * i + 1]);
+ tail_bad |= (n0 | n1) & 0x80;
+ dst[i] = (uint8_t)((n0 << 4) | (n1 & 0x0f));
+ }
+
+ return (vmaxvq_u8(bad) == 0) && (tail_bad == 0);
+}
+
+int base16_arm_available(void) {
+ return 1;
+}
+
+#else
+
+/* stubs for non-aarch64 builds; never reached because dispatch is
+ * gated on 'base16_arm_available' returning 0 */
+
+void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
+ (void)src; (void)dst; (void)l;
+}
+
+int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
+ (void)src; (void)dst; (void)outlen;
+ return 0;
+}
+
+int base16_arm_available(void) {
+ return 0;
+}
+
+#endif
diff --git a/lib/Data/ByteString/Base16.hs b/lib/Data/ByteString/Base16.hs
@@ -18,6 +18,7 @@ module Data.ByteString.Base16 (
import qualified Data.Bits as B
import Data.Bits ((.&.), (.|.))
import qualified Data.ByteString as BS
+import qualified Data.ByteString.Base16.Arm as Arm
import qualified Data.ByteString.Internal as BI
import Data.Word (Word8, Word16)
import Foreign.ForeignPtr (withForeignPtr)
@@ -89,10 +90,35 @@ dec_tab =
-- | Encode a base256 'ByteString' as base16.
--
+-- Uses ARM NEON extensions when available, otherwise a pure
+-- Haskell scalar loop.
+--
-- >>> encode "hello world"
-- "68656c6c6f20776f726c64"
encode :: BS.ByteString -> BS.ByteString
-encode (BI.PS sfp soff l) =
+encode bs
+ | Arm.base16_arm_available = Arm.encode bs
+ | otherwise = encode_scalar bs
+{-# INLINABLE encode #-}
+
+-- | Decode a base16 'ByteString' to base256.
+--
+-- Uses ARM NEON extensions when available, otherwise a pure
+-- Haskell scalar loop. Invalid inputs (including odd-length
+-- inputs) will produce 'Nothing'.
+--
+-- >>> decode "68656c6c6f20776f726c64"
+-- Just "hello world"
+-- >>> decode "068656c6c6f20776f726c64" -- odd-length
+-- Nothing
+decode :: BS.ByteString -> Maybe BS.ByteString
+decode bs
+ | Arm.base16_arm_available = Arm.decode bs
+ | otherwise = decode_scalar bs
+{-# INLINABLE decode #-}
+
+encode_scalar :: BS.ByteString -> BS.ByteString
+encode_scalar (BI.PS sfp soff l) =
case enc_tab of
BI.PS tfp toff _ ->
BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
@@ -116,17 +142,8 @@ encode (BI.PS sfp soff l) =
loop (i + 1)
loop 0
--- | Decode a base16 'ByteString' to base256.
---
--- Invalid inputs (including odd-length inputs) will produce
--- 'Nothing'.
---
--- >>> decode "68656c6c6f20776f726c64"
--- Just "hello world"
--- >>> decode "068656c6c6f20776f726c64" -- odd-length
--- Nothing
-decode :: BS.ByteString -> Maybe BS.ByteString
-decode (BI.PS sfp soff l)
+decode_scalar :: BS.ByteString -> Maybe BS.ByteString
+decode_scalar (BI.PS sfp soff l)
| B.testBit l 0 = Nothing
| otherwise = case dec_tab of
BI.PS tfp toff _ -> unsafeDupablePerformIO $ do
diff --git a/lib/Data/ByteString/Base16/Arm.hs b/lib/Data/ByteString/Base16/Arm.hs
@@ -0,0 +1,70 @@
+{-# OPTIONS_HADDOCK hide #-}
+{-# LANGUAGE BangPatterns #-}
+
+-- |
+-- Module: Data.ByteString.Base16.Arm
+-- Copyright: (c) 2025 Jared Tobin
+-- License: MIT
+-- Maintainer: Jared Tobin <jared@ppad.tech>
+--
+-- ARM NEON support for base16 encoding and decoding.
+
+module Data.ByteString.Base16.Arm (
+ base16_arm_available
+ , encode
+ , decode
+ ) where
+
+import qualified Data.Bits as B
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Internal as BI
+import Data.Word (Word8)
+import Foreign.C.Types (CInt(..), CSize(..))
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (Ptr, plusPtr)
+import System.IO.Unsafe (unsafeDupablePerformIO)
+
+-- ffi ------------------------------------------------------------------------
+
+foreign import ccall unsafe "base16_encode_arm"
+ c_base16_encode :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+
+foreign import ccall unsafe "base16_decode_arm"
+ c_base16_decode :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
+
+foreign import ccall unsafe "base16_arm_available"
+ c_base16_arm_available :: IO CInt
+
+-- utilities ------------------------------------------------------------------
+
+fi :: (Integral a, Num b) => a -> b
+fi = fromIntegral
+{-# INLINE fi #-}
+
+-- api ------------------------------------------------------------------------
+
+-- | Are ARM NEON extensions available?
+base16_arm_available :: Bool
+base16_arm_available =
+ unsafeDupablePerformIO c_base16_arm_available /= 0
+{-# NOINLINE base16_arm_available #-}
+
+-- | Encode a base256 'ByteString' as base16 using NEON.
+encode :: BS.ByteString -> BS.ByteString
+encode (BI.PS sfp soff l) =
+ BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
+ withForeignPtr sfp $ \sp0 ->
+ c_base16_encode (sp0 `plusPtr` soff) dst (fi l)
+
+-- | Decode a base16 'ByteString' to base256 using NEON. Returns
+-- 'Nothing' on odd-length or otherwise invalid input.
+decode :: BS.ByteString -> Maybe BS.ByteString
+decode (BI.PS sfp soff l)
+ | B.testBit l 0 = Nothing
+ | otherwise = unsafeDupablePerformIO $ do
+ let !n = l `B.shiftR` 1
+ fp <- BI.mallocByteString n
+ ok <- withForeignPtr fp $ \dst ->
+ withForeignPtr sfp $ \sp0 ->
+ c_base16_decode (sp0 `plusPtr` soff) dst (fi n)
+ pure $! if ok /= 0 then Just (BI.PS fp 0 n) else Nothing
diff --git a/ppad-base16.cabal b/ppad-base16.cabal
@@ -18,6 +18,11 @@ flag llvm
default: False
manual: True
+flag sanitize
+ description: Build with AddressSanitizer and UndefinedBehaviorSanitizer.
+ default: False
+ manual: True
+
source-repository head
type: git
location: git.ppad.tech/base16.git
@@ -31,9 +36,17 @@ library
ghc-options: -fllvm -O2
exposed-modules:
Data.ByteString.Base16
+ Data.ByteString.Base16.Arm
build-depends:
base >= 4.9 && < 5
, bytestring >= 0.9 && < 0.13
+ c-sources:
+ cbits/base16_arm.c
+ if arch(aarch64)
+ cc-options: -march=armv8-a
+ if flag(sanitize)
+ cc-options: -fsanitize=address,undefined -fno-omit-frame-pointer
+ ghc-options: -optl=-fsanitize=address,undefined
test-suite base16-tests
type: exitcode-stdio-1.0
@@ -43,6 +56,8 @@ test-suite base16-tests
ghc-options:
-rtsopts -Wall -O2
+ if flag(sanitize)
+ ghc-options: -optl=-fsanitize=address,undefined
build-depends:
base