lib: add ARM NEON implementation - base16 - Pure Haskell base16 encoding/decoding (docs.ppad.tech/base16).

commit 17f5dd241bc4ad200676b2b30b72cd6718cedb3b
parent ca6d2c9ff1f93bcc469176c75a85dc3918bcd3c6
Author: Jared Tobin <jared@jtobin.io>
Date:   Sat, 16 May 2026 12:18:58 -0230

lib: add ARM NEON implementation

Add an aarch64 NEON kernel for base16 encode and decode, in the
shape used by ppad-sha256: a small C file with intrinsics gated by
'#if defined(__aarch64__)' + stubs in the '#else' branch, exposed
to Haskell via 'foreign import ccall unsafe' in a new module
'Data.ByteString.Base16.Arm'.

The C kernel:

* Encode processes 16 input bytes per NEON iteration.  'vshrq_n_u8'
  / 'vandq_u8' split each byte's high and low nibbles into two
  vectors; 'vqtbl1q_u8' (the 'tbl' instruction) maps both vectors
  through a 16-byte ASCII hex LUT in one cycle each; 'vst2q_u8'
  stores the two char vectors interleaved as [h0 l0 h1 l1 ...].
  ~6 NEON instructions for 32 output bytes.  A scalar tail finishes
  the final (l mod 16) bytes.

* Decode processes 32 input chars per NEON iteration.  'vld2q_u8'
  deinterleaves into a high-nibble-char vector and a low-nibble-char
  vector; each is converted to nibbles via byte-range compares
  ('vcgeq_u8' / 'vcleq_u8') + masked-offset subtraction; the per-
  chunk 'bad' masks are OR-accumulated and reduced once at the end
  with 'vmaxvq_u8'.  Case-insensitive.  A scalar tail handles the
  final (outlen mod 16) bytes.

The Haskell wrapper:

* 'base16_arm_available :: Bool' NOINLINE CAF queries the C-side
  availability probe once; returns 'True' on aarch64, 'False' on
  every other arch (where the C stubs are linked in).
* 'encode' and 'decode' wrappers allocate the output buffer with
  'BI.unsafeCreate' / 'BI.mallocByteString' and pass raw pointers
  to the C kernel.  Same allocation pattern as the scalar path.
* 'OPTIONS_HADDOCK hide' keeps the module out of public docs.

Cabal:

* 'c-sources: cbits/base16_arm.c' compiles the kernel into the
  library on every platform; the '#if'-gated body means the
  contributed code is empty on non-aarch64.
* 'if arch(aarch64) cc-options: -march=armv8-a' pins the target to
  baseline armv8 (NEON is mandatory there; no extension flag
  required).
* New 'sanitize' flag adds '-fsanitize=address,undefined
  -fno-omit-frame-pointer' to both the C source and the test-suite
  link, mirroring 'ppad-sha256'.  Built with 'cabal test -fllvm
  -fsanitize'.
* 'Data.ByteString.Base16.Arm' added to 'exposed-modules' so
  consumers can call the NEON path directly if they want to bypass
  dispatch.

No call sites in 'Data.ByteString.Base16' wired yet — the existing
tasty + criterion suites still go through the scalar path after
this commit, and pass unchanged.

Diffstat:
A cbits/base16_arm.c  | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib/Data/ByteString/Base16/Arm.hs  | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M ppad-base16.cabal  | 15 +++++++++++++++

3 files changed, 228 insertions(+), 0 deletions(-)
diff --git a/cbits/base16_arm.c b/cbits/base16_arm.c
@@ -0,0 +1,143 @@
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__aarch64__)
+
+#include <arm_neon.h>
+
+/* lowercase ASCII hex character for each nibble value 0..15 */
+static const uint8_t hex_lut[16] = {
+    '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
+};
+
+/*
+ * Encode 'l' input bytes at 'src' into 2*l ASCII hex bytes at 'dst'.
+ *
+ * NEON kernel processes 16 input bytes per iteration:
+ *   - vshrq_n_u8 / vandq_u8 split the high and low nibbles
+ *   - vqtbl1q_u8 looks up each nibble in 'hex_lut' (the tbl instr)
+ *   - vst2q_u8 stores the two char vectors interleaved, producing
+ *     [h0 l0 h1 l1 ... h15 l15] which is exactly the hex output
+ *
+ * A scalar tail finishes the final (l mod 16) bytes.
+ */
+void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
+    uint8x16_t lut     = vld1q_u8(hex_lut);
+    uint8x16_t mask_lo = vdupq_n_u8(0x0f);
+    size_t i = 0;
+
+    for (; i + 16 <= l; i += 16) {
+        uint8x16_t in     = vld1q_u8(src + i);
+        uint8x16_t hi     = vshrq_n_u8(in, 4);
+        uint8x16_t lo     = vandq_u8(in, mask_lo);
+        uint8x16x2_t pair = { { vqtbl1q_u8(lut, hi),
+                                vqtbl1q_u8(lut, lo) } };
+        vst2q_u8(dst + 2 * i, pair);
+    }
+
+    for (; i < l; i++) {
+        uint8_t b = src[i];
+        dst[2 * i]     = hex_lut[b >> 4];
+        dst[2 * i + 1] = hex_lut[b & 0x0f];
+    }
+}
+
+/*
+ * Convert 16 ASCII hex chars to nibble values (0..15) in 'nib'.
+ * Each lane of 'bad' is set to 0xff if the corresponding input is
+ * not a valid hex digit ('0'..'9', 'a'..'f', 'A'..'F'), 0x00 if it
+ * is.  Case-insensitive.
+ */
+static inline void ascii_to_nibble(uint8x16_t c,
+                                   uint8x16_t *nib,
+                                   uint8x16_t *bad) {
+    uint8x16_t is_digit = vandq_u8(vcgeq_u8(c, vdupq_n_u8('0')),
+                                    vcleq_u8(c, vdupq_n_u8('9')));
+    uint8x16_t is_lower = vandq_u8(vcgeq_u8(c, vdupq_n_u8('a')),
+                                    vcleq_u8(c, vdupq_n_u8('f')));
+    uint8x16_t is_upper = vandq_u8(vcgeq_u8(c, vdupq_n_u8('A')),
+                                    vcleq_u8(c, vdupq_n_u8('F')));
+
+    /* offset to subtract from c: '0' (0x30), 'a'-10 (0x57), 'A'-10
+     * (0x37); zero in lanes that aren't valid hex (the resulting
+     * nibble in those lanes is garbage but 'bad' flags them).      */
+    uint8x16_t off = vorrq_u8(
+        vandq_u8(is_digit, vdupq_n_u8(0x30)),
+        vorrq_u8(
+            vandq_u8(is_lower, vdupq_n_u8(0x57)),
+            vandq_u8(is_upper, vdupq_n_u8(0x37))));
+
+    *nib = vsubq_u8(c, off);
+    *bad = vmvnq_u8(vorrq_u8(is_digit, vorrq_u8(is_lower, is_upper)));
+}
+
+static inline uint8_t scalar_nib(uint8_t c) {
+    if (c >= '0' && c <= '9') return (uint8_t)(c - '0');
+    if (c >= 'a' && c <= 'f') return (uint8_t)(c - 'a' + 10);
+    if (c >= 'A' && c <= 'F') return (uint8_t)(c - 'A' + 10);
+    return 0x80;  /* invalid sentinel */
+}
+
+/*
+ * Decode 2*outlen hex chars at 'src' into 'outlen' bytes at 'dst'.
+ * Returns 1 on success, 0 if any invalid hex char was seen (in which
+ * case the contents of 'dst' are unspecified).
+ *
+ * NEON kernel processes 32 input chars per iteration:
+ *   - vld2q_u8 deinterleaves into a vector of high-nibble chars and a
+ *     vector of low-nibble chars
+ *   - ascii_to_nibble validates and converts each char to its nibble
+ *   - vshlq_n_u8 + vorrq_u8 packs nibble pairs into output bytes
+ *   - the per-iteration 'bad' masks are OR-accumulated; we reduce
+ *     once at the end with vmaxvq_u8
+ *
+ * A scalar tail finishes the final (outlen mod 16) output bytes.
+ */
+int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
+    uint8x16_t bad = vdupq_n_u8(0);
+    size_t i = 0;
+
+    for (; i + 16 <= outlen; i += 16) {
+        uint8x16x2_t pair = vld2q_u8(src + 2 * i);
+        uint8x16_t nib_hi, nib_lo, bad_hi, bad_lo;
+        ascii_to_nibble(pair.val[0], &nib_hi, &bad_hi);
+        ascii_to_nibble(pair.val[1], &nib_lo, &bad_lo);
+        uint8x16_t byte = vorrq_u8(vshlq_n_u8(nib_hi, 4), nib_lo);
+        vst1q_u8(dst + i, byte);
+        bad = vorrq_u8(bad, vorrq_u8(bad_hi, bad_lo));
+    }
+
+    uint8_t tail_bad = 0;
+    for (; i < outlen; i++) {
+        uint8_t n0 = scalar_nib(src[2 * i]);
+        uint8_t n1 = scalar_nib(src[2 * i + 1]);
+        tail_bad |= (n0 | n1) & 0x80;
+        dst[i] = (uint8_t)((n0 << 4) | (n1 & 0x0f));
+    }
+
+    return (vmaxvq_u8(bad) == 0) && (tail_bad == 0);
+}
+
+int base16_arm_available(void) {
+    return 1;
+}
+
+#else
+
+/* stubs for non-aarch64 builds; never reached because dispatch is
+ * gated on 'base16_arm_available' returning 0                     */
+
+void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
+    (void)src; (void)dst; (void)l;
+}
+
+int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
+    (void)src; (void)dst; (void)outlen;
+    return 0;
+}
+
+int base16_arm_available(void) {
+    return 0;
+}
+
+#endif
diff --git a/lib/Data/ByteString/Base16/Arm.hs b/lib/Data/ByteString/Base16/Arm.hs
@@ -0,0 +1,70 @@
+{-# OPTIONS_HADDOCK hide #-}
+{-# LANGUAGE BangPatterns #-}
+
+-- |
+-- Module: Data.ByteString.Base16.Arm
+-- Copyright: (c) 2025 Jared Tobin
+-- License: MIT
+-- Maintainer: Jared Tobin <jared@ppad.tech>
+--
+-- ARM NEON support for base16 encoding and decoding.
+
+module Data.ByteString.Base16.Arm (
+    base16_arm_available
+  , encode
+  , decode
+  ) where
+
+import qualified Data.Bits as B
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Internal as BI
+import Data.Word (Word8)
+import Foreign.C.Types (CInt(..), CSize(..))
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (Ptr, plusPtr)
+import System.IO.Unsafe (unsafeDupablePerformIO)
+
+-- ffi ------------------------------------------------------------------------
+
+foreign import ccall unsafe "base16_encode_arm"
+  c_base16_encode :: Ptr Word8 -> Ptr Word8 -> CSize -> IO ()
+
+foreign import ccall unsafe "base16_decode_arm"
+  c_base16_decode :: Ptr Word8 -> Ptr Word8 -> CSize -> IO CInt
+
+foreign import ccall unsafe "base16_arm_available"
+  c_base16_arm_available :: IO CInt
+
+-- utilities ------------------------------------------------------------------
+
+fi :: (Integral a, Num b) => a -> b
+fi = fromIntegral
+{-# INLINE fi #-}
+
+-- api ------------------------------------------------------------------------
+
+-- | Are ARM NEON extensions available?
+base16_arm_available :: Bool
+base16_arm_available =
+  unsafeDupablePerformIO c_base16_arm_available /= 0
+{-# NOINLINE base16_arm_available #-}
+
+-- | Encode a base256 'ByteString' as base16 using NEON.
+encode :: BS.ByteString -> BS.ByteString
+encode (BI.PS sfp soff l) =
+  BI.unsafeCreate (l `B.shiftL` 1) $ \dst ->
+    withForeignPtr sfp $ \sp0 ->
+      c_base16_encode (sp0 `plusPtr` soff) dst (fi l)
+
+-- | Decode a base16 'ByteString' to base256 using NEON.  Returns
+--   'Nothing' on odd-length or otherwise invalid input.
+decode :: BS.ByteString -> Maybe BS.ByteString
+decode (BI.PS sfp soff l)
+  | B.testBit l 0 = Nothing
+  | otherwise = unsafeDupablePerformIO $ do
+      let !n = l `B.shiftR` 1
+      fp <- BI.mallocByteString n
+      ok <- withForeignPtr fp  $ \dst ->
+            withForeignPtr sfp $ \sp0 ->
+              c_base16_decode (sp0 `plusPtr` soff) dst (fi n)
+      pure $! if ok /= 0 then Just (BI.PS fp 0 n) else Nothing
diff --git a/ppad-base16.cabal b/ppad-base16.cabal
@@ -18,6 +18,11 @@ flag llvm
   default:     False
   manual:      True
 
+flag sanitize
+  description: Build with AddressSanitizer and UndefinedBehaviorSanitizer.
+  default:     False
+  manual:      True
+
 source-repository head
   type:     git
   location: git.ppad.tech/base16.git
@@ -31,9 +36,17 @@ library
     ghc-options: -fllvm -O2
   exposed-modules:
       Data.ByteString.Base16
+      Data.ByteString.Base16.Arm
   build-depends:
       base >= 4.9 && < 5
     , bytestring >= 0.9 && < 0.13
+  c-sources:
+      cbits/base16_arm.c
+  if arch(aarch64)
+    cc-options: -march=armv8-a
+  if flag(sanitize)
+    cc-options: -fsanitize=address,undefined -fno-omit-frame-pointer
+    ghc-options: -optl=-fsanitize=address,undefined
 
 test-suite base16-tests
   type:                exitcode-stdio-1.0
@@ -43,6 +56,8 @@ test-suite base16-tests
 
   ghc-options:
     -rtsopts -Wall -O2
+  if flag(sanitize)
+    ghc-options: -optl=-fsanitize=address,undefined
 
   build-depends:
       base

	base16 Pure Haskell base16 encoding/decoding (docs.ppad.tech/base16).
	git clone git://git.ppad.tech/base16.git
	Log \| Files \| Refs \| README \| LICENSE

A	cbits/base16_arm.c	\|	143	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib/Data/ByteString/Base16/Arm.hs	\|	70	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	ppad-base16.cabal	\|	15	+++++++++++++++