commit 39477dfcbd306f38a42a1bd1981009d912ea12a9
parent c7be2ec6af1943172444c2d140e053ac92799479
Author: Jared Tobin <jared@jtobin.io>
Date: Sat, 16 May 2026 13:06:23 -0230
lib: add scalar C implementation (stage 1 of 2)
Add a C kernel for the Poly1305 MAC, following the integration
pattern used by ppad-sha256 / ppad-chacha / ppad-base16: cbits +
Haskell FFI module + sanitize flag + dispatch from the top-level
module.
This is stage 1 of 2. The C kernel here is a clean scalar
implementation using the 26-bit limb representation that NEON
Poly1305 implementations use, with 64-bit native arithmetic for
the limb multiplications. It is intended to:
* stand on its own as a known-good reference, both in correctness
(passes all RFC 8439 A.3 vectors) and as an audit baseline; and
* serve as the slow-path / fallback for stage 2 (a separate
commit) which replaces the inner block loop with NEON 4-way
parallel using precomputed r^2, r^3, r^4.
Algorithm summary:
* r is held as 5 26-bit limbs of 32 bits each; same for the
accumulator h. Limbs that 'spill' beyond position 4 during
multiplication fold back as 5x their value into low limbs,
thanks to 2^130 = 5 (mod 2^130 - 5).
* 'mul_mod_p' computes the 25 partial products in uint64_t, sums
them per output position, then carry-propagates once.
* Final reduction subtracts p in constant time via a bitmask.
* Output is (h + s) mod 2^128, written as 4 little-endian uint32.
Cabal:
* 'c-sources: cbits/poly1305_arm.c'.
* 'if arch(aarch64) cc-options: -march=armv8-a'.
* New 'sanitize' flag wiring '-fsanitize=address,undefined
-fno-omit-frame-pointer' into both the C source and the
test-suite link.
* 'Crypto.MAC.Poly1305.Arm' added to 'exposed-modules'.
No call sites wired yet — the existing 12 tasty cases still
execute through the pure Haskell scalar path after this commit
and pass unchanged.
Diffstat:
3 files changed, 302 insertions(+), 0 deletions(-)
diff --git a/cbits/poly1305_arm.c b/cbits/poly1305_arm.c
@@ -0,0 +1,229 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__aarch64__)
+
+/*
+ * Poly1305 (RFC 8439). Stage 1 kernel: clean scalar C using the
+ * 26-bit limb representation that NEON Poly1305 implementations use,
+ * with 64-bit native arithmetic for the limb multiplications. Stage
+ * 2 (a separate commit) replaces the inner block loop with a NEON
+ * 4-way parallel kernel.
+ *
+ * Layout: every 130-bit value is held as 5 uint32_t limbs of 26 bits
+ * each (top limb gets the spare 2 bits). The prime is p = 2^130 - 5,
+ * so the reduction rule for any 'spilled' limb beyond position 4 is
+ * 'add 5 * that limb back into the low end'.
+ */
+
+#define MASK26 0x3ffffffu
+
+/*
+ * Multiply two 130-bit values mod (2^130 - 5). Inputs in 5x 26-bit
+ * limb form, output in 5x 26-bit limb form (each output limb < 2^26
+ * except possibly limb 1, which may carry a small excess absorbed by
+ * the next 'mul_mod_p' or by 'normalize').
+ */
+static void mul_mod_p(const uint32_t a[5], const uint32_t b[5],
+ uint32_t out[5]) {
+ uint64_t d0, d1, d2, d3, d4, c;
+
+ /* 25 partial products; limbs that 'spill' beyond position 4 fold
+ * back as (5 * limb) thanks to 2^130 = 5 mod p. */
+ d0 = (uint64_t)a[0]*b[0]
+ + 5 * ((uint64_t)a[4]*b[1] + (uint64_t)a[3]*b[2]
+ + (uint64_t)a[2]*b[3] + (uint64_t)a[1]*b[4]);
+ d1 = (uint64_t)a[0]*b[1] + (uint64_t)a[1]*b[0]
+ + 5 * ((uint64_t)a[4]*b[2] + (uint64_t)a[3]*b[3]
+ + (uint64_t)a[2]*b[4]);
+ d2 = (uint64_t)a[0]*b[2] + (uint64_t)a[1]*b[1]
+ + (uint64_t)a[2]*b[0]
+ + 5 * ((uint64_t)a[4]*b[3] + (uint64_t)a[3]*b[4]);
+ d3 = (uint64_t)a[0]*b[3] + (uint64_t)a[1]*b[2]
+ + (uint64_t)a[2]*b[1] + (uint64_t)a[3]*b[0]
+ + 5 * ((uint64_t)a[4]*b[4]);
+ d4 = (uint64_t)a[0]*b[4] + (uint64_t)a[1]*b[3]
+ + (uint64_t)a[2]*b[2] + (uint64_t)a[3]*b[1]
+ + (uint64_t)a[4]*b[0];
+
+ /* single-pass carry propagation */
+ c = d0 >> 26; d0 &= MASK26; d1 += c;
+ c = d1 >> 26; d1 &= MASK26; d2 += c;
+ c = d2 >> 26; d2 &= MASK26; d3 += c;
+ c = d3 >> 26; d3 &= MASK26; d4 += c;
+ c = d4 >> 26; d4 &= MASK26; d0 += c * 5;
+ c = d0 >> 26; d0 &= MASK26; d1 += c;
+
+ out[0] = (uint32_t)d0;
+ out[1] = (uint32_t)d1;
+ out[2] = (uint32_t)d2;
+ out[3] = (uint32_t)d3;
+ out[4] = (uint32_t)d4;
+}
+
+/*
+ * Parse 16 little-endian bytes plus a 'hibit' value (0 or 1) at bit
+ * 128 into 5 26-bit limbs. 'hibit' = 1 for a full block (the implicit
+ * "+ 2^128" of Poly1305). Partial blocks set the marker byte inside
+ * the 16-byte buffer and pass 'hibit' = 0.
+ */
+static inline void blk2limbs(const uint8_t m[16], uint32_t hibit,
+ uint32_t l[5]) {
+ uint32_t t0, t1, t2, t3;
+ memcpy(&t0, m, 4);
+ memcpy(&t1, m + 4, 4);
+ memcpy(&t2, m + 8, 4);
+ memcpy(&t3, m + 12, 4);
+ l[0] = t0 & MASK26;
+ l[1] = ((t0 >> 26) | (t1 << 6)) & MASK26;
+ l[2] = ((t1 >> 20) | (t2 << 12)) & MASK26;
+ l[3] = ((t2 >> 14) | (t3 << 18)) & MASK26;
+ l[4] = (t3 >> 8) | (hibit << 24);
+}
+
+/*
+ * Compute a 16-byte Poly1305 MAC over 'msg' (length 'msglen') using
+ * the 32-byte 'key'. Writes the tag to 'mac_out'. 'key' is the
+ * unclamped raw 32 bytes — clamping happens inside.
+ */
+void poly1305_mac_arm(const uint8_t key[32], const uint8_t *msg,
+ size_t msglen, uint8_t mac_out[16]) {
+ /* clamp r (low 16 bytes of key, with specific bits cleared) */
+ uint32_t t0, t1, t2, t3;
+ memcpy(&t0, key, 4);
+ memcpy(&t1, key + 4, 4);
+ memcpy(&t2, key + 8, 4);
+ memcpy(&t3, key + 12, 4);
+ t0 &= 0x0fffffffu;
+ t1 &= 0x0ffffffcu;
+ t2 &= 0x0ffffffcu;
+ t3 &= 0x0ffffffcu;
+
+ /* r as 5 26-bit limbs */
+ uint32_t r[5];
+ r[0] = t0 & MASK26;
+ r[1] = ((t0 >> 26) | (t1 << 6)) & MASK26;
+ r[2] = ((t1 >> 20) | (t2 << 12)) & MASK26;
+ r[3] = ((t2 >> 14) | (t3 << 18)) & MASK26;
+ r[4] = (t3 >> 8);
+
+ /* accumulator h starts at 0 */
+ uint32_t h[5] = { 0, 0, 0, 0, 0 };
+
+ /* process full 16-byte blocks */
+ size_t pos = 0;
+ while (pos + 16 <= msglen) {
+ uint32_t blk[5];
+ blk2limbs(msg + pos, 1, blk);
+
+ /* h += blk (limb-wise; later carry-propagated inside mul) */
+ uint32_t hl[5];
+ hl[0] = h[0] + blk[0];
+ hl[1] = h[1] + blk[1];
+ hl[2] = h[2] + blk[2];
+ hl[3] = h[3] + blk[3];
+ hl[4] = h[4] + blk[4];
+
+ /* h := (h + blk) * r mod p */
+ mul_mod_p(hl, r, h);
+
+ pos += 16;
+ }
+
+ /* final partial block (1..15 trailing bytes) */
+ if (pos < msglen) {
+ size_t rem = msglen - pos;
+ uint8_t pad[16] = { 0 };
+ memcpy(pad, msg + pos, rem);
+ pad[rem] = 1; /* RFC 8439: append "1" byte, then zero-pad */
+
+ uint32_t blk[5];
+ blk2limbs(pad, 0, blk); /* no hibit; we set the marker inside */
+
+ uint32_t hl[5];
+ hl[0] = h[0] + blk[0];
+ hl[1] = h[1] + blk[1];
+ hl[2] = h[2] + blk[2];
+ hl[3] = h[3] + blk[3];
+ hl[4] = h[4] + blk[4];
+ mul_mod_p(hl, r, h);
+ }
+
+ /* mul_mod_p leaves a small excess in h[1]; absorb it before the
+ * final mod-p reduction. */
+ {
+ uint32_t c;
+ c = h[1] >> 26; h[1] &= MASK26; h[2] += c;
+ c = h[2] >> 26; h[2] &= MASK26; h[3] += c;
+ c = h[3] >> 26; h[3] &= MASK26; h[4] += c;
+ c = h[4] >> 26; h[4] &= MASK26; h[0] += c * 5;
+ c = h[0] >> 26; h[0] &= MASK26; h[1] += c;
+ }
+
+ /* full reduction to [0, p). Compute g = h + 5; if g overflows
+ * 2^130 (bit 130 set), then h >= p and we replace h with g (which
+ * equals h - p mod 2^130). Otherwise leave h alone. Done in
+ * constant time via a bitmask. */
+ uint32_t g[5];
+ uint32_t c = 5;
+ g[0] = h[0] + c; c = g[0] >> 26; g[0] &= MASK26;
+ g[1] = h[1] + c; c = g[1] >> 26; g[1] &= MASK26;
+ g[2] = h[2] + c; c = g[2] >> 26; g[2] &= MASK26;
+ g[3] = h[3] + c; c = g[3] >> 26; g[3] &= MASK26;
+ g[4] = h[4] + c;
+ uint32_t carry = g[4] >> 26;
+ g[4] &= MASK26;
+ uint32_t mask = (uint32_t)0 - carry; /* all-1s iff carry */
+ h[0] = (h[0] & ~mask) | (g[0] & mask);
+ h[1] = (h[1] & ~mask) | (g[1] & mask);
+ h[2] = (h[2] & ~mask) | (g[2] & mask);
+ h[3] = (h[3] & ~mask) | (g[3] & mask);
+ h[4] = (h[4] & ~mask) | (g[4] & mask);
+
+ /* repack 5x 26-bit limbs into 4x 32-bit limbs (low 128 bits) */
+ uint32_t h0 = h[0] | (h[1] << 26);
+ uint32_t h1 = (h[1] >> 6) | (h[2] << 20);
+ uint32_t h2 = (h[2] >> 12) | (h[3] << 14);
+ uint32_t h3 = (h[3] >> 18) | (h[4] << 8);
+
+ /* add s (high 16 bytes of key) as 4x little-endian Word32, mod
+ * 2^128 (drop the final 32-bit carry). */
+ uint32_t s0, s1, s2, s3;
+ memcpy(&s0, key + 16, 4);
+ memcpy(&s1, key + 20, 4);
+ memcpy(&s2, key + 24, 4);
+ memcpy(&s3, key + 28, 4);
+
+ uint64_t a0 = (uint64_t)h0 + s0;
+ uint64_t a1 = (uint64_t)h1 + s1 + (a0 >> 32);
+ uint64_t a2 = (uint64_t)h2 + s2 + (a1 >> 32);
+ uint64_t a3 = (uint64_t)h3 + s3 + (a2 >> 32);
+
+ uint32_t o0 = (uint32_t)a0;
+ uint32_t o1 = (uint32_t)a1;
+ uint32_t o2 = (uint32_t)a2;
+ uint32_t o3 = (uint32_t)a3;
+
+ memcpy(mac_out + 0, &o0, 4);
+ memcpy(mac_out + 4, &o1, 4);
+ memcpy(mac_out + 8, &o2, 4);
+ memcpy(mac_out + 12, &o3, 4);
+}
+
+int poly1305_arm_available(void) {
+ return 1;
+}
+
+#else
+
+void poly1305_mac_arm(const uint8_t *key, const uint8_t *msg,
+ size_t msglen, uint8_t *mac_out) {
+ (void)key; (void)msg; (void)msglen; (void)mac_out;
+}
+
+int poly1305_arm_available(void) {
+ return 0;
+}
+
+#endif
diff --git a/lib/Crypto/MAC/Poly1305/Arm.hs b/lib/Crypto/MAC/Poly1305/Arm.hs
@@ -0,0 +1,58 @@
+{-# OPTIONS_HADDOCK hide #-}
+{-# LANGUAGE BangPatterns #-}
+
+-- |
+-- Module: Crypto.MAC.Poly1305.Arm
+-- Copyright: (c) 2025 Jared Tobin
+-- License: MIT
+-- Maintainer: Jared Tobin <jared@ppad.tech>
+--
+-- ARM acceleration for the Poly1305 MAC.
+
+module Crypto.MAC.Poly1305.Arm (
+ poly1305_arm_available
+ , mac
+ ) where
+
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Internal as BI
+import Data.Word (Word8)
+import Foreign.C.Types (CInt(..), CSize(..))
+import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.Ptr (Ptr, plusPtr)
+import System.IO.Unsafe (unsafeDupablePerformIO)
+
+-- ffi ------------------------------------------------------------------------
+
+foreign import ccall unsafe "poly1305_mac_arm"
+ c_poly1305_mac
+ :: Ptr Word8 -> Ptr Word8 -> CSize -> Ptr Word8 -> IO ()
+
+foreign import ccall unsafe "poly1305_arm_available"
+ c_poly1305_arm_available :: IO CInt
+
+-- utilities ------------------------------------------------------------------
+
+fi :: (Integral a, Num b) => a -> b
+fi = fromIntegral
+{-# INLINE fi #-}
+
+-- api ------------------------------------------------------------------------
+
+-- | Are ARM extensions available?
+poly1305_arm_available :: Bool
+poly1305_arm_available =
+ unsafeDupablePerformIO c_poly1305_arm_available /= 0
+{-# NOINLINE poly1305_arm_available #-}
+
+-- | Compute a Poly1305 MAC over the message using the given (already-
+-- validated 32-byte) key.
+mac :: BS.ByteString -> BS.ByteString -> BS.ByteString
+mac (BI.PS kfp koff _) (BI.PS mfp moff mlen) =
+ BI.unsafeCreate 16 $ \dst ->
+ withForeignPtr kfp $ \kp0 ->
+ withForeignPtr mfp $ \mp0 ->
+ c_poly1305_mac (kp0 `plusPtr` koff)
+ (mp0 `plusPtr` moff)
+ (fi mlen)
+ dst
diff --git a/ppad-poly1305.cabal b/ppad-poly1305.cabal
@@ -19,6 +19,11 @@ flag llvm
default: False
manual: True
+flag sanitize
+ description: Build with AddressSanitizer and UndefinedBehaviorSanitizer.
+ default: False
+ manual: True
+
source-repository head
type: git
location: git.ppad.tech/poly1305.git
@@ -32,10 +37,18 @@ library
ghc-options: -fllvm -O2
exposed-modules:
Crypto.MAC.Poly1305
+ Crypto.MAC.Poly1305.Arm
build-depends:
base >= 4.9 && < 5
, bytestring >= 0.9 && < 0.13
, ppad-fixed >= 0.1.3 && < 0.2
+ c-sources:
+ cbits/poly1305_arm.c
+ if arch(aarch64)
+ cc-options: -march=armv8-a
+ if flag(sanitize)
+ cc-options: -fsanitize=address,undefined -fno-omit-frame-pointer
+ ghc-options: -optl=-fsanitize=address,undefined
test-suite poly1305-tests
type: exitcode-stdio-1.0
@@ -45,6 +58,8 @@ test-suite poly1305-tests
ghc-options:
-rtsopts -Wall -O2
+ if flag(sanitize)
+ ghc-options: -optl=-fsanitize=address,undefined
build-depends:
base