poly1305

The Poly1305 message authentication code (docs.ppad.tech/poly1305).
git clone git://git.ppad.tech/poly1305.git
Log | Files | Refs | README | LICENSE

commit 39477dfcbd306f38a42a1bd1981009d912ea12a9
parent c7be2ec6af1943172444c2d140e053ac92799479
Author: Jared Tobin <jared@jtobin.io>
Date:   Sat, 16 May 2026 13:06:23 -0230

lib: add scalar C implementation (stage 1 of 2)

Add a C kernel for the Poly1305 MAC, following the integration
pattern used by ppad-sha256 / ppad-chacha / ppad-base16: cbits +
Haskell FFI module + sanitize flag + dispatch from the top-level
module.

This is stage 1 of 2.  The C kernel here is a clean scalar
implementation using the 26-bit limb representation that NEON
Poly1305 implementations use, with 64-bit native arithmetic for
the limb multiplications.  It is intended to:

* stand on its own as a known-good reference, both in correctness
  (passes all RFC 8439 A.3 vectors) and as an audit baseline; and
* serve as the slow-path / fallback for stage 2 (a separate
  commit) which replaces the inner block loop with NEON 4-way
  parallel using precomputed r^2, r^3, r^4.

Algorithm summary:

* r is held as 5 26-bit limbs of 32 bits each; same for the
  accumulator h.  Limbs that 'spill' beyond position 4 during
  multiplication fold back as 5x their value into low limbs,
  thanks to 2^130 = 5 (mod 2^130 - 5).
* 'mul_mod_p' computes the 25 partial products in uint64_t, sums
  them per output position, then carry-propagates once.
* Final reduction subtracts p in constant time via a bitmask.
* Output is (h + s) mod 2^128, written as 4 little-endian uint32.

Cabal:

* 'c-sources: cbits/poly1305_arm.c'.
* 'if arch(aarch64) cc-options: -march=armv8-a'.
* New 'sanitize' flag wiring '-fsanitize=address,undefined
  -fno-omit-frame-pointer' into both the C source and the
  test-suite link.
* 'Crypto.MAC.Poly1305.Arm' added to 'exposed-modules'.

No call sites wired yet — the existing 12 tasty cases still
execute through the pure Haskell scalar path after this commit
and pass unchanged.

Diffstat:
Acbits/poly1305_arm.c | 229+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/Crypto/MAC/Poly1305/Arm.hs | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mppad-poly1305.cabal | 15+++++++++++++++
3 files changed, 302 insertions(+), 0 deletions(-)

diff --git a/cbits/poly1305_arm.c b/cbits/poly1305_arm.c @@ -0,0 +1,229 @@ +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#if defined(__aarch64__) + +/* + * Poly1305 (RFC 8439). Stage 1 kernel: clean scalar C using the + * 26-bit limb representation that NEON Poly1305 implementations use, + * with 64-bit native arithmetic for the limb multiplications. Stage + * 2 (a separate commit) replaces the inner block loop with a NEON + * 4-way parallel kernel. + * + * Layout: every 130-bit value is held as 5 uint32_t limbs of 26 bits + * each (top limb gets the spare 2 bits). The prime is p = 2^130 - 5, + * so the reduction rule for any 'spilled' limb beyond position 4 is + * 'add 5 * that limb back into the low end'. + */ + +#define MASK26 0x3ffffffu + +/* + * Multiply two 130-bit values mod (2^130 - 5). Inputs in 5x 26-bit + * limb form, output in 5x 26-bit limb form (each output limb < 2^26 + * except possibly limb 1, which may carry a small excess absorbed by + * the next 'mul_mod_p' or by 'normalize'). + */ +static void mul_mod_p(const uint32_t a[5], const uint32_t b[5], + uint32_t out[5]) { + uint64_t d0, d1, d2, d3, d4, c; + + /* 25 partial products; limbs that 'spill' beyond position 4 fold + * back as (5 * limb) thanks to 2^130 = 5 mod p. */ + d0 = (uint64_t)a[0]*b[0] + + 5 * ((uint64_t)a[4]*b[1] + (uint64_t)a[3]*b[2] + + (uint64_t)a[2]*b[3] + (uint64_t)a[1]*b[4]); + d1 = (uint64_t)a[0]*b[1] + (uint64_t)a[1]*b[0] + + 5 * ((uint64_t)a[4]*b[2] + (uint64_t)a[3]*b[3] + + (uint64_t)a[2]*b[4]); + d2 = (uint64_t)a[0]*b[2] + (uint64_t)a[1]*b[1] + + (uint64_t)a[2]*b[0] + + 5 * ((uint64_t)a[4]*b[3] + (uint64_t)a[3]*b[4]); + d3 = (uint64_t)a[0]*b[3] + (uint64_t)a[1]*b[2] + + (uint64_t)a[2]*b[1] + (uint64_t)a[3]*b[0] + + 5 * ((uint64_t)a[4]*b[4]); + d4 = (uint64_t)a[0]*b[4] + (uint64_t)a[1]*b[3] + + (uint64_t)a[2]*b[2] + (uint64_t)a[3]*b[1] + + (uint64_t)a[4]*b[0]; + + /* single-pass carry propagation */ + c = d0 >> 26; d0 &= MASK26; d1 += c; + c = d1 >> 26; d1 &= MASK26; d2 += c; + c = d2 >> 26; d2 &= MASK26; d3 += c; + c = d3 >> 26; d3 &= MASK26; d4 += c; + c = d4 >> 26; d4 &= MASK26; d0 += c * 5; + c = d0 >> 26; d0 &= MASK26; d1 += c; + + out[0] = (uint32_t)d0; + out[1] = (uint32_t)d1; + out[2] = (uint32_t)d2; + out[3] = (uint32_t)d3; + out[4] = (uint32_t)d4; +} + +/* + * Parse 16 little-endian bytes plus a 'hibit' value (0 or 1) at bit + * 128 into 5 26-bit limbs. 'hibit' = 1 for a full block (the implicit + * "+ 2^128" of Poly1305). Partial blocks set the marker byte inside + * the 16-byte buffer and pass 'hibit' = 0. + */ +static inline void blk2limbs(const uint8_t m[16], uint32_t hibit, + uint32_t l[5]) { + uint32_t t0, t1, t2, t3; + memcpy(&t0, m, 4); + memcpy(&t1, m + 4, 4); + memcpy(&t2, m + 8, 4); + memcpy(&t3, m + 12, 4); + l[0] = t0 & MASK26; + l[1] = ((t0 >> 26) | (t1 << 6)) & MASK26; + l[2] = ((t1 >> 20) | (t2 << 12)) & MASK26; + l[3] = ((t2 >> 14) | (t3 << 18)) & MASK26; + l[4] = (t3 >> 8) | (hibit << 24); +} + +/* + * Compute a 16-byte Poly1305 MAC over 'msg' (length 'msglen') using + * the 32-byte 'key'. Writes the tag to 'mac_out'. 'key' is the + * unclamped raw 32 bytes — clamping happens inside. + */ +void poly1305_mac_arm(const uint8_t key[32], const uint8_t *msg, + size_t msglen, uint8_t mac_out[16]) { + /* clamp r (low 16 bytes of key, with specific bits cleared) */ + uint32_t t0, t1, t2, t3; + memcpy(&t0, key, 4); + memcpy(&t1, key + 4, 4); + memcpy(&t2, key + 8, 4); + memcpy(&t3, key + 12, 4); + t0 &= 0x0fffffffu; + t1 &= 0x0ffffffcu; + t2 &= 0x0ffffffcu; + t3 &= 0x0ffffffcu; + + /* r as 5 26-bit limbs */ + uint32_t r[5]; + r[0] = t0 & MASK26; + r[1] = ((t0 >> 26) | (t1 << 6)) & MASK26; + r[2] = ((t1 >> 20) | (t2 << 12)) & MASK26; + r[3] = ((t2 >> 14) | (t3 << 18)) & MASK26; + r[4] = (t3 >> 8); + + /* accumulator h starts at 0 */ + uint32_t h[5] = { 0, 0, 0, 0, 0 }; + + /* process full 16-byte blocks */ + size_t pos = 0; + while (pos + 16 <= msglen) { + uint32_t blk[5]; + blk2limbs(msg + pos, 1, blk); + + /* h += blk (limb-wise; later carry-propagated inside mul) */ + uint32_t hl[5]; + hl[0] = h[0] + blk[0]; + hl[1] = h[1] + blk[1]; + hl[2] = h[2] + blk[2]; + hl[3] = h[3] + blk[3]; + hl[4] = h[4] + blk[4]; + + /* h := (h + blk) * r mod p */ + mul_mod_p(hl, r, h); + + pos += 16; + } + + /* final partial block (1..15 trailing bytes) */ + if (pos < msglen) { + size_t rem = msglen - pos; + uint8_t pad[16] = { 0 }; + memcpy(pad, msg + pos, rem); + pad[rem] = 1; /* RFC 8439: append "1" byte, then zero-pad */ + + uint32_t blk[5]; + blk2limbs(pad, 0, blk); /* no hibit; we set the marker inside */ + + uint32_t hl[5]; + hl[0] = h[0] + blk[0]; + hl[1] = h[1] + blk[1]; + hl[2] = h[2] + blk[2]; + hl[3] = h[3] + blk[3]; + hl[4] = h[4] + blk[4]; + mul_mod_p(hl, r, h); + } + + /* mul_mod_p leaves a small excess in h[1]; absorb it before the + * final mod-p reduction. */ + { + uint32_t c; + c = h[1] >> 26; h[1] &= MASK26; h[2] += c; + c = h[2] >> 26; h[2] &= MASK26; h[3] += c; + c = h[3] >> 26; h[3] &= MASK26; h[4] += c; + c = h[4] >> 26; h[4] &= MASK26; h[0] += c * 5; + c = h[0] >> 26; h[0] &= MASK26; h[1] += c; + } + + /* full reduction to [0, p). Compute g = h + 5; if g overflows + * 2^130 (bit 130 set), then h >= p and we replace h with g (which + * equals h - p mod 2^130). Otherwise leave h alone. Done in + * constant time via a bitmask. */ + uint32_t g[5]; + uint32_t c = 5; + g[0] = h[0] + c; c = g[0] >> 26; g[0] &= MASK26; + g[1] = h[1] + c; c = g[1] >> 26; g[1] &= MASK26; + g[2] = h[2] + c; c = g[2] >> 26; g[2] &= MASK26; + g[3] = h[3] + c; c = g[3] >> 26; g[3] &= MASK26; + g[4] = h[4] + c; + uint32_t carry = g[4] >> 26; + g[4] &= MASK26; + uint32_t mask = (uint32_t)0 - carry; /* all-1s iff carry */ + h[0] = (h[0] & ~mask) | (g[0] & mask); + h[1] = (h[1] & ~mask) | (g[1] & mask); + h[2] = (h[2] & ~mask) | (g[2] & mask); + h[3] = (h[3] & ~mask) | (g[3] & mask); + h[4] = (h[4] & ~mask) | (g[4] & mask); + + /* repack 5x 26-bit limbs into 4x 32-bit limbs (low 128 bits) */ + uint32_t h0 = h[0] | (h[1] << 26); + uint32_t h1 = (h[1] >> 6) | (h[2] << 20); + uint32_t h2 = (h[2] >> 12) | (h[3] << 14); + uint32_t h3 = (h[3] >> 18) | (h[4] << 8); + + /* add s (high 16 bytes of key) as 4x little-endian Word32, mod + * 2^128 (drop the final 32-bit carry). */ + uint32_t s0, s1, s2, s3; + memcpy(&s0, key + 16, 4); + memcpy(&s1, key + 20, 4); + memcpy(&s2, key + 24, 4); + memcpy(&s3, key + 28, 4); + + uint64_t a0 = (uint64_t)h0 + s0; + uint64_t a1 = (uint64_t)h1 + s1 + (a0 >> 32); + uint64_t a2 = (uint64_t)h2 + s2 + (a1 >> 32); + uint64_t a3 = (uint64_t)h3 + s3 + (a2 >> 32); + + uint32_t o0 = (uint32_t)a0; + uint32_t o1 = (uint32_t)a1; + uint32_t o2 = (uint32_t)a2; + uint32_t o3 = (uint32_t)a3; + + memcpy(mac_out + 0, &o0, 4); + memcpy(mac_out + 4, &o1, 4); + memcpy(mac_out + 8, &o2, 4); + memcpy(mac_out + 12, &o3, 4); +} + +int poly1305_arm_available(void) { + return 1; +} + +#else + +void poly1305_mac_arm(const uint8_t *key, const uint8_t *msg, + size_t msglen, uint8_t *mac_out) { + (void)key; (void)msg; (void)msglen; (void)mac_out; +} + +int poly1305_arm_available(void) { + return 0; +} + +#endif diff --git a/lib/Crypto/MAC/Poly1305/Arm.hs b/lib/Crypto/MAC/Poly1305/Arm.hs @@ -0,0 +1,58 @@ +{-# OPTIONS_HADDOCK hide #-} +{-# LANGUAGE BangPatterns #-} + +-- | +-- Module: Crypto.MAC.Poly1305.Arm +-- Copyright: (c) 2025 Jared Tobin +-- License: MIT +-- Maintainer: Jared Tobin <jared@ppad.tech> +-- +-- ARM acceleration for the Poly1305 MAC. + +module Crypto.MAC.Poly1305.Arm ( + poly1305_arm_available + , mac + ) where + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Internal as BI +import Data.Word (Word8) +import Foreign.C.Types (CInt(..), CSize(..)) +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Ptr (Ptr, plusPtr) +import System.IO.Unsafe (unsafeDupablePerformIO) + +-- ffi ------------------------------------------------------------------------ + +foreign import ccall unsafe "poly1305_mac_arm" + c_poly1305_mac + :: Ptr Word8 -> Ptr Word8 -> CSize -> Ptr Word8 -> IO () + +foreign import ccall unsafe "poly1305_arm_available" + c_poly1305_arm_available :: IO CInt + +-- utilities ------------------------------------------------------------------ + +fi :: (Integral a, Num b) => a -> b +fi = fromIntegral +{-# INLINE fi #-} + +-- api ------------------------------------------------------------------------ + +-- | Are ARM extensions available? +poly1305_arm_available :: Bool +poly1305_arm_available = + unsafeDupablePerformIO c_poly1305_arm_available /= 0 +{-# NOINLINE poly1305_arm_available #-} + +-- | Compute a Poly1305 MAC over the message using the given (already- +-- validated 32-byte) key. +mac :: BS.ByteString -> BS.ByteString -> BS.ByteString +mac (BI.PS kfp koff _) (BI.PS mfp moff mlen) = + BI.unsafeCreate 16 $ \dst -> + withForeignPtr kfp $ \kp0 -> + withForeignPtr mfp $ \mp0 -> + c_poly1305_mac (kp0 `plusPtr` koff) + (mp0 `plusPtr` moff) + (fi mlen) + dst diff --git a/ppad-poly1305.cabal b/ppad-poly1305.cabal @@ -19,6 +19,11 @@ flag llvm default: False manual: True +flag sanitize + description: Build with AddressSanitizer and UndefinedBehaviorSanitizer. + default: False + manual: True + source-repository head type: git location: git.ppad.tech/poly1305.git @@ -32,10 +37,18 @@ library ghc-options: -fllvm -O2 exposed-modules: Crypto.MAC.Poly1305 + Crypto.MAC.Poly1305.Arm build-depends: base >= 4.9 && < 5 , bytestring >= 0.9 && < 0.13 , ppad-fixed >= 0.1.3 && < 0.2 + c-sources: + cbits/poly1305_arm.c + if arch(aarch64) + cc-options: -march=armv8-a + if flag(sanitize) + cc-options: -fsanitize=address,undefined -fno-omit-frame-pointer + ghc-options: -optl=-fsanitize=address,undefined test-suite poly1305-tests type: exitcode-stdio-1.0 @@ -45,6 +58,8 @@ test-suite poly1305-tests ghc-options: -rtsopts -Wall -O2 + if flag(sanitize) + ghc-options: -optl=-fsanitize=address,undefined build-depends: base