sha512

Pure Haskell SHA-512, HMAC-SHA512 (docs.ppad.tech/sha512).
git clone git://git.ppad.tech/sha512.git
Log | Files | Refs | README | LICENSE

commit bedc284df59e09089d4305acc1de764590298039
parent 4a3a490de1cf94ac3aafb4a5b9d07726bb8ecaee
Author: Jared Tobin <jared@jtobin.io>
Date:   Thu,  8 Jan 2026 15:58:46 +0400

lib: major refactoring

Diffstat:
MREADME.md | 27++++++++++++++-------------
Acbits/sha512_arm.c | 467+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlib/Crypto/Hash/SHA512.hs | 606++++++-------------------------------------------------------------------------
Alib/Crypto/Hash/SHA512/Arm.hs | 126+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/Crypto/Hash/SHA512/Internal.hs | 428+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alib/Crypto/Hash/SHA512/Lazy.hs | 182+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mppad-sha512.cabal | 7+++++++
7 files changed, 1269 insertions(+), 574 deletions(-)

diff --git a/README.md b/README.md @@ -5,7 +5,8 @@ [![](https://img.shields.io/badge/haddock-sha512-lightblue)](https://docs.ppad.tech/sha512) A pure Haskell implementation of SHA-512 and HMAC-SHA512 on strict and -lazy ByteStrings, as specified by RFC's [6234][r6234] and [2104][r2104]. +lazy ByteStrings, as specified by RFC's [6234][r6234] and [2104][r2104], +that uses ARM SHA2 intrinsics when available. ## Usage @@ -55,24 +56,24 @@ Haddocks (API documentation, etc.) are hosted at ## Performance -The aim is best-in-class performance for pure, highly-auditable Haskell -code. - -Current benchmark figures on an M4 Silicon MacBook Air look like (use -`cabal bench` to run the benchmark suite): +The aim is best-in-class performance. Current benchmark figures on an +M4 Silicon MacBook Air, where we avail of hardware acceleration via +ARM cryptography extensions, look like (use `cabal bench` to run the +benchmark suite): ``` benchmarking ppad-sha512/SHA512 (32B input)/hash - time 237.8 ns (237.4 ns .. 238.2 ns) - 1.000 R² (1.000 R² .. 1.000 R²) - mean 237.3 ns (237.1 ns .. 237.8 ns) - std dev 1.226 ns (854.2 ps .. 1.982 ns) + time 111.1 ns (110.2 ns .. 111.6 ns) + 1.000 R² (0.999 R² .. 1.000 R²) + mean 108.6 ns (107.8 ns .. 109.5 ns) + std dev 2.951 ns (2.637 ns .. 3.334 ns) + variance introduced by outliers: 41% (moderately inflated) benchmarking ppad-sha512/HMAC-SHA512 (32B input)/hmac - time 1.017 μs (1.013 μs .. 1.021 μs) + time 469.1 ns (468.2 ns .. 470.0 ns) 1.000 R² (1.000 R² .. 1.000 R²) - mean 1.015 μs (1.014 μs .. 1.018 μs) - std dev 7.727 ns (6.045 ns .. 9.684 ns) + mean 468.6 ns (467.7 ns .. 469.3 ns) + std dev 2.809 ns (2.317 ns .. 3.492 ns) ``` You should compile with the 'llvm' flag for maximum performance. diff --git a/cbits/sha512_arm.c b/cbits/sha512_arm.c @@ -0,0 +1,467 @@ +#include <stdint.h> +#include <string.h> + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA512) + +#include <arm_neon.h> + +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +/* + * Process one 128-byte block using ARM SHA512 crypto instructions. + * + * state: pointer to 8 uint64_t words (a,b,c,d,e,f,g,h) + * block: pointer to 128 bytes of message data + * + * The state is updated in place. + */ +void sha512_block_arm(uint64_t *state, const uint8_t *block) { + /* Load current hash state */ + uint64x2_t ab = vld1q_u64(&state[0]); + uint64x2_t cd = vld1q_u64(&state[2]); + uint64x2_t ef = vld1q_u64(&state[4]); + uint64x2_t gh = vld1q_u64(&state[6]); + + /* Save original for final addition */ + uint64x2_t ab_orig = ab; + uint64x2_t cd_orig = cd; + uint64x2_t ef_orig = ef; + uint64x2_t gh_orig = gh; + + /* Load message and convert from big-endian */ + uint64x2_t m0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[0]))); + uint64x2_t m1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[16]))); + uint64x2_t m2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[32]))); + uint64x2_t m3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[48]))); + uint64x2_t m4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[64]))); + uint64x2_t m5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[80]))); + uint64x2_t m6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[96]))); + uint64x2_t m7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(&block[112]))); + + uint64x2_t tmp; + + /* Rounds 0-1 */ + tmp = vaddq_u64(m0, vld1q_u64(&K[0])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); + + /* Rounds 2-3 */ + tmp = vaddq_u64(m1, vld1q_u64(&K[2])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1)); + + /* Rounds 4-5 */ + tmp = vaddq_u64(m2, vld1q_u64(&K[4])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1)); + + /* Rounds 6-7 */ + tmp = vaddq_u64(m3, vld1q_u64(&K[6])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1)); + + /* Rounds 8-9 */ + tmp = vaddq_u64(m4, vld1q_u64(&K[8])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1)); + + /* Rounds 10-11 */ + tmp = vaddq_u64(m5, vld1q_u64(&K[10])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1)); + + /* Rounds 12-13 */ + tmp = vaddq_u64(m6, vld1q_u64(&K[12])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1)); + + /* Rounds 14-15 */ + tmp = vaddq_u64(m7, vld1q_u64(&K[14])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1)); + + /* Rounds 16-17 */ + tmp = vaddq_u64(m0, vld1q_u64(&K[16])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); + + /* Rounds 18-19 */ + tmp = vaddq_u64(m1, vld1q_u64(&K[18])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1)); + + /* Rounds 20-21 */ + tmp = vaddq_u64(m2, vld1q_u64(&K[20])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1)); + + /* Rounds 22-23 */ + tmp = vaddq_u64(m3, vld1q_u64(&K[22])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1)); + + /* Rounds 24-25 */ + tmp = vaddq_u64(m4, vld1q_u64(&K[24])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1)); + + /* Rounds 26-27 */ + tmp = vaddq_u64(m5, vld1q_u64(&K[26])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1)); + + /* Rounds 28-29 */ + tmp = vaddq_u64(m6, vld1q_u64(&K[28])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1)); + + /* Rounds 30-31 */ + tmp = vaddq_u64(m7, vld1q_u64(&K[30])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1)); + + /* Rounds 32-33 */ + tmp = vaddq_u64(m0, vld1q_u64(&K[32])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); + + /* Rounds 34-35 */ + tmp = vaddq_u64(m1, vld1q_u64(&K[34])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1)); + + /* Rounds 36-37 */ + tmp = vaddq_u64(m2, vld1q_u64(&K[36])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1)); + + /* Rounds 38-39 */ + tmp = vaddq_u64(m3, vld1q_u64(&K[38])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1)); + + /* Rounds 40-41 */ + tmp = vaddq_u64(m4, vld1q_u64(&K[40])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1)); + + /* Rounds 42-43 */ + tmp = vaddq_u64(m5, vld1q_u64(&K[42])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1)); + + /* Rounds 44-45 */ + tmp = vaddq_u64(m6, vld1q_u64(&K[44])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1)); + + /* Rounds 46-47 */ + tmp = vaddq_u64(m7, vld1q_u64(&K[46])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1)); + + /* Rounds 48-49 */ + tmp = vaddq_u64(m0, vld1q_u64(&K[48])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); + + /* Rounds 50-51 */ + tmp = vaddq_u64(m1, vld1q_u64(&K[50])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m1 = vsha512su1q_u64(vsha512su0q_u64(m1, m2), m0, vextq_u64(m5, m6, 1)); + + /* Rounds 52-53 */ + tmp = vaddq_u64(m2, vld1q_u64(&K[52])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m2 = vsha512su1q_u64(vsha512su0q_u64(m2, m3), m1, vextq_u64(m6, m7, 1)); + + /* Rounds 54-55 */ + tmp = vaddq_u64(m3, vld1q_u64(&K[54])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m3 = vsha512su1q_u64(vsha512su0q_u64(m3, m4), m2, vextq_u64(m7, m0, 1)); + + /* Rounds 56-57 */ + tmp = vaddq_u64(m4, vld1q_u64(&K[56])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + m4 = vsha512su1q_u64(vsha512su0q_u64(m4, m5), m3, vextq_u64(m0, m1, 1)); + + /* Rounds 58-59 */ + tmp = vaddq_u64(m5, vld1q_u64(&K[58])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + m5 = vsha512su1q_u64(vsha512su0q_u64(m5, m6), m4, vextq_u64(m1, m2, 1)); + + /* Rounds 60-61 */ + tmp = vaddq_u64(m6, vld1q_u64(&K[60])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + m6 = vsha512su1q_u64(vsha512su0q_u64(m6, m7), m5, vextq_u64(m2, m3, 1)); + + /* Rounds 62-63 */ + tmp = vaddq_u64(m7, vld1q_u64(&K[62])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + m7 = vsha512su1q_u64(vsha512su0q_u64(m7, m0), m6, vextq_u64(m3, m4, 1)); + + /* Rounds 64-65 */ + tmp = vaddq_u64(m0, vld1q_u64(&K[64])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + + /* Rounds 66-67 */ + tmp = vaddq_u64(m1, vld1q_u64(&K[66])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + + /* Rounds 68-69 */ + tmp = vaddq_u64(m2, vld1q_u64(&K[68])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + + /* Rounds 70-71 */ + tmp = vaddq_u64(m3, vld1q_u64(&K[70])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + + /* Rounds 72-73 */ + tmp = vaddq_u64(m4, vld1q_u64(&K[72])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(gh, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); + gh = vsha512h2q_u64(tmp, cd, ab); + cd = vaddq_u64(cd, tmp); + + /* Rounds 74-75 */ + tmp = vaddq_u64(m5, vld1q_u64(&K[74])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ef, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); + ef = vsha512h2q_u64(tmp, ab, gh); + ab = vaddq_u64(ab, tmp); + + /* Rounds 76-77 */ + tmp = vaddq_u64(m6, vld1q_u64(&K[76])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(cd, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); + cd = vsha512h2q_u64(tmp, gh, ef); + gh = vaddq_u64(gh, tmp); + + /* Rounds 78-79 */ + tmp = vaddq_u64(m7, vld1q_u64(&K[78])); + tmp = vextq_u64(tmp, tmp, 1); + tmp = vaddq_u64(ab, tmp); + tmp = vsha512hq_u64(tmp, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); + ab = vsha512h2q_u64(tmp, ef, cd); + ef = vaddq_u64(ef, tmp); + + /* Add original state back */ + ab = vaddq_u64(ab, ab_orig); + cd = vaddq_u64(cd, cd_orig); + ef = vaddq_u64(ef, ef_orig); + gh = vaddq_u64(gh, gh_orig); + + /* Store result */ + vst1q_u64(&state[0], ab); + vst1q_u64(&state[2], cd); + vst1q_u64(&state[4], ef); + vst1q_u64(&state[6], gh); +} + +/* Return 1 if ARM SHA512 is available, 0 otherwise */ +int sha512_arm_available(void) { + return 1; +} + +#else + +/* Stub implementations when ARM SHA512 is not available */ +void sha512_block_arm(uint64_t *state, const uint8_t *block) { + (void)state; + (void)block; + /* Should never be called - use pure Haskell fallback */ +} + +int sha512_arm_available(void) { + return 0; +} + +#endif diff --git a/lib/Crypto/Hash/SHA512.hs b/lib/Crypto/Hash/SHA512.hs @@ -1,7 +1,4 @@ -{-# OPTIONS_GHC -funbox-small-strict-fields #-} {-# LANGUAGE BangPatterns #-} -{-# LANGUAGE RecordWildCards #-} -{-# LANGUAGE ViewPatterns #-} -- | -- Module: Crypto.Hash.SHA512 @@ -9,7 +6,7 @@ -- License: MIT -- Maintainer: Jared Tobin <jared@ppad.tech> -- --- Pure SHA-512 and HMAC-SHA512 implementations for +-- SHA-512 and HMAC-SHA512 implementations for -- strict and lazy ByteStrings, as specified by RFC's -- [6234](https://datatracker.ietf.org/doc/html/rfc6234) and -- [2104](https://datatracker.ietf.org/doc/html/rfc2104). @@ -17,496 +14,29 @@ module Crypto.Hash.SHA512 ( -- * SHA-512 message digest functions hash - , hash_lazy + , Lazy.hash_lazy -- * SHA512-based MAC functions , hmac - , hmac_lazy + , Lazy.hmac_lazy ) where import qualified Data.Bits as B -import Data.Bits ((.|.), (.&.)) import qualified Data.ByteString as BS -import qualified Data.ByteString.Builder as BSB -import qualified Data.ByteString.Builder.Extra as BE import qualified Data.ByteString.Internal as BI -import qualified Data.ByteString.Lazy as BL -import qualified Data.ByteString.Lazy.Internal as BLI import qualified Data.ByteString.Unsafe as BU import Data.Word (Word64) -import Foreign.ForeignPtr (plusForeignPtr) +import Crypto.Hash.SHA512.Arm +import Crypto.Hash.SHA512.Internal +import qualified Crypto.Hash.SHA512.Lazy as Lazy --- preliminary utils ---------------------------------------------------------- +-- utils --------------------------------------------------------------------- --- keystroke saver fi :: (Integral a, Num b) => a -> b fi = fromIntegral {-# INLINE fi #-} --- parse strict ByteString in BE order to Word64 (verbatim from --- Data.Binary) --- --- invariant: --- the input bytestring is at least 64 bits in length -unsafe_word64be :: BS.ByteString -> Word64 -unsafe_word64be s = - (fi (s `BU.unsafeIndex` 0) `B.unsafeShiftL` 56) .|. - (fi (s `BU.unsafeIndex` 1) `B.unsafeShiftL` 48) .|. - (fi (s `BU.unsafeIndex` 2) `B.unsafeShiftL` 40) .|. - (fi (s `BU.unsafeIndex` 3) `B.unsafeShiftL` 32) .|. - (fi (s `BU.unsafeIndex` 4) `B.unsafeShiftL` 24) .|. - (fi (s `BU.unsafeIndex` 5) `B.unsafeShiftL` 16) .|. - (fi (s `BU.unsafeIndex` 6) `B.unsafeShiftL` 8) .|. - (fi (s `BU.unsafeIndex` 7) ) -{-# INLINE unsafe_word64be #-} - --- utility types for more efficient ByteString management - -data SSPair = SSPair - {-# UNPACK #-} !BS.ByteString - {-# UNPACK #-} !BS.ByteString - -data SLPair = SLPair {-# UNPACK #-} !BS.ByteString !BL.ByteString - -data WSPair = WSPair {-# UNPACK #-} !Word64 {-# UNPACK #-} !BS.ByteString - --- unsafe version of splitAt that does no bounds checking --- --- invariant: --- 0 <= n <= l -unsafe_splitAt :: Int -> BS.ByteString -> SSPair -unsafe_splitAt n (BI.BS x l) = - SSPair (BI.BS x n) (BI.BS (plusForeignPtr x n) (l - n)) - --- variant of Data.ByteString.Lazy.splitAt that returns the initial --- component as a strict, unboxed ByteString -splitAt128 :: BL.ByteString -> SLPair -splitAt128 = splitAt' (128 :: Int) where - splitAt' _ BLI.Empty = SLPair mempty BLI.Empty - splitAt' n (BLI.Chunk c@(BI.PS _ _ l) cs) = - if n < l - then - -- n < BS.length c, so unsafe_splitAt is safe - let !(SSPair c0 c1) = unsafe_splitAt n c - in SLPair c0 (BLI.Chunk c1 cs) - else - let SLPair cs' cs'' = splitAt' (n - l) cs - in SLPair (c <> cs') cs'' - --- variant of Data.ByteString.splitAt that behaves like an incremental --- Word64 parser --- --- invariant: --- the input bytestring is at least 64 bits in length -unsafe_parseWsPair :: BS.ByteString -> WSPair -unsafe_parseWsPair (BI.BS x l) = - WSPair (unsafe_word64be (BI.BS x 8)) (BI.BS (plusForeignPtr x 8) (l - 8)) -{-# INLINE unsafe_parseWsPair #-} - --- builder realization strategies - -to_strict :: BSB.Builder -> BS.ByteString -to_strict = BL.toStrict . BSB.toLazyByteString - -to_strict_small :: BSB.Builder -> BS.ByteString -to_strict_small = BL.toStrict . BE.toLazyByteStringWith - (BE.safeStrategy 128 BE.smallChunkSize) mempty - --- message padding and parsing ------------------------------------------------ --- https://datatracker.ietf.org/doc/html/rfc6234#section-4.1 - --- k such that (l + 1 + k) mod 128 = 112 -sol :: Word64 -> Word64 -sol l = - let r = 112 - fi l `rem` 128 - 1 :: Integer -- fi prevents underflow - in fi (if r < 0 then r + 128 else r) - --- RFC 6234 4.1 (strict) -pad :: BS.ByteString -> BS.ByteString -pad m@(BI.PS _ _ (fi -> l)) - | l < 128 = to_strict_small padded - | otherwise = to_strict padded - where - padded = BSB.byteString m - <> fill (sol l) (BSB.word8 0x80) - <> BSB.word64BE 0x00 - <> BSB.word64BE (l * 8) - - fill j !acc - | j `rem` 8 == 0 = - loop64 j acc - | (j - 7) `rem` 8 == 0 = - loop64 (j - 7) acc - <> BSB.word32BE 0x00 - <> BSB.word16BE 0x00 - <> BSB.word8 0x00 - | (j - 6) `rem` 8 == 0 = - loop64 (j - 6) acc - <> BSB.word32BE 0x00 - <> BSB.word16BE 0x00 - | (j - 5) `rem` 8 == 0 = - loop64 (j - 5) acc - <> BSB.word32BE 0x00 - <> BSB.word8 0x00 - | (j - 4) `rem` 8 == 0 = - loop64 (j - 4) acc - <> BSB.word32BE 0x00 - | (j - 3) `rem` 8 == 0 = - loop64 (j - 3) acc - <> BSB.word16BE 0x00 - <> BSB.word8 0x00 - | (j - 2) `rem` 8 == 0 = - loop64 (j - 2) acc - <> BSB.word16BE 0x00 - | (j - 1) `rem` 8 == 0 = - loop64 (j - 1) acc - <> BSB.word8 0x00 - - | j `rem` 4 == 0 = - loop32 j acc - | (j - 3) `rem` 4 == 0 = - loop32 (j - 3) acc - <> BSB.word16BE 0x00 - <> BSB.word8 0x00 - | (j - 2) `rem` 4 == 0 = - loop32 (j - 2) acc - <> BSB.word16BE 0x00 - | (j - 1) `rem` 4 == 0 = - loop32 (j - 1) acc - <> BSB.word8 0x00 - - | j `rem` 2 == 0 = - loop16 j acc - | (j - 1) `rem` 2 == 0 = - loop16 (j - 1) acc - <> BSB.word8 0x00 - - | otherwise = - loop8 j acc - - loop64 j !acc - | j == 0 = acc - | otherwise = loop64 (j - 8) (acc <> BSB.word64BE 0x00) - - loop32 j !acc - | j == 0 = acc - | otherwise = loop32 (j - 4) (acc <> BSB.word32BE 0x00) - - loop16 j !acc - | j == 0 = acc - | otherwise = loop16 (j - 2) (acc <> BSB.word16BE 0x00) - - loop8 j !acc - | j == 0 = acc - | otherwise = loop8 (pred j) (acc <> BSB.word8 0x00) - --- RFC 6234 4.1 (lazy) -pad_lazy :: BL.ByteString -> BL.ByteString -pad_lazy (BL.toChunks -> m) = BL.fromChunks (walk 0 m) where - walk !l bs = case bs of - (c:cs) -> c : walk (l + fi (BS.length c)) cs - [] -> padding l (sol l) (BSB.word8 0x80) - - padding l k bs - | k == 0 = - pure - . to_strict - $ bs <> BSB.word64BE 0x00 <> BSB.word64BE (l * 8) - | otherwise = - let nacc = bs <> BSB.word8 0x00 - in padding l (pred k) nacc - --- functions and constants used ----------------------------------------------- --- https://datatracker.ietf.org/doc/html/rfc6234#section-5.1 - -ch :: Word64 -> Word64 -> Word64 -> Word64 -ch x y z = (x .&. y) `B.xor` (B.complement x .&. z) -{-# INLINE ch #-} - --- credit to SHA authors for the following optimisation. their text: --- --- > note: --- > the original functions is (x & y) ^ (x & z) ^ (y & z) --- > if you fire off truth tables, this is equivalent to --- > (x & y) | (x & z) | (y & z) --- > which you can the use distribution on: --- > (x & (y | z)) | (y & z) --- > which saves us one operation. -maj :: Word64 -> Word64 -> Word64 -> Word64 -maj x y z = (x .&. (y .|. z)) .|. (y .&. z) -{-# INLINE maj #-} - -bsig0 :: Word64 -> Word64 -bsig0 x = B.rotateR x 28 `B.xor` B.rotateR x 34 `B.xor` B.rotateR x 39 -{-# INLINE bsig0 #-} - -bsig1 :: Word64 -> Word64 -bsig1 x = B.rotateR x 14 `B.xor` B.rotateR x 18 `B.xor` B.rotateR x 41 -{-# INLINE bsig1 #-} - -ssig0 :: Word64 -> Word64 -ssig0 x = B.rotateR x 1 `B.xor` B.rotateR x 8 `B.xor` B.unsafeShiftR x 7 -{-# INLINE ssig0 #-} - -ssig1 :: Word64 -> Word64 -ssig1 x = B.rotateR x 19 `B.xor` B.rotateR x 61 `B.xor` B.unsafeShiftR x 6 -{-# INLINE ssig1 #-} - -data Schedule = Schedule { - w00 :: !Word64, w01 :: !Word64, w02 :: !Word64, w03 :: !Word64 - , w04 :: !Word64, w05 :: !Word64, w06 :: !Word64, w07 :: !Word64 - , w08 :: !Word64, w09 :: !Word64, w10 :: !Word64, w11 :: !Word64 - , w12 :: !Word64, w13 :: !Word64, w14 :: !Word64, w15 :: !Word64 - , w16 :: !Word64, w17 :: !Word64, w18 :: !Word64, w19 :: !Word64 - , w20 :: !Word64, w21 :: !Word64, w22 :: !Word64, w23 :: !Word64 - , w24 :: !Word64, w25 :: !Word64, w26 :: !Word64, w27 :: !Word64 - , w28 :: !Word64, w29 :: !Word64, w30 :: !Word64, w31 :: !Word64 - , w32 :: !Word64, w33 :: !Word64, w34 :: !Word64, w35 :: !Word64 - , w36 :: !Word64, w37 :: !Word64, w38 :: !Word64, w39 :: !Word64 - , w40 :: !Word64, w41 :: !Word64, w42 :: !Word64, w43 :: !Word64 - , w44 :: !Word64, w45 :: !Word64, w46 :: !Word64, w47 :: !Word64 - , w48 :: !Word64, w49 :: !Word64, w50 :: !Word64, w51 :: !Word64 - , w52 :: !Word64, w53 :: !Word64, w54 :: !Word64, w55 :: !Word64 - , w56 :: !Word64, w57 :: !Word64, w58 :: !Word64, w59 :: !Word64 - , w60 :: !Word64, w61 :: !Word64, w62 :: !Word64, w63 :: !Word64 - , w64 :: !Word64, w65 :: !Word64, w66 :: !Word64, w67 :: !Word64 - , w68 :: !Word64, w69 :: !Word64, w70 :: !Word64, w71 :: !Word64 - , w72 :: !Word64, w73 :: !Word64, w74 :: !Word64, w75 :: !Word64 - , w76 :: !Word64, w77 :: !Word64, w78 :: !Word64, w79 :: !Word64 - } - --- initialization ------------------------------------------------------------- --- https://datatracker.ietf.org/doc/html/rfc6234#section-6.1 - -data Registers = Registers { - h0 :: !Word64, h1 :: !Word64, h2 :: !Word64, h3 :: !Word64 - , h4 :: !Word64, h5 :: !Word64, h6 :: !Word64, h7 :: !Word64 - } - --- first 64 bits of the fractional parts of the square roots of the --- first eight primes -iv :: Registers -iv = Registers - 0x6a09e667f3bcc908 0xbb67ae8584caa73b 0x3c6ef372fe94f82b 0xa54ff53a5f1d36f1 - 0x510e527fade682d1 0x9b05688c2b3e6c1f 0x1f83d9abfb41bd6b 0x5be0cd19137e2179 - --- processing ----------------------------------------------------------------- --- https://datatracker.ietf.org/doc/html/rfc6234#section-6.2 - -data Block = Block { - m00 :: !Word64, m01 :: !Word64, m02 :: !Word64, m03 :: !Word64 - , m04 :: !Word64, m05 :: !Word64, m06 :: !Word64, m07 :: !Word64 - , m08 :: !Word64, m09 :: !Word64, m10 :: !Word64, m11 :: !Word64 - , m12 :: !Word64, m13 :: !Word64, m14 :: !Word64, m15 :: !Word64 - } - --- parse strict bytestring to block --- --- invariant: --- the input bytestring is exactly 1024 bits long -unsafe_parse :: BS.ByteString -> Block -unsafe_parse bs = - let !(WSPair m00 t00) = unsafe_parseWsPair bs - !(WSPair m01 t01) = unsafe_parseWsPair t00 - !(WSPair m02 t02) = unsafe_parseWsPair t01 - !(WSPair m03 t03) = unsafe_parseWsPair t02 - !(WSPair m04 t04) = unsafe_parseWsPair t03 - !(WSPair m05 t05) = unsafe_parseWsPair t04 - !(WSPair m06 t06) = unsafe_parseWsPair t05 - !(WSPair m07 t07) = unsafe_parseWsPair t06 - !(WSPair m08 t08) = unsafe_parseWsPair t07 - !(WSPair m09 t09) = unsafe_parseWsPair t08 - !(WSPair m10 t10) = unsafe_parseWsPair t09 - !(WSPair m11 t11) = unsafe_parseWsPair t10 - !(WSPair m12 t12) = unsafe_parseWsPair t11 - !(WSPair m13 t13) = unsafe_parseWsPair t12 - !(WSPair m14 t14) = unsafe_parseWsPair t13 - !(WSPair m15 t15) = unsafe_parseWsPair t14 - in if BS.null t15 - then Block {..} - else error "ppad-sha512: internal error (bytes remaining)" - --- RFC 6234 6.2 step 1 -prepare_schedule :: Block -> Schedule -prepare_schedule Block {..} = Schedule {..} where - w00 = m00; w01 = m01; w02 = m02; w03 = m03 - w04 = m04; w05 = m05; w06 = m06; w07 = m07 - w08 = m08; w09 = m09; w10 = m10; w11 = m11 - w12 = m12; w13 = m13; w14 = m14; w15 = m15 - w16 = ssig1 w14 + w09 + ssig0 w01 + w00 - w17 = ssig1 w15 + w10 + ssig0 w02 + w01 - w18 = ssig1 w16 + w11 + ssig0 w03 + w02 - w19 = ssig1 w17 + w12 + ssig0 w04 + w03 - w20 = ssig1 w18 + w13 + ssig0 w05 + w04 - w21 = ssig1 w19 + w14 + ssig0 w06 + w05 - w22 = ssig1 w20 + w15 + ssig0 w07 + w06 - w23 = ssig1 w21 + w16 + ssig0 w08 + w07 - w24 = ssig1 w22 + w17 + ssig0 w09 + w08 - w25 = ssig1 w23 + w18 + ssig0 w10 + w09 - w26 = ssig1 w24 + w19 + ssig0 w11 + w10 - w27 = ssig1 w25 + w20 + ssig0 w12 + w11 - w28 = ssig1 w26 + w21 + ssig0 w13 + w12 - w29 = ssig1 w27 + w22 + ssig0 w14 + w13 - w30 = ssig1 w28 + w23 + ssig0 w15 + w14 - w31 = ssig1 w29 + w24 + ssig0 w16 + w15 - w32 = ssig1 w30 + w25 + ssig0 w17 + w16 - w33 = ssig1 w31 + w26 + ssig0 w18 + w17 - w34 = ssig1 w32 + w27 + ssig0 w19 + w18 - w35 = ssig1 w33 + w28 + ssig0 w20 + w19 - w36 = ssig1 w34 + w29 + ssig0 w21 + w20 - w37 = ssig1 w35 + w30 + ssig0 w22 + w21 - w38 = ssig1 w36 + w31 + ssig0 w23 + w22 - w39 = ssig1 w37 + w32 + ssig0 w24 + w23 - w40 = ssig1 w38 + w33 + ssig0 w25 + w24 - w41 = ssig1 w39 + w34 + ssig0 w26 + w25 - w42 = ssig1 w40 + w35 + ssig0 w27 + w26 - w43 = ssig1 w41 + w36 + ssig0 w28 + w27 - w44 = ssig1 w42 + w37 + ssig0 w29 + w28 - w45 = ssig1 w43 + w38 + ssig0 w30 + w29 - w46 = ssig1 w44 + w39 + ssig0 w31 + w30 - w47 = ssig1 w45 + w40 + ssig0 w32 + w31 - w48 = ssig1 w46 + w41 + ssig0 w33 + w32 - w49 = ssig1 w47 + w42 + ssig0 w34 + w33 - w50 = ssig1 w48 + w43 + ssig0 w35 + w34 - w51 = ssig1 w49 + w44 + ssig0 w36 + w35 - w52 = ssig1 w50 + w45 + ssig0 w37 + w36 - w53 = ssig1 w51 + w46 + ssig0 w38 + w37 - w54 = ssig1 w52 + w47 + ssig0 w39 + w38 - w55 = ssig1 w53 + w48 + ssig0 w40 + w39 - w56 = ssig1 w54 + w49 + ssig0 w41 + w40 - w57 = ssig1 w55 + w50 + ssig0 w42 + w41 - w58 = ssig1 w56 + w51 + ssig0 w43 + w42 - w59 = ssig1 w57 + w52 + ssig0 w44 + w43 - w60 = ssig1 w58 + w53 + ssig0 w45 + w44 - w61 = ssig1 w59 + w54 + ssig0 w46 + w45 - w62 = ssig1 w60 + w55 + ssig0 w47 + w46 - w63 = ssig1 w61 + w56 + ssig0 w48 + w47 - w64 = ssig1 w62 + w57 + ssig0 w49 + w48 - w65 = ssig1 w63 + w58 + ssig0 w50 + w49 - w66 = ssig1 w64 + w59 + ssig0 w51 + w50 - w67 = ssig1 w65 + w60 + ssig0 w52 + w51 - w68 = ssig1 w66 + w61 + ssig0 w53 + w52 - w69 = ssig1 w67 + w62 + ssig0 w54 + w53 - w70 = ssig1 w68 + w63 + ssig0 w55 + w54 - w71 = ssig1 w69 + w64 + ssig0 w56 + w55 - w72 = ssig1 w70 + w65 + ssig0 w57 + w56 - w73 = ssig1 w71 + w66 + ssig0 w58 + w57 - w74 = ssig1 w72 + w67 + ssig0 w59 + w58 - w75 = ssig1 w73 + w68 + ssig0 w60 + w59 - w76 = ssig1 w74 + w69 + ssig0 w61 + w60 - w77 = ssig1 w75 + w70 + ssig0 w62 + w61 - w78 = ssig1 w76 + w71 + ssig0 w63 + w62 - w79 = ssig1 w77 + w72 + ssig0 w64 + w63 - --- RFC 6234 6.2 steps 2, 3, 4 -block_hash :: Registers -> Schedule -> Registers -block_hash r00@Registers {..} Schedule {..} = - -- constants are the first 64 bits of the fractional parts of the - -- cube roots of the first eighty prime numbers - let r01 = step r00 0x428a2f98d728ae22 w00 - r02 = step r01 0x7137449123ef65cd w01 - r03 = step r02 0xb5c0fbcfec4d3b2f w02 - r04 = step r03 0xe9b5dba58189dbbc w03 - r05 = step r04 0x3956c25bf348b538 w04 - r06 = step r05 0x59f111f1b605d019 w05 - r07 = step r06 0x923f82a4af194f9b w06 - r08 = step r07 0xab1c5ed5da6d8118 w07 - r09 = step r08 0xd807aa98a3030242 w08 - r10 = step r09 0x12835b0145706fbe w09 - r11 = step r10 0x243185be4ee4b28c w10 - r12 = step r11 0x550c7dc3d5ffb4e2 w11 - r13 = step r12 0x72be5d74f27b896f w12 - r14 = step r13 0x80deb1fe3b1696b1 w13 - r15 = step r14 0x9bdc06a725c71235 w14 - r16 = step r15 0xc19bf174cf692694 w15 - r17 = step r16 0xe49b69c19ef14ad2 w16 - r18 = step r17 0xefbe4786384f25e3 w17 - r19 = step r18 0x0fc19dc68b8cd5b5 w18 - r20 = step r19 0x240ca1cc77ac9c65 w19 - r21 = step r20 0x2de92c6f592b0275 w20 - r22 = step r21 0x4a7484aa6ea6e483 w21 - r23 = step r22 0x5cb0a9dcbd41fbd4 w22 - r24 = step r23 0x76f988da831153b5 w23 - r25 = step r24 0x983e5152ee66dfab w24 - r26 = step r25 0xa831c66d2db43210 w25 - r27 = step r26 0xb00327c898fb213f w26 - r28 = step r27 0xbf597fc7beef0ee4 w27 - r29 = step r28 0xc6e00bf33da88fc2 w28 - r30 = step r29 0xd5a79147930aa725 w29 - r31 = step r30 0x06ca6351e003826f w30 - r32 = step r31 0x142929670a0e6e70 w31 - r33 = step r32 0x27b70a8546d22ffc w32 - r34 = step r33 0x2e1b21385c26c926 w33 - r35 = step r34 0x4d2c6dfc5ac42aed w34 - r36 = step r35 0x53380d139d95b3df w35 - r37 = step r36 0x650a73548baf63de w36 - r38 = step r37 0x766a0abb3c77b2a8 w37 - r39 = step r38 0x81c2c92e47edaee6 w38 - r40 = step r39 0x92722c851482353b w39 - r41 = step r40 0xa2bfe8a14cf10364 w40 - r42 = step r41 0xa81a664bbc423001 w41 - r43 = step r42 0xc24b8b70d0f89791 w42 - r44 = step r43 0xc76c51a30654be30 w43 - r45 = step r44 0xd192e819d6ef5218 w44 - r46 = step r45 0xd69906245565a910 w45 - r47 = step r46 0xf40e35855771202a w46 - r48 = step r47 0x106aa07032bbd1b8 w47 - r49 = step r48 0x19a4c116b8d2d0c8 w48 - r50 = step r49 0x1e376c085141ab53 w49 - r51 = step r50 0x2748774cdf8eeb99 w50 - r52 = step r51 0x34b0bcb5e19b48a8 w51 - r53 = step r52 0x391c0cb3c5c95a63 w52 - r54 = step r53 0x4ed8aa4ae3418acb w53 - r55 = step r54 0x5b9cca4f7763e373 w54 - r56 = step r55 0x682e6ff3d6b2b8a3 w55 - r57 = step r56 0x748f82ee5defb2fc w56 - r58 = step r57 0x78a5636f43172f60 w57 - r59 = step r58 0x84c87814a1f0ab72 w58 - r60 = step r59 0x8cc702081a6439ec w59 - r61 = step r60 0x90befffa23631e28 w60 - r62 = step r61 0xa4506cebde82bde9 w61 - r63 = step r62 0xbef9a3f7b2c67915 w62 - r64 = step r63 0xc67178f2e372532b w63 - r65 = step r64 0xca273eceea26619c w64 - r66 = step r65 0xd186b8c721c0c207 w65 - r67 = step r66 0xeada7dd6cde0eb1e w66 - r68 = step r67 0xf57d4f7fee6ed178 w67 - r69 = step r68 0x06f067aa72176fba w68 - r70 = step r69 0x0a637dc5a2c898a6 w69 - r71 = step r70 0x113f9804bef90dae w70 - r72 = step r71 0x1b710b35131c471b w71 - r73 = step r72 0x28db77f523047d84 w72 - r74 = step r73 0x32caab7b40c72493 w73 - r75 = step r74 0x3c9ebe0a15c9bebc w74 - r76 = step r75 0x431d67c49c100d4c w75 - r77 = step r76 0x4cc5d4becb3e42b6 w76 - r78 = step r77 0x597f299cfc657e2a w77 - r79 = step r78 0x5fcb6fab3ad6faec w78 - r80 = step r79 0x6c44198c4a475817 w79 - !(Registers a b c d e f g h) = r80 - in Registers - (a + h0) (b + h1) (c + h2) (d + h3) - (e + h4) (f + h5) (g + h6) (h + h7) - -step :: Registers -> Word64 -> Word64 -> Registers -step (Registers a b c d e f g h) k w = - let t1 = h + bsig1 e + ch e f g + k + w - t2 = bsig0 a + maj a b c - in Registers (t1 + t2) a b c (d + t1) e f g -{-# INLINE step #-} - --- RFC 6234 6.2 block pipeline --- --- invariant: --- the input bytestring is exactly 1024 bits in length -unsafe_hash_alg :: Registers -> BS.ByteString -> Registers -unsafe_hash_alg rs bs = block_hash rs (prepare_schedule (unsafe_parse bs)) - --- register concatenation -cat :: Registers -> BS.ByteString -cat Registers {..} = to_strict_small $ - BSB.word64BE h0 <> BSB.word64BE h1 <> BSB.word64BE h2 <> BSB.word64BE h3 - <> BSB.word64BE h4 <> BSB.word64BE h5 <> BSB.word64BE h6 <> BSB.word64BE h7 +-- hash ---------------------------------------------------------------------- -- | Compute a condensed representation of a strict bytestring via -- SHA-512. @@ -516,52 +46,31 @@ cat Registers {..} = to_strict_small $ -- >>> hash "strict bytestring input" -- "<strict 512-bit message digest>" hash :: BS.ByteString -> BS.ByteString -hash bs = cat (go iv (pad bs)) where - -- proof that 'go' always terminates safely: - -- - -- let b = pad bs - -- then length(b) = n * 1024 bits for some n >= 0 (1) - go :: Registers -> BS.ByteString -> Registers - go !acc b - -- if n == 0, then 'go' terminates safely (2) - | BS.null b = acc - -- if n > 0, then - -- - -- let (c, r) = unsafe_splitAt 128 b - -- then length(c) == 1024 bits by (1) - -- length(r) == m * 1024 bits for some m >= 0 by (1) - -- - -- note 'unsafe_hash_alg' terminates safely for bytestring (3) - -- input of exactly 1024 bits in length - -- - -- length(c) == 1024 - -- => 'unsafe_hash_alg' terminates safely by (3) - -- => 'go' terminates safely (4) - -- length(r) == m * 1024 bits for m >= 0 - -- => next invocation of 'go' terminates safely by (2), (4) - -- - -- then by induction, 'go' always terminates safely (QED) - | otherwise = case unsafe_splitAt 128 b of - SSPair c r -> go (unsafe_hash_alg acc c) r - --- | Compute a condensed representation of a lazy bytestring via --- SHA-512. --- --- The 512-bit output digest is returned as a strict bytestring. --- --- >>> hash_lazy "lazy bytestring input" --- "<strict 512-bit message digest>" -hash_lazy :: BL.ByteString -> BS.ByteString -hash_lazy bl = cat (go iv (pad_lazy bl)) where - -- proof of safety proceeds analogously - go :: Registers -> BL.ByteString -> Registers - go !acc bs - | BL.null bs = acc - | otherwise = case splitAt128 bs of - SLPair c r -> go (unsafe_hash_alg acc c) r - --- HMAC ----------------------------------------------------------------------- --- https://datatracker.ietf.org/doc/html/rfc2104#section-2 +hash m + | sha512_arm_available = hash_arm m + | otherwise = cat (process m) + +-- process a message, given the specified iv +process_with :: Registers -> Word64 -> BS.ByteString -> Registers +process_with acc0 el m@(BI.PS _ _ l) = finalize (go acc0 0) where + go !acc !j + | j + 128 <= l = go (block_hash acc (prepare_schedule (parse_block m j))) + (j + 128) + | otherwise = acc + + finalize !acc + | len < 112 = block_hash acc (prepare_schedule (parse_block padded 0)) + | otherwise = block_hash + (block_hash acc (prepare_schedule (parse_block padded 0))) + (prepare_schedule (parse_block padded 128)) + where + !remaining@(BI.PS _ _ len) = BU.unsafeDrop (l - l `rem` 128) m + !padded = unsafe_padding remaining (el + fi l) + +process :: BS.ByteString -> Registers +process = process_with iv 0 + +-- hmac ---------------------------------------------------------------------- data KeyAndLen = KeyAndLen {-# UNPACK #-} !BS.ByteString @@ -573,7 +82,7 @@ data KeyAndLen = KeyAndLen -- The 512-bit MAC is returned as a strict bytestring. -- -- Per RFC 2104, the key /should/ be a minimum of 64 bytes long. Keys --- exceeding 1024 bytes in length will first be hashed (via SHA-512). +-- exceeding 128 bytes in length will first be hashed (via SHA-512). -- -- >>> hmac "strict bytestring key" "strict bytestring input" -- "<strict 512-bit MAC>" @@ -581,43 +90,18 @@ hmac :: BS.ByteString -- ^ key -> BS.ByteString -- ^ text -> BS.ByteString -hmac mk@(BI.PS _ _ l) text = - let step1 = k <> BS.replicate (128 - lk) 0x00 - step2 = BS.map (B.xor 0x36) step1 - step3 = step2 <> text - step4 = hash step3 - step5 = BS.map (B.xor 0x5C) step1 - step6 = step5 <> step4 - in hash step6 - where - !(KeyAndLen k lk) - | l > 128 = KeyAndLen (hash mk) 64 - | otherwise = KeyAndLen mk l - --- | Produce a message authentication code for a lazy bytestring, based --- on the provided (strict, bytestring) key, via SHA-512. --- --- The 512-bit MAC is returned as a strict bytestring. --- --- Per RFC 2104, the key /should/ be a minimum of 64 bytes long. Keys --- exceeding 1024 bytes in length will first be hashed (via SHA-512). --- --- >>> hmac_lazy "strict bytestring key" "lazy bytestring input" --- "<strict 512-bit MAC>" -hmac_lazy - :: BS.ByteString -- ^ key - -> BL.ByteString -- ^ text - -> BS.ByteString -hmac_lazy mk@(BI.PS _ _ l) text = - let step1 = k <> BS.replicate (128 - lk) 0x00 - step2 = BS.map (B.xor 0x36) step1 - step3 = BL.fromStrict step2 <> text - step4 = hash_lazy step3 - step5 = BS.map (B.xor 0x5C) step1 - step6 = step5 <> step4 - in hash step6 +hmac mk@(BI.PS _ _ l) text + | sha512_arm_available = + let !inner = hash_arm_with ipad 128 text + in hash_arm (opad <> inner) + | otherwise = + let !ipad_state = block_hash iv (prepare_schedule (parse_block ipad 0)) + !inner = cat (process_with ipad_state 128 text) + in hash (opad <> inner) where + !step1 = k <> BS.replicate (128 - lk) 0x00 + !ipad = BS.map (B.xor 0x36) step1 + !opad = BS.map (B.xor 0x5C) step1 !(KeyAndLen k lk) | l > 128 = KeyAndLen (hash mk) 64 | otherwise = KeyAndLen mk l - diff --git a/lib/Crypto/Hash/SHA512/Arm.hs b/lib/Crypto/Hash/SHA512/Arm.hs @@ -0,0 +1,126 @@ +{-# LANGUAGE BangPatterns #-} + +-- | +-- Module: Crypto.Hash.SHA512.Arm +-- Copyright: (c) 2024 Jared Tobin +-- License: MIT +-- Maintainer: Jared Tobin <jared@ppad.tech> +-- +-- ARM crypto extension support for SHA-512. + +module Crypto.Hash.SHA512.Arm ( + sha512_arm_available + , hash_arm + , hash_arm_with + ) where + +import Control.Monad (unless, when) +import qualified Data.Bits as B +import qualified Data.ByteString as BS +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Unsafe as BU +import Data.Word (Word8, Word64) +import Foreign.Marshal.Alloc (allocaBytes) +import Foreign.Ptr (Ptr, plusPtr) +import Foreign.Storable (poke, peek) +import Crypto.Hash.SHA512.Internal (unsafe_padding) +import System.IO.Unsafe (unsafePerformIO) + +-- ffi ----------------------------------------------------------------------- + +foreign import ccall unsafe "sha512_block_arm" + c_sha512_block :: Ptr Word64 -> Ptr Word8 -> IO () + +foreign import ccall unsafe "sha512_arm_available" + c_sha512_arm_available :: IO Int + +-- utilities ----------------------------------------------------------------- + +fi :: (Integral a, Num b) => a -> b +fi = fromIntegral +{-# INLINE fi #-} + +-- api ----------------------------------------------------------------------- + +sha512_arm_available :: Bool +sha512_arm_available = unsafePerformIO c_sha512_arm_available /= 0 +{-# NOINLINE sha512_arm_available #-} + +hash_arm :: BS.ByteString -> BS.ByteString +hash_arm = hash_arm_with mempty 0 + +-- | Hash with optional 128-byte prefix and extra length for padding. +hash_arm_with + :: BS.ByteString -- ^ optional 128-byte prefix (or empty) + -> Word64 -- ^ extra length to add for padding + -> BS.ByteString -- ^ message + -> BS.ByteString +hash_arm_with prefix el m@(BI.PS fp off l) = unsafePerformIO $ + allocaBytes 64 $ \state -> do + poke_iv state + -- process prefix block if provided + unless (BS.null prefix) $ do + let BI.PS pfp poff _ = prefix + BI.unsafeWithForeignPtr pfp $ \src -> + c_sha512_block state (src `plusPtr` poff) + + go state 0 + + let !remaining@(BI.PS _ _ rlen) = BU.unsafeDrop (l - l `rem` 128) m + BI.PS padfp padoff _ = unsafe_padding remaining (el + fi l) + BI.unsafeWithForeignPtr padfp $ \src -> do + c_sha512_block state (src `plusPtr` padoff) + when (rlen >= 112) $ + c_sha512_block state (src `plusPtr` (padoff + 128)) + + read_state state + where + go !state !j + | j + 128 <= l = do + BI.unsafeWithForeignPtr fp $ \src -> + c_sha512_block state (src `plusPtr` (off + j)) + go state (j + 128) + | otherwise = pure () + +-- arm helpers --------------------------------------------------------------- + +poke_iv :: Ptr Word64 -> IO () +poke_iv !state = do + poke state (0x6a09e667f3bcc908 :: Word64) + poke (state `plusPtr` 8) (0xbb67ae8584caa73b :: Word64) + poke (state `plusPtr` 16) (0x3c6ef372fe94f82b :: Word64) + poke (state `plusPtr` 24) (0xa54ff53a5f1d36f1 :: Word64) + poke (state `plusPtr` 32) (0x510e527fade682d1 :: Word64) + poke (state `plusPtr` 40) (0x9b05688c2b3e6c1f :: Word64) + poke (state `plusPtr` 48) (0x1f83d9abfb41bd6b :: Word64) + poke (state `plusPtr` 56) (0x5be0cd19137e2179 :: Word64) + +read_state :: Ptr Word64 -> IO BS.ByteString +read_state !state = BI.create 64 $ \out -> do + h0 <- peek state :: IO Word64 + h1 <- peek (state `plusPtr` 8) :: IO Word64 + h2 <- peek (state `plusPtr` 16) :: IO Word64 + h3 <- peek (state `plusPtr` 24) :: IO Word64 + h4 <- peek (state `plusPtr` 32) :: IO Word64 + h5 <- peek (state `plusPtr` 40) :: IO Word64 + h6 <- peek (state `plusPtr` 48) :: IO Word64 + h7 <- peek (state `plusPtr` 56) :: IO Word64 + poke_word64be out 0 h0 + poke_word64be out 8 h1 + poke_word64be out 16 h2 + poke_word64be out 24 h3 + poke_word64be out 32 h4 + poke_word64be out 40 h5 + poke_word64be out 48 h6 + poke_word64be out 56 h7 + +poke_word64be :: Ptr Word8 -> Int -> Word64 -> IO () +poke_word64be !p !off !w = do + poke (p `plusPtr` off) (fi (w `B.unsafeShiftR` 56) :: Word8) + poke (p `plusPtr` (off + 1)) (fi (w `B.unsafeShiftR` 48) :: Word8) + poke (p `plusPtr` (off + 2)) (fi (w `B.unsafeShiftR` 40) :: Word8) + poke (p `plusPtr` (off + 3)) (fi (w `B.unsafeShiftR` 32) :: Word8) + poke (p `plusPtr` (off + 4)) (fi (w `B.unsafeShiftR` 24) :: Word8) + poke (p `plusPtr` (off + 5)) (fi (w `B.unsafeShiftR` 16) :: Word8) + poke (p `plusPtr` (off + 6)) (fi (w `B.unsafeShiftR` 8) :: Word8) + poke (p `plusPtr` (off + 7)) (fi w :: Word8) diff --git a/lib/Crypto/Hash/SHA512/Internal.hs b/lib/Crypto/Hash/SHA512/Internal.hs @@ -0,0 +1,428 @@ +{-# OPTIONS_GHC -funbox-small-strict-fields #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE RecordWildCards #-} + +-- | +-- Module: Crypto.Hash.SHA512.Internal +-- Copyright: (c) 2024 Jared Tobin +-- License: MIT +-- Maintainer: Jared Tobin <jared@ppad.tech> +-- +-- SHA-512 internals. + +module Crypto.Hash.SHA512.Internal ( + Registers(..) + , Block(..) + , Schedule(..) + + , iv + , block_hash + , prepare_schedule + , parse_block + , cat + , unsafe_hash_alg + , unsafe_parse + , unsafe_padding + + , WSPair(..) + , unsafe_word64be + , unsafe_parseWsPair + ) where + +import qualified Data.Bits as B +import Data.Bits ((.|.), (.&.)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Builder as BSB +import qualified Data.ByteString.Builder.Extra as BE +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Unsafe as BU +import Data.Word (Word8, Word64) +import Foreign.ForeignPtr (plusForeignPtr) +import Foreign.Marshal.Utils (copyBytes, fillBytes) +import Foreign.Ptr (Ptr, plusPtr) +import Foreign.Storable (poke) + +-- preliminary utils --------------------------------------------------------- + +fi :: (Integral a, Num b) => a -> b +fi = fromIntegral +{-# INLINE fi #-} + +-- parse strict ByteString in BE order to Word64 +-- +-- invariant: +-- the input bytestring is at least 64 bits in length +unsafe_word64be :: BS.ByteString -> Word64 +unsafe_word64be s = + (fi (s `BU.unsafeIndex` 0) `B.unsafeShiftL` 56) .|. + (fi (s `BU.unsafeIndex` 1) `B.unsafeShiftL` 48) .|. + (fi (s `BU.unsafeIndex` 2) `B.unsafeShiftL` 40) .|. + (fi (s `BU.unsafeIndex` 3) `B.unsafeShiftL` 32) .|. + (fi (s `BU.unsafeIndex` 4) `B.unsafeShiftL` 24) .|. + (fi (s `BU.unsafeIndex` 5) `B.unsafeShiftL` 16) .|. + (fi (s `BU.unsafeIndex` 6) `B.unsafeShiftL` 8) .|. + (fi (s `BU.unsafeIndex` 7) ) +{-# INLINE unsafe_word64be #-} + +data WSPair = WSPair {-# UNPACK #-} !Word64 {-# UNPACK #-} !BS.ByteString + +-- variant of Data.ByteString.splitAt that behaves like an incremental +-- Word64 parser +-- +-- invariant: +-- the input bytestring is at least 64 bits in length +unsafe_parseWsPair :: BS.ByteString -> WSPair +unsafe_parseWsPair (BI.BS x l) = + WSPair (unsafe_word64be (BI.BS x 8)) (BI.BS (plusForeignPtr x 8) (l - 8)) +{-# INLINE unsafe_parseWsPair #-} + +-- builder realization strategy + +to_strict_small :: BSB.Builder -> BS.ByteString +to_strict_small = BS.toStrict . BE.toLazyByteStringWith + (BE.safeStrategy 128 BE.smallChunkSize) mempty + +-- functions and constants --------------------------------------------------- +-- https://datatracker.ietf.org/doc/html/rfc6234#section-5.1 + +ch :: Word64 -> Word64 -> Word64 -> Word64 +ch x y z = (x .&. y) `B.xor` (B.complement x .&. z) +{-# INLINE ch #-} + +maj :: Word64 -> Word64 -> Word64 -> Word64 +maj x y z = (x .&. (y .|. z)) .|. (y .&. z) +{-# INLINE maj #-} + +bsig0 :: Word64 -> Word64 +bsig0 x = B.rotateR x 28 `B.xor` B.rotateR x 34 `B.xor` B.rotateR x 39 +{-# INLINE bsig0 #-} + +bsig1 :: Word64 -> Word64 +bsig1 x = B.rotateR x 14 `B.xor` B.rotateR x 18 `B.xor` B.rotateR x 41 +{-# INLINE bsig1 #-} + +ssig0 :: Word64 -> Word64 +ssig0 x = B.rotateR x 1 `B.xor` B.rotateR x 8 `B.xor` B.unsafeShiftR x 7 +{-# INLINE ssig0 #-} + +ssig1 :: Word64 -> Word64 +ssig1 x = B.rotateR x 19 `B.xor` B.rotateR x 61 `B.xor` B.unsafeShiftR x 6 +{-# INLINE ssig1 #-} + +data Schedule = Schedule { + w00 :: !Word64, w01 :: !Word64, w02 :: !Word64, w03 :: !Word64 + , w04 :: !Word64, w05 :: !Word64, w06 :: !Word64, w07 :: !Word64 + , w08 :: !Word64, w09 :: !Word64, w10 :: !Word64, w11 :: !Word64 + , w12 :: !Word64, w13 :: !Word64, w14 :: !Word64, w15 :: !Word64 + , w16 :: !Word64, w17 :: !Word64, w18 :: !Word64, w19 :: !Word64 + , w20 :: !Word64, w21 :: !Word64, w22 :: !Word64, w23 :: !Word64 + , w24 :: !Word64, w25 :: !Word64, w26 :: !Word64, w27 :: !Word64 + , w28 :: !Word64, w29 :: !Word64, w30 :: !Word64, w31 :: !Word64 + , w32 :: !Word64, w33 :: !Word64, w34 :: !Word64, w35 :: !Word64 + , w36 :: !Word64, w37 :: !Word64, w38 :: !Word64, w39 :: !Word64 + , w40 :: !Word64, w41 :: !Word64, w42 :: !Word64, w43 :: !Word64 + , w44 :: !Word64, w45 :: !Word64, w46 :: !Word64, w47 :: !Word64 + , w48 :: !Word64, w49 :: !Word64, w50 :: !Word64, w51 :: !Word64 + , w52 :: !Word64, w53 :: !Word64, w54 :: !Word64, w55 :: !Word64 + , w56 :: !Word64, w57 :: !Word64, w58 :: !Word64, w59 :: !Word64 + , w60 :: !Word64, w61 :: !Word64, w62 :: !Word64, w63 :: !Word64 + , w64 :: !Word64, w65 :: !Word64, w66 :: !Word64, w67 :: !Word64 + , w68 :: !Word64, w69 :: !Word64, w70 :: !Word64, w71 :: !Word64 + , w72 :: !Word64, w73 :: !Word64, w74 :: !Word64, w75 :: !Word64 + , w76 :: !Word64, w77 :: !Word64, w78 :: !Word64, w79 :: !Word64 + } + +-- initialization ------------------------------------------------------------ +-- https://datatracker.ietf.org/doc/html/rfc6234#section-6.1 + +data Registers = Registers { + h0 :: !Word64, h1 :: !Word64, h2 :: !Word64, h3 :: !Word64 + , h4 :: !Word64, h5 :: !Word64, h6 :: !Word64, h7 :: !Word64 + } + +-- first 64 bits of the fractional parts of the square roots of the +-- first eight primes +iv :: Registers +iv = Registers + 0x6a09e667f3bcc908 0xbb67ae8584caa73b 0x3c6ef372fe94f82b 0xa54ff53a5f1d36f1 + 0x510e527fade682d1 0x9b05688c2b3e6c1f 0x1f83d9abfb41bd6b 0x5be0cd19137e2179 + +-- processing ---------------------------------------------------------------- +-- https://datatracker.ietf.org/doc/html/rfc6234#section-6.2 + +data Block = Block { + m00 :: !Word64, m01 :: !Word64, m02 :: !Word64, m03 :: !Word64 + , m04 :: !Word64, m05 :: !Word64, m06 :: !Word64, m07 :: !Word64 + , m08 :: !Word64, m09 :: !Word64, m10 :: !Word64, m11 :: !Word64 + , m12 :: !Word64, m13 :: !Word64, m14 :: !Word64, m15 :: !Word64 + } + +-- given a bytestring and offset, parse word64. length not checked. +word64be :: BS.ByteString -> Int -> Word64 +word64be bs off = + (fi (bs `BU.unsafeIndex` off) `B.unsafeShiftL` 56) .|. + (fi (bs `BU.unsafeIndex` (off + 1)) `B.unsafeShiftL` 48) .|. + (fi (bs `BU.unsafeIndex` (off + 2)) `B.unsafeShiftL` 40) .|. + (fi (bs `BU.unsafeIndex` (off + 3)) `B.unsafeShiftL` 32) .|. + (fi (bs `BU.unsafeIndex` (off + 4)) `B.unsafeShiftL` 24) .|. + (fi (bs `BU.unsafeIndex` (off + 5)) `B.unsafeShiftL` 16) .|. + (fi (bs `BU.unsafeIndex` (off + 6)) `B.unsafeShiftL` 8) .|. + (fi (bs `BU.unsafeIndex` (off + 7))) +{-# INLINE word64be #-} + +-- given a bytestring and block offset, parse block. length not checked. +parse_block :: BS.ByteString -> Int -> Block +parse_block bs m = Block + (word64be bs m) + (word64be bs (m + 8)) + (word64be bs (m + 16)) + (word64be bs (m + 24)) + (word64be bs (m + 32)) + (word64be bs (m + 40)) + (word64be bs (m + 48)) + (word64be bs (m + 56)) + (word64be bs (m + 64)) + (word64be bs (m + 72)) + (word64be bs (m + 80)) + (word64be bs (m + 88)) + (word64be bs (m + 96)) + (word64be bs (m + 104)) + (word64be bs (m + 112)) + (word64be bs (m + 120)) +{-# INLINE parse_block #-} + +-- parse strict bytestring to block +-- +-- invariant: +-- the input bytestring is exactly 1024 bits long +unsafe_parse :: BS.ByteString -> Block +unsafe_parse bs = + let !(WSPair m00 t00) = unsafe_parseWsPair bs + !(WSPair m01 t01) = unsafe_parseWsPair t00 + !(WSPair m02 t02) = unsafe_parseWsPair t01 + !(WSPair m03 t03) = unsafe_parseWsPair t02 + !(WSPair m04 t04) = unsafe_parseWsPair t03 + !(WSPair m05 t05) = unsafe_parseWsPair t04 + !(WSPair m06 t06) = unsafe_parseWsPair t05 + !(WSPair m07 t07) = unsafe_parseWsPair t06 + !(WSPair m08 t08) = unsafe_parseWsPair t07 + !(WSPair m09 t09) = unsafe_parseWsPair t08 + !(WSPair m10 t10) = unsafe_parseWsPair t09 + !(WSPair m11 t11) = unsafe_parseWsPair t10 + !(WSPair m12 t12) = unsafe_parseWsPair t11 + !(WSPair m13 t13) = unsafe_parseWsPair t12 + !(WSPair m14 t14) = unsafe_parseWsPair t13 + !(WSPair m15 _) = unsafe_parseWsPair t14 + in Block {..} + +-- RFC 6234 6.2 step 1 +prepare_schedule :: Block -> Schedule +prepare_schedule Block {..} = Schedule {..} where + w00 = m00; w01 = m01; w02 = m02; w03 = m03 + w04 = m04; w05 = m05; w06 = m06; w07 = m07 + w08 = m08; w09 = m09; w10 = m10; w11 = m11 + w12 = m12; w13 = m13; w14 = m14; w15 = m15 + w16 = ssig1 w14 + w09 + ssig0 w01 + w00 + w17 = ssig1 w15 + w10 + ssig0 w02 + w01 + w18 = ssig1 w16 + w11 + ssig0 w03 + w02 + w19 = ssig1 w17 + w12 + ssig0 w04 + w03 + w20 = ssig1 w18 + w13 + ssig0 w05 + w04 + w21 = ssig1 w19 + w14 + ssig0 w06 + w05 + w22 = ssig1 w20 + w15 + ssig0 w07 + w06 + w23 = ssig1 w21 + w16 + ssig0 w08 + w07 + w24 = ssig1 w22 + w17 + ssig0 w09 + w08 + w25 = ssig1 w23 + w18 + ssig0 w10 + w09 + w26 = ssig1 w24 + w19 + ssig0 w11 + w10 + w27 = ssig1 w25 + w20 + ssig0 w12 + w11 + w28 = ssig1 w26 + w21 + ssig0 w13 + w12 + w29 = ssig1 w27 + w22 + ssig0 w14 + w13 + w30 = ssig1 w28 + w23 + ssig0 w15 + w14 + w31 = ssig1 w29 + w24 + ssig0 w16 + w15 + w32 = ssig1 w30 + w25 + ssig0 w17 + w16 + w33 = ssig1 w31 + w26 + ssig0 w18 + w17 + w34 = ssig1 w32 + w27 + ssig0 w19 + w18 + w35 = ssig1 w33 + w28 + ssig0 w20 + w19 + w36 = ssig1 w34 + w29 + ssig0 w21 + w20 + w37 = ssig1 w35 + w30 + ssig0 w22 + w21 + w38 = ssig1 w36 + w31 + ssig0 w23 + w22 + w39 = ssig1 w37 + w32 + ssig0 w24 + w23 + w40 = ssig1 w38 + w33 + ssig0 w25 + w24 + w41 = ssig1 w39 + w34 + ssig0 w26 + w25 + w42 = ssig1 w40 + w35 + ssig0 w27 + w26 + w43 = ssig1 w41 + w36 + ssig0 w28 + w27 + w44 = ssig1 w42 + w37 + ssig0 w29 + w28 + w45 = ssig1 w43 + w38 + ssig0 w30 + w29 + w46 = ssig1 w44 + w39 + ssig0 w31 + w30 + w47 = ssig1 w45 + w40 + ssig0 w32 + w31 + w48 = ssig1 w46 + w41 + ssig0 w33 + w32 + w49 = ssig1 w47 + w42 + ssig0 w34 + w33 + w50 = ssig1 w48 + w43 + ssig0 w35 + w34 + w51 = ssig1 w49 + w44 + ssig0 w36 + w35 + w52 = ssig1 w50 + w45 + ssig0 w37 + w36 + w53 = ssig1 w51 + w46 + ssig0 w38 + w37 + w54 = ssig1 w52 + w47 + ssig0 w39 + w38 + w55 = ssig1 w53 + w48 + ssig0 w40 + w39 + w56 = ssig1 w54 + w49 + ssig0 w41 + w40 + w57 = ssig1 w55 + w50 + ssig0 w42 + w41 + w58 = ssig1 w56 + w51 + ssig0 w43 + w42 + w59 = ssig1 w57 + w52 + ssig0 w44 + w43 + w60 = ssig1 w58 + w53 + ssig0 w45 + w44 + w61 = ssig1 w59 + w54 + ssig0 w46 + w45 + w62 = ssig1 w60 + w55 + ssig0 w47 + w46 + w63 = ssig1 w61 + w56 + ssig0 w48 + w47 + w64 = ssig1 w62 + w57 + ssig0 w49 + w48 + w65 = ssig1 w63 + w58 + ssig0 w50 + w49 + w66 = ssig1 w64 + w59 + ssig0 w51 + w50 + w67 = ssig1 w65 + w60 + ssig0 w52 + w51 + w68 = ssig1 w66 + w61 + ssig0 w53 + w52 + w69 = ssig1 w67 + w62 + ssig0 w54 + w53 + w70 = ssig1 w68 + w63 + ssig0 w55 + w54 + w71 = ssig1 w69 + w64 + ssig0 w56 + w55 + w72 = ssig1 w70 + w65 + ssig0 w57 + w56 + w73 = ssig1 w71 + w66 + ssig0 w58 + w57 + w74 = ssig1 w72 + w67 + ssig0 w59 + w58 + w75 = ssig1 w73 + w68 + ssig0 w60 + w59 + w76 = ssig1 w74 + w69 + ssig0 w61 + w60 + w77 = ssig1 w75 + w70 + ssig0 w62 + w61 + w78 = ssig1 w76 + w71 + ssig0 w63 + w62 + w79 = ssig1 w77 + w72 + ssig0 w64 + w63 + +-- RFC 6234 6.2 steps 2, 3, 4 +block_hash :: Registers -> Schedule -> Registers +block_hash r00@Registers {..} Schedule {..} = + -- constants are the first 64 bits of the fractional parts of the + -- cube roots of the first eighty prime numbers + let r01 = step r00 0x428a2f98d728ae22 w00 + r02 = step r01 0x7137449123ef65cd w01 + r03 = step r02 0xb5c0fbcfec4d3b2f w02 + r04 = step r03 0xe9b5dba58189dbbc w03 + r05 = step r04 0x3956c25bf348b538 w04 + r06 = step r05 0x59f111f1b605d019 w05 + r07 = step r06 0x923f82a4af194f9b w06 + r08 = step r07 0xab1c5ed5da6d8118 w07 + r09 = step r08 0xd807aa98a3030242 w08 + r10 = step r09 0x12835b0145706fbe w09 + r11 = step r10 0x243185be4ee4b28c w10 + r12 = step r11 0x550c7dc3d5ffb4e2 w11 + r13 = step r12 0x72be5d74f27b896f w12 + r14 = step r13 0x80deb1fe3b1696b1 w13 + r15 = step r14 0x9bdc06a725c71235 w14 + r16 = step r15 0xc19bf174cf692694 w15 + r17 = step r16 0xe49b69c19ef14ad2 w16 + r18 = step r17 0xefbe4786384f25e3 w17 + r19 = step r18 0x0fc19dc68b8cd5b5 w18 + r20 = step r19 0x240ca1cc77ac9c65 w19 + r21 = step r20 0x2de92c6f592b0275 w20 + r22 = step r21 0x4a7484aa6ea6e483 w21 + r23 = step r22 0x5cb0a9dcbd41fbd4 w22 + r24 = step r23 0x76f988da831153b5 w23 + r25 = step r24 0x983e5152ee66dfab w24 + r26 = step r25 0xa831c66d2db43210 w25 + r27 = step r26 0xb00327c898fb213f w26 + r28 = step r27 0xbf597fc7beef0ee4 w27 + r29 = step r28 0xc6e00bf33da88fc2 w28 + r30 = step r29 0xd5a79147930aa725 w29 + r31 = step r30 0x06ca6351e003826f w30 + r32 = step r31 0x142929670a0e6e70 w31 + r33 = step r32 0x27b70a8546d22ffc w32 + r34 = step r33 0x2e1b21385c26c926 w33 + r35 = step r34 0x4d2c6dfc5ac42aed w34 + r36 = step r35 0x53380d139d95b3df w35 + r37 = step r36 0x650a73548baf63de w36 + r38 = step r37 0x766a0abb3c77b2a8 w37 + r39 = step r38 0x81c2c92e47edaee6 w38 + r40 = step r39 0x92722c851482353b w39 + r41 = step r40 0xa2bfe8a14cf10364 w40 + r42 = step r41 0xa81a664bbc423001 w41 + r43 = step r42 0xc24b8b70d0f89791 w42 + r44 = step r43 0xc76c51a30654be30 w43 + r45 = step r44 0xd192e819d6ef5218 w44 + r46 = step r45 0xd69906245565a910 w45 + r47 = step r46 0xf40e35855771202a w46 + r48 = step r47 0x106aa07032bbd1b8 w47 + r49 = step r48 0x19a4c116b8d2d0c8 w48 + r50 = step r49 0x1e376c085141ab53 w49 + r51 = step r50 0x2748774cdf8eeb99 w50 + r52 = step r51 0x34b0bcb5e19b48a8 w51 + r53 = step r52 0x391c0cb3c5c95a63 w52 + r54 = step r53 0x4ed8aa4ae3418acb w53 + r55 = step r54 0x5b9cca4f7763e373 w54 + r56 = step r55 0x682e6ff3d6b2b8a3 w55 + r57 = step r56 0x748f82ee5defb2fc w56 + r58 = step r57 0x78a5636f43172f60 w57 + r59 = step r58 0x84c87814a1f0ab72 w58 + r60 = step r59 0x8cc702081a6439ec w59 + r61 = step r60 0x90befffa23631e28 w60 + r62 = step r61 0xa4506cebde82bde9 w61 + r63 = step r62 0xbef9a3f7b2c67915 w62 + r64 = step r63 0xc67178f2e372532b w63 + r65 = step r64 0xca273eceea26619c w64 + r66 = step r65 0xd186b8c721c0c207 w65 + r67 = step r66 0xeada7dd6cde0eb1e w66 + r68 = step r67 0xf57d4f7fee6ed178 w67 + r69 = step r68 0x06f067aa72176fba w68 + r70 = step r69 0x0a637dc5a2c898a6 w69 + r71 = step r70 0x113f9804bef90dae w70 + r72 = step r71 0x1b710b35131c471b w71 + r73 = step r72 0x28db77f523047d84 w72 + r74 = step r73 0x32caab7b40c72493 w73 + r75 = step r74 0x3c9ebe0a15c9bebc w74 + r76 = step r75 0x431d67c49c100d4c w75 + r77 = step r76 0x4cc5d4becb3e42b6 w76 + r78 = step r77 0x597f299cfc657e2a w77 + r79 = step r78 0x5fcb6fab3ad6faec w78 + r80 = step r79 0x6c44198c4a475817 w79 + !(Registers a b c d e f g h) = r80 + in Registers + (a + h0) (b + h1) (c + h2) (d + h3) + (e + h4) (f + h5) (g + h6) (h + h7) + +step :: Registers -> Word64 -> Word64 -> Registers +step (Registers a b c d e f g h) k w = + let t1 = h + bsig1 e + ch e f g + k + w + t2 = bsig0 a + maj a b c + in Registers (t1 + t2) a b c (d + t1) e f g +{-# INLINE step #-} + +-- RFC 6234 6.2 block pipeline +-- +-- invariant: +-- the input bytestring is exactly 1024 bits in length +unsafe_hash_alg :: Registers -> BS.ByteString -> Registers +unsafe_hash_alg rs bs = block_hash rs (prepare_schedule (unsafe_parse bs)) + +-- register concatenation +cat :: Registers -> BS.ByteString +cat Registers {..} = to_strict_small $ + BSB.word64BE h0 <> BSB.word64BE h1 <> BSB.word64BE h2 <> BSB.word64BE h3 + <> BSB.word64BE h4 <> BSB.word64BE h5 <> BSB.word64BE h6 <> BSB.word64BE h7 + +-- RFC 6234 4.1 message padding +unsafe_padding :: BS.ByteString -> Word64 -> BS.ByteString +unsafe_padding (BI.PS fp off r) len + | r < 112 = BI.unsafeCreate 128 $ \p -> do + BI.unsafeWithForeignPtr fp $ \src -> + copyBytes p (src `plusPtr` off) r + poke (p `plusPtr` r) (0x80 :: Word8) + fillBytes (p `plusPtr` (r + 1)) 0 (111 - r) + poke_word64be (p `plusPtr` 112) 0 + poke_word64be (p `plusPtr` 120) (len * 8) + | otherwise = BI.unsafeCreate 256 $ \p -> do + BI.unsafeWithForeignPtr fp $ \src -> + copyBytes p (src `plusPtr` off) r + poke (p `plusPtr` r) (0x80 :: Word8) + fillBytes (p `plusPtr` (r + 1)) 0 (127 - r) + fillBytes (p `plusPtr` 128) 0 112 + poke_word64be (p `plusPtr` 240) 0 + poke_word64be (p `plusPtr` 248) (len * 8) + where + poke_word64be :: Ptr Word8 -> Word64 -> IO () + poke_word64be p w = do + poke p (fi (w `B.unsafeShiftR` 56) :: Word8) + poke (p `plusPtr` 1) (fi (w `B.unsafeShiftR` 48) :: Word8) + poke (p `plusPtr` 2) (fi (w `B.unsafeShiftR` 40) :: Word8) + poke (p `plusPtr` 3) (fi (w `B.unsafeShiftR` 32) :: Word8) + poke (p `plusPtr` 4) (fi (w `B.unsafeShiftR` 24) :: Word8) + poke (p `plusPtr` 5) (fi (w `B.unsafeShiftR` 16) :: Word8) + poke (p `plusPtr` 6) (fi (w `B.unsafeShiftR` 8) :: Word8) + poke (p `plusPtr` 7) (fi w :: Word8) diff --git a/lib/Crypto/Hash/SHA512/Lazy.hs b/lib/Crypto/Hash/SHA512/Lazy.hs @@ -0,0 +1,182 @@ +{-# OPTIONS_GHC -funbox-small-strict-fields #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ViewPatterns #-} + +-- | +-- Module: Crypto.Hash.SHA512.Lazy +-- Copyright: (c) 2024 Jared Tobin +-- License: MIT +-- Maintainer: Jared Tobin <jared@ppad.tech> +-- +-- Pure SHA-512 and HMAC-SHA512 implementations for lazy ByteStrings, +-- as specified by RFC's +-- [6234](https://datatracker.ietf.org/doc/html/rfc6234) and +-- [2104](https://datatracker.ietf.org/doc/html/rfc2104). + +module Crypto.Hash.SHA512.Lazy ( + -- * SHA-512 message digest functions + hash_lazy + + -- * SHA512-based MAC functions + , hmac_lazy + ) where + +import qualified Data.Bits as B +import qualified Data.ByteString as BS +import qualified Data.ByteString.Builder as BSB +import qualified Data.ByteString.Builder.Extra as BE +import qualified Data.ByteString.Internal as BI +import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString.Lazy.Internal as BLI +import Data.Word (Word64) +import Foreign.ForeignPtr (plusForeignPtr) +import Crypto.Hash.SHA512.Internal + +-- preliminary utils + +-- keystroke saver +fi :: (Integral a, Num b) => a -> b +fi = fromIntegral +{-# INLINE fi #-} + +-- utility types for more efficient ByteString management + +data SSPair = SSPair + {-# UNPACK #-} !BS.ByteString + {-# UNPACK #-} !BS.ByteString + +data SLPair = SLPair {-# UNPACK #-} !BS.ByteString !BL.ByteString + +-- unsafe version of splitAt that does no bounds checking +-- +-- invariant: +-- 0 <= n <= l +unsafe_splitAt :: Int -> BS.ByteString -> SSPair +unsafe_splitAt n (BI.BS x l) = + SSPair (BI.BS x n) (BI.BS (plusForeignPtr x n) (l - n)) + +-- variant of Data.ByteString.Lazy.splitAt that returns the initial +-- component as a strict, unboxed ByteString +splitAt128 :: BL.ByteString -> SLPair +splitAt128 = splitAt' (128 :: Int) where + splitAt' _ BLI.Empty = SLPair mempty BLI.Empty + splitAt' n (BLI.Chunk c@(BI.PS _ _ l) cs) = + if n < l + then + -- n < BS.length c, so unsafe_splitAt is safe + let !(SSPair c0 c1) = unsafe_splitAt n c + in SLPair c0 (BLI.Chunk c1 cs) + else + let SLPair cs' cs'' = splitAt' (n - l) cs + in SLPair (c <> cs') cs'' + +-- builder realization strategies + +to_strict :: BSB.Builder -> BS.ByteString +to_strict = BL.toStrict . BSB.toLazyByteString + +-- message padding and parsing +-- https://datatracker.ietf.org/doc/html/rfc6234#section-4.1 + +-- k such that (l + 1 + k) mod 128 = 112 +sol :: Word64 -> Word64 +sol l = + let r = 112 - fi l `rem` 128 - 1 :: Integer -- fi prevents underflow + in fi (if r < 0 then r + 128 else r) + +-- RFC 6234 4.1 (lazy) +pad_lazy :: BL.ByteString -> BL.ByteString +pad_lazy (BL.toChunks -> m) = BL.fromChunks (walk 0 m) where + walk !l bs = case bs of + (c:cs) -> c : walk (l + fi (BS.length c)) cs + [] -> padding l (sol l) (BSB.word8 0x80) + + padding l k bs + | k == 0 = + pure + . to_strict + -- more efficient for small builder + $ bs <> BSB.word64BE 0x00 <> BSB.word64BE (l * 8) + | otherwise = + let nacc = bs <> BSB.word8 0x00 + in padding l (pred k) nacc + +-- | Compute a condensed representation of a lazy bytestring via +-- SHA-512. +-- +-- The 512-bit output digest is returned as a strict bytestring. +-- +-- >>> hash_lazy "lazy bytestring input" +-- "<strict 512-bit message digest>" +hash_lazy :: BL.ByteString -> BS.ByteString +hash_lazy bl = cat (go iv (pad_lazy bl)) where + go :: Registers -> BL.ByteString -> Registers + go !acc bs + | BL.null bs = acc + | otherwise = case splitAt128 bs of + SLPair c r -> go (unsafe_hash_alg acc c) r + +-- HMAC ----------------------------------------------------------------------- +-- https://datatracker.ietf.org/doc/html/rfc2104#section-2 + +data KeyAndLen = KeyAndLen + {-# UNPACK #-} !BS.ByteString + {-# UNPACK #-} !Int + +-- | Produce a message authentication code for a lazy bytestring, based +-- on the provided (strict, bytestring) key, via SHA-512. +-- +-- The 512-bit MAC is returned as a strict bytestring. +-- +-- Per RFC 2104, the key /should/ be a minimum of 64 bytes long. Keys +-- exceeding 128 bytes in length will first be hashed (via SHA-512). +-- +-- >>> hmac_lazy "strict bytestring key" "lazy bytestring input" +-- "<strict 512-bit MAC>" +hmac_lazy + :: BS.ByteString -- ^ key + -> BL.ByteString -- ^ text + -> BS.ByteString +hmac_lazy mk@(BI.PS _ _ l) text = + let step1 = k <> BS.replicate (128 - lk) 0x00 + step2 = BS.map (B.xor 0x36) step1 + step3 = BL.fromStrict step2 <> text + step4 = hash_lazy step3 + step5 = BS.map (B.xor 0x5C) step1 + step6 = step5 <> step4 + in hash step6 + where + hash bs = cat (go iv (pad bs)) where + go :: Registers -> BS.ByteString -> Registers + go !acc b + | BS.null b = acc + | otherwise = case unsafe_splitAt 128 b of + SSPair c r -> go (unsafe_hash_alg acc c) r + + pad m@(BI.PS _ _ (fi -> len)) + | len < 256 = to_strict_small padded + | otherwise = to_strict padded + where + padded = BSB.byteString m + <> fill (sol len) (BSB.word8 0x80) + <> BSB.word64BE 0x00 + <> BSB.word64BE (len * 8) + + to_strict_small = BL.toStrict . BE.toLazyByteStringWith + (BE.safeStrategy 256 BE.smallChunkSize) mempty + + fill j !acc + | j `rem` 8 == 0 = loop64 j acc + | otherwise = loop8 j acc + + loop64 j !acc + | j == 0 = acc + | otherwise = loop64 (j - 8) (acc <> BSB.word64BE 0x00) + + loop8 j !acc + | j == 0 = acc + | otherwise = loop8 (pred j) (acc <> BSB.word8 0x00) + + !(KeyAndLen k lk) + | l > 128 = KeyAndLen (hash mk) 64 + | otherwise = KeyAndLen mk l diff --git a/ppad-sha512.cabal b/ppad-sha512.cabal @@ -32,9 +32,16 @@ library ghc-options: -fllvm -O2 exposed-modules: Crypto.Hash.SHA512 + Crypto.Hash.SHA512.Arm + Crypto.Hash.SHA512.Internal + Crypto.Hash.SHA512.Lazy build-depends: base >= 4.9 && < 5 , bytestring >= 0.9 && < 0.13 + c-sources: + cbits/sha512_arm.c + if arch(aarch64) + cc-options: -march=armv8.2-a+sha3 test-suite sha512-tests type: exitcode-stdio-1.0