commit de8af403df9657b2055cae09606d85e5c21bc6e4 parent 281117576a3481c12be8601058549730f994bf9c Author: Jared Tobin <jared@jtobin.io> Date: Tue, 21 Jan 2025 22:11:54 +0400 lib: performance tuning Optimizes builders during padding in the strict case. Diffstat:
M | README.md | | | 32 | +++++++++----------------------- |
M | lib/Crypto/Hash/RIPEMD160.hs | | | 82 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ |
2 files changed, 85 insertions(+), 29 deletions(-)
diff --git a/README.md b/README.md @@ -59,32 +59,18 @@ Current benchmark figures on my mid-2020 MacBook Air look like (use ``` benchmarking ppad-ripemd160/RIPEMD160 (32B input)/hash - time 1.115 μs (1.109 μs .. 1.122 μs) - 1.000 R² (1.000 R² .. 1.000 R²) - mean 1.123 μs (1.117 μs .. 1.130 μs) - std dev 19.84 ns (16.75 ns .. 23.55 ns) - variance introduced by outliers: 19% (moderately inflated) - - benchmarking ppad-ripemd160/RIPEMD160 (32B input)/hash_lazy - time 1.072 μs (1.060 μs .. 1.085 μs) + time 786.6 ns (778.0 ns .. 796.7 ns) 0.999 R² (0.999 R² .. 1.000 R²) - mean 1.073 μs (1.066 μs .. 1.082 μs) - std dev 27.29 ns (23.35 ns .. 31.28 ns) - variance introduced by outliers: 33% (moderately inflated) + mean 778.6 ns (775.3 ns .. 784.2 ns) + std dev 13.85 ns (9.858 ns .. 22.05 ns) + variance introduced by outliers: 20% (moderately inflated) benchmarking ppad-ripemd160/HMAC-RIPEMD160 (32B input)/hmac - time 3.941 μs (3.919 μs .. 3.963 μs) - 1.000 R² (0.999 R² .. 1.000 R²) - mean 3.997 μs (3.972 μs .. 4.037 μs) - std dev 111.0 ns (71.80 ns .. 191.1 ns) - variance introduced by outliers: 34% (moderately inflated) - - benchmarking ppad-ripemd160/HMAC-RIPEMD160 (32B input)/hmac_lazy - time 3.944 μs (3.912 μs .. 3.991 μs) - 0.999 R² (0.999 R² .. 1.000 R²) - mean 3.982 μs (3.955 μs .. 4.012 μs) - std dev 96.66 ns (83.81 ns .. 117.3 ns) - variance introduced by outliers: 28% (moderately inflated) + time 2.933 μs (2.906 μs .. 2.974 μs) + 0.999 R² (0.999 R² .. 0.999 R²) + mean 3.002 μs (2.978 μs .. 3.022 μs) + std dev 74.97 ns (62.74 ns .. 89.91 ns) + variance introduced by outliers: 30% (moderately inflated) ``` ## Security diff --git a/lib/Crypto/Hash/RIPEMD160.hs b/lib/Crypto/Hash/RIPEMD160.hs @@ -113,15 +113,85 @@ unsafe_parseWsPair (BI.BS x l) = -- k such that (l + 1 + k) mod 64 = 56 sol :: Word64 -> Word64 sol l = - let r = 56 - fi l `mod` 64 - 1 :: Integer -- fi prevents underflow + let r = 56 - fi l `rem` 64 - 1 :: Integer -- fi prevents underflow in fi (if r < 0 then r + 64 else r) pad :: BS.ByteString -> BS.ByteString -pad m@(BI.PS _ _ (fi -> l)) = BL.toStrict . BSB.toLazyByteString $ padded where - padded = BSB.byteString m <> fill (sol l) (BSB.word8 0x80) - fill j !acc - | j == 0 = acc <> BSB.word64LE (l * 8) - | otherwise = fill (pred j) (acc <> BSB.word8 0x00) +pad m@(BI.PS _ _ (fi -> l)) = + BL.toStrict . BE.toLazyByteStringWith + (BE.safeStrategy 128 BE.smallChunkSize) mempty $ padded + where + padded = BSB.byteString m + <> fill (sol l) (BSB.word8 0x80) + <> BSB.word64LE (l * 8) + + fill j !acc + | j `rem` 8 == 0 = + loop64 j acc + | (j - 7) `rem` 8 == 0 = + loop64 (j - 7) acc + <> BSB.word32LE 0x00 + <> BSB.word16LE 0x00 + <> BSB.word8 0x00 + | (j - 6) `rem` 8 == 0 = + loop64 (j - 6) acc + <> BSB.word32LE 0x00 + <> BSB.word16LE 0x00 + | (j - 5) `rem` 8 == 0 = + loop64 (j - 5) acc + <> BSB.word32LE 0x00 + <> BSB.word8 0x00 + | (j - 4) `rem` 8 == 0 = + loop64 (j - 4) acc + <> BSB.word32LE 0x00 + | (j - 3) `rem` 8 == 0 = + loop64 (j - 3) acc + <> BSB.word16LE 0x00 + <> BSB.word8 0x00 + | (j - 2) `rem` 8 == 0 = + loop64 (j - 2) acc + <> BSB.word16LE 0x00 + | (j - 1) `rem` 8 == 0 = + loop64 (j - 1) acc + <> BSB.word8 0x00 + + | j `rem` 4 == 0 = + loop32 j acc + | (j - 3) `rem` 4 == 0 = + loop32 (j - 3) acc + <> BSB.word16LE 0x00 + <> BSB.word8 0x00 + | (j - 2) `rem` 4 == 0 = + loop32 (j - 2) acc + <> BSB.word16LE 0x00 + | (j - 1) `rem` 4 == 0 = + loop32 (j - 1) acc + <> BSB.word8 0x00 + + | j `rem` 2 == 0 = + loop16 j acc + | (j - 1) `rem` 2 == 0 = + loop16 (j - 1) acc + <> BSB.word8 0x00 + + | otherwise = + loop8 j acc + + loop64 j !acc + | j == 0 = acc + | otherwise = loop64 (j - 8) (acc <> BSB.word64LE 0x00) + + loop32 j !acc + | j == 0 = acc + | otherwise = loop32 (j - 4) (acc <> BSB.word32LE 0x00) + + loop16 j !acc + | j == 0 = acc + | otherwise = loop16 (j - 2) (acc <> BSB.word16LE 0x00) + + loop8 j !acc + | j == 0 = acc + | otherwise = loop8 (pred j) (acc <> BSB.word8 0x00) pad_lazy :: BL.ByteString -> BL.ByteString pad_lazy (BL.toChunks -> m) = BL.fromChunks (walk 0 m) where