commit 04aee7f3f5781614832f33c29b4c34a55b2bbb36
parent 83463c2f9fd6779eeac6649b0df72a459372197f
Author: Jared Tobin <jared@jtobin.io>
Date: Sat, 14 Sep 2024 00:35:26 +0400
meta: update readme with performance notes
Diffstat:
4 files changed, 93 insertions(+), 54 deletions(-)
diff --git a/README.md b/README.md
@@ -51,76 +51,112 @@ Haddocks (API documentation, etc.) are hosted at
## Performance
-The eventual aim is best-in-class performance for pure, highly-auditable
-Haskell code. But let's keep it as an eventual goal.
+The aim is best-in-class performance for pure, highly-auditable Haskell
+code.
Current benchmark figures look like (use `cabal bench` to run the
benchmark suite):
```
benchmarking ppad-sha256/SHA256 (32B input)/hash
- time 2.684 μs (2.658 μs .. 2.714 μs)
- 0.999 R² (0.999 R² .. 1.000 R²)
- mean 2.689 μs (2.674 μs .. 2.706 μs)
- std dev 55.18 ns (44.66 ns .. 66.35 ns)
- variance introduced by outliers: 22% (moderately inflated)
+ time 1.898 μs (1.858 μs .. 1.941 μs)
+ 0.997 R² (0.996 R² .. 0.999 R²)
+ mean 1.874 μs (1.856 μs .. 1.902 μs)
+ std dev 75.90 ns (60.30 ns .. 101.8 ns)
+ variance introduced by outliers: 55% (severely inflated)
benchmarking ppad-sha256/SHA256 (32B input)/hash_lazy
- time 2.746 μs (2.712 μs .. 2.786 μs)
- 0.999 R² (0.998 R² .. 1.000 R²)
- mean 2.747 μs (2.720 μs .. 2.784 μs)
- std dev 101.1 ns (73.17 ns .. 144.1 ns)
- variance introduced by outliers: 49% (moderately inflated)
+ time 1.930 μs (1.891 μs .. 1.967 μs)
+ 0.998 R² (0.997 R² .. 0.999 R²)
+ mean 1.912 μs (1.885 μs .. 1.947 μs)
+ std dev 104.6 ns (77.34 ns .. 162.6 ns)
+ variance introduced by outliers: 69% (severely inflated)
benchmarking ppad-sha256/HMAC-SHA256 (32B input)/hmac
- time 10.30 μs (10.18 μs .. 10.48 μs)
- 0.997 R² (0.996 R² .. 0.998 R²)
- mean 10.68 μs (10.48 μs .. 10.92 μs)
- std dev 720.5 ns (603.8 ns .. 874.2 ns)
- variance introduced by outliers: 74% (severely inflated)
+ time 7.287 μs (7.128 μs .. 7.424 μs)
+ 0.996 R² (0.995 R² .. 0.998 R²)
+ mean 7.272 μs (7.115 μs .. 7.455 μs)
+ std dev 565.2 ns (490.9 ns .. 689.7 ns)
+ variance introduced by outliers: 80% (severely inflated)
benchmarking ppad-sha256/HMAC-SHA256 (32B input)/hmac_lazy
- time 10.58 μs (10.36 μs .. 10.85 μs)
- 0.996 R² (0.991 R² .. 0.998 R²)
- mean 10.72 μs (10.56 μs .. 10.93 μs)
- std dev 634.4 ns (523.1 ns .. 868.8 ns)
- variance introduced by outliers: 68% (severely inflated)
+ time 7.231 μs (7.102 μs .. 7.393 μs)
+ 0.996 R² (0.994 R² .. 0.998 R²)
+ mean 7.436 μs (7.290 μs .. 7.621 μs)
+ std dev 559.2 ns (446.3 ns .. 732.7 ns)
+ variance introduced by outliers: 78% (severely inflated)
```
-When testing `hash_lazy` on a 1GB input, we get a profile like the
+Compare this to Hackage's famous SHA package:
+
+```
+ benchmarking ppad-sha256/SHA256 (32B input)/SHA.sha256
+ time 2.929 μs (2.871 μs .. 2.995 μs)
+ 0.997 R² (0.995 R² .. 0.998 R²)
+ mean 2.879 μs (2.833 μs .. 2.938 μs)
+ std dev 170.4 ns (130.4 ns .. 258.9 ns)
+ variance introduced by outliers: 71% (severely inflated)
+
+ benchmarking ppad-sha256/HMAC-SHA256 (32B input)/SHA.hmacSha256
+ time 11.42 μs (11.09 μs .. 11.80 μs)
+ 0.994 R² (0.992 R² .. 0.997 R²)
+ mean 11.36 μs (11.09 μs .. 11.61 μs)
+ std dev 903.5 ns (766.5 ns .. 1.057 μs)
+ variance introduced by outliers: 79% (severely inflated)
+```
+
+When testing `hash_lazy` on a 1GB input, we get statistics like the
following:
```
- COST CENTRE %time %alloc
-
- Crypto.Hash.SHA256.block_hash 72.8 4.9
- Crypto.Hash.SHA256.prepare_schedule 15.9 32.3
- Crypto.Hash.SHA256.blocks_lazy 3.7 37.2
- Crypto.Hash.SHA256.parse 3.6 14.7
- Crypto.Hash.SHA256.hash_alg 2.1 2.9
- hash 1.3 8.0
+ 2,310,899,616 bytes allocated in the heap
+ 93,800 bytes copied during GC
+ 78,912 bytes maximum residency (2 sample(s))
+ 35,776 bytes maximum slop
+ 10 MiB total memory in use (0 MiB lost due to fragmentation)
+
+ Tot time (elapsed) Avg pause Max pause
+ Gen 0 295 colls, 0 par 0.007s 0.008s 0.0000s 0.0001s
+ Gen 1 2 colls, 0 par 0.000s 0.001s 0.0004s 0.0004s
+
+ INIT time 0.003s ( 0.003s elapsed)
+ MUT time 22.205s ( 22.260s elapsed)
+ GC time 0.007s ( 0.009s elapsed)
+ EXIT time 0.000s ( 0.001s elapsed)
+ Total time 22.216s ( 22.273s elapsed)
+
+ %GC time 0.0% (0.0% elapsed)
+
+ Alloc rate 104,073,382 bytes per MUT second
+
+ Productivity 100.0% of total user, 99.9% of total elapsed
```
-The overwhelming majority of time is spent in `block_hash`, i.e. steps
-2, 3 and 4 of RFC 6234's section 6.2, which is a good target for
-optimisation.
-
-Almost all allocation can be eliminated via the use of 1) better
-bytestring management, and 2) unlifted types & unboxed tuples (the
-internal `Schedule` type, for example, is a record type of sixty-four
-Word32's, which can be replaced by an unboxed 64-tuple, the maximum
-tuple size supported by GHC).
-
-More care with bytestrings reduces the majority. The use of
-Data.ByteString.Lazy.splitAt is very problematic, as it is neither
-O(1) in time nor space as is its strict cousin. The use of a custom
-splitAt function that returns a (StrictByteString, LazyByteString) pair
-decreases allocation substantially, as do similar strategies (e.g.
-careful use of a custom Data.ByteString.splitAt that returns a strict,
-unboxed pair).
-
-None of these optimisations actually improve wall-clock performance, so
-they are left unimplemented for the time being.
+SHA.sha256 gets more like:
+
+```
+ 74,403,596,936 bytes allocated in the heap
+ 12,971,992 bytes copied during GC
+ 79,176 bytes maximum residency (2 sample(s))
+ 35,512 bytes maximum slop
+ 6 MiB total memory in use (0 MiB lost due to fragmentation)
+
+ Tot time (elapsed) Avg pause Max pause
+ Gen 0 17883 colls, 0 par 0.103s 0.148s 0.0000s 0.0001s
+ Gen 1 2 colls, 0 par 0.000s 0.000s 0.0002s 0.0003s
+
+ INIT time 0.006s ( 0.006s elapsed)
+ MUT time 32.367s ( 32.408s elapsed)
+ GC time 0.104s ( 0.149s elapsed)
+ EXIT time 0.000s ( 0.001s elapsed)
+ Total time 32.477s ( 32.563s elapsed)
+
+ %GC time 0.0% (0.0% elapsed)
+
+ Alloc rate 2,298,740,250 bytes per MUT second
+
+ Productivity 99.7% of total user, 99.5% of total elapsed
+```
## Security
diff --git a/bench/Main.hs b/bench/Main.hs
@@ -24,6 +24,7 @@ suite = env setup $ \ ~(bs, bl) ->
, bgroup "HMAC-SHA256 (32B input)" [
bench "hmac" $ whnf (SHA256.hmac "key") bs
, bench "hmac_lazy" $ whnf (SHA256.hmac_lazy "key") bl
+ , bench "SHA.hmacSha256" $ whnf (SHA.hmacSha256 "key") bl
]
]
where
diff --git a/ppad-sha256.cabal b/ppad-sha256.cabal
@@ -72,4 +72,5 @@ executable hash-large
, base16-bytestring
, bytestring
, ppad-sha256
+ , SHA
diff --git a/src/HashLarge.hs b/src/HashLarge.hs
@@ -6,6 +6,7 @@ import qualified Crypto.Hash.SHA256 as SHA256
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Builder as BSB
import qualified Data.ByteString.Base16 as B16
+import qualified Data.Digest.Pure.SHA as SHA
import System.Environment
main :: IO ()
@@ -20,12 +21,12 @@ main = do
hash :: IO ()
hash = do
- input <- BL.readFile "ppad-sha256-large.dat"
- let digest = B16.encode $ SHA256.hash_lazy input
+ input <- BL.readFile "ppad-sha256-hash-large.dat"
+ let digest = SHA.sha256 input -- B16.encode $ SHA256.hash_lazy input
print digest
make :: IO ()
-make = BL.writeFile "ppad-sha256-large.dat" big_input where
+make = BL.writeFile "ppad-sha256-hash-large.dat" big_input where
big_input :: BL.ByteString
big_input = go (16777216 :: Int) mempty where
go j acc