commit e6a960955e8b67215ac3ee67446d468814ba9cad
parent 8b72bcf0deff5a29a7259b13456115ff6a23a254
Author: Jared Tobin <jared@jtobin.io>
Date: Wed, 29 Jan 2025 20:57:58 +0400
lib: wip optimization snapshot, <10ns div diff
Diffstat:
3 files changed, 47 insertions(+), 27 deletions(-)
diff --git a/README.md b/README.md
@@ -95,18 +95,18 @@ Current benchmark figures on my mid-2020 MacBook Air look like (use
variance introduced by outliers: 54% (severely inflated)
benchmarking division/div
- time 90.87 ns (89.74 ns .. 92.07 ns)
- 0.999 R² (0.999 R² .. 0.999 R²)
- mean 91.45 ns (90.43 ns .. 92.55 ns)
- std dev 3.503 ns (2.899 ns .. 4.626 ns)
- variance introduced by outliers: 59% (severely inflated)
+ time 81.94 ns (81.11 ns .. 82.84 ns)
+ 0.999 R² (0.999 R² .. 1.000 R²)
+ mean 81.97 ns (81.43 ns .. 82.89 ns)
+ std dev 2.422 ns (1.696 ns .. 3.578 ns)
+ variance introduced by outliers: 46% (moderately inflated)
benchmarking division/div (baseline)
- time 77.31 ns (76.25 ns .. 78.93 ns)
- 0.998 R² (0.997 R² .. 0.999 R²)
- mean 78.42 ns (77.56 ns .. 79.76 ns)
- std dev 3.492 ns (2.790 ns .. 4.688 ns)
- variance introduced by outliers: 66% (severely inflated)
+ time 72.08 ns (71.26 ns .. 72.89 ns)
+ 0.999 R² (0.999 R² .. 0.999 R²)
+ mean 72.35 ns (71.52 ns .. 73.15 ns)
+ std dev 2.566 ns (2.203 ns .. 3.044 ns)
+ variance introduced by outliers: 55% (severely inflated)
benchmarking division/mod
time 106.6 ns (105.7 ns .. 107.5 ns)
diff --git a/bench/Main.hs b/bench/Main.hs
@@ -42,9 +42,9 @@ division = bgroup "division" [
main :: IO ()
main = defaultMain [
- add_sub
- , multiplication
- , division
+ -- add_sub
+ --, multiplication
+ division
]
-- addition and subtraction ---------------------------------------------------
diff --git a/lib/Data/Word/Extended.hs b/lib/Data/Word/Extended.hs
@@ -61,7 +61,6 @@ module Data.Word.Extended (
) where
import Control.DeepSeq
-import Control.Monad (void)
import Control.Monad.Primitive
import Control.Monad.ST
import Data.Bits ((.|.), (.&.), (.<<.), (.>>.), (.^.))
@@ -765,6 +764,7 @@ normalize_divisor (Word256 d0 d1 d2 d3) = do
dn_0 <- norm (dlen - 1) d_last
d_final <- PA.unsafeFreezePrimArray dn
pure (d_final, shift, dn_0)
+{-# INLINE normalize_divisor #-}
quotrem
:: PrimMonad m
@@ -827,19 +827,21 @@ quotrem quo u d = do
!un_0 <- PA.readPrimArray u 0
unn_rem 0 un_0
+{-# INLINE quotrem #-}
quot
:: PrimMonad m
- => PA.MutablePrimArray (PrimState m) Word64 -- quotient (potentially large)
+ => PA.MutablePrimArray (PrimState m) Word64 -- quotient
-> PA.MutablePrimArray (PrimState m) Word64 -- unnormalized dividend
-> Word256 -- unnormalized divisor
- -> m ()
+ -> m Int -- length of quotient
quot quo u d = do
-- normalize divisor
!(dn, shift, dn_0) <- normalize_divisor d
let !dlen = PA.sizeofPrimArray dn
-- get size of normalized dividend
+ -- XX extract this
!lu <- PA.getSizeofMutablePrimArray u
!ulen <- let loop !j
| j < 0 = pure 0
@@ -848,9 +850,10 @@ quot quo u d = do
if uj /= 0 then pure (j + 1) else loop (j - 1)
in loop (lu - 2) -- don't touch the uninitialized word
if ulen < dlen
- then pure ()
+ then pure 0
else do
-- normalize dividend
+ -- XX extract this
u_hi <- PA.readPrimArray u (ulen - 1)
PA.writePrimArray u ulen (u_hi .>>. (64 - shift))
let normalize_u !j !uj
@@ -866,9 +869,12 @@ quot quo u d = do
then do
-- normalized divisor is small
!un <- PA.unsafeFreezePrimArray u
- void (quotrem_by1 quo un dn_0)
- else
+ _ <- quotrem_by1 quo un dn_0
+ pure ulen
+ else do
quotrem_knuth quo u (ulen + 1) dn
+ pure (ulen + 1 - dlen)
+{-# INLINE quot #-}
div :: Word256 -> Word256 -> Word256
div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _)
@@ -878,7 +884,6 @@ div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _)
| otherwise = runST $ do
-- allocate quotient
quo <- PA.newPrimArray 4
- PA.setPrimArray quo 0 4 0
-- allocate dividend, leaving enough space for normalization
u_arr <- PA.newPrimArray 5
PA.writePrimArray u_arr 0 u0
@@ -886,12 +891,27 @@ div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _)
PA.writePrimArray u_arr 2 u2
PA.writePrimArray u_arr 3 u3
-- last index of u_hot intentionally unset
- quot quo u_arr d
- q0 <- PA.readPrimArray quo 0
- q1 <- PA.readPrimArray quo 1
- q2 <- PA.readPrimArray quo 2
- q3 <- PA.readPrimArray quo 3
- pure (Word256 q0 q1 q2 q3)
+ l <- quot quo u_arr d
+ case l of
+ 1 -> do
+ q0 <- PA.readPrimArray quo 0
+ pure (Word256 q0 0 0 0)
+ 2 -> do
+ q0 <- PA.readPrimArray quo 0
+ q1 <- PA.readPrimArray quo 1
+ pure (Word256 q0 q1 0 0)
+ 3 -> do
+ q0 <- PA.readPrimArray quo 0
+ q1 <- PA.readPrimArray quo 1
+ q2 <- PA.readPrimArray quo 2
+ pure (Word256 q0 q1 q2 0)
+ 4 -> do
+ q0 <- PA.readPrimArray quo 0
+ q1 <- PA.readPrimArray quo 1
+ q2 <- PA.readPrimArray quo 2
+ q3 <- PA.readPrimArray quo 3
+ pure (Word256 q0 q1 q2 q3)
+ _ -> error "ppad-fixed (quot): invalid quotient length"
-- | Modulo operation on 'Word256' values.
--
@@ -905,7 +925,7 @@ mod u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _)
| otherwise = runST $ do
-- allocate quotient
quo <- PA.newPrimArray 4
- PA.setPrimArray quo 0 4 0
+ PA.setPrimArray quo 0 4 0 -- XX avoid
-- allocate dividend, leaving enough space for normalization
u_hot <- PA.newPrimArray 5
PA.writePrimArray u_hot 0 u0