fixed

Pure Haskell large fixed-width integers.
git clone git://git.ppad.tech/fixed.git
Log | Files | Refs | README | LICENSE

commit e6a960955e8b67215ac3ee67446d468814ba9cad
parent 8b72bcf0deff5a29a7259b13456115ff6a23a254
Author: Jared Tobin <jared@jtobin.io>
Date:   Wed, 29 Jan 2025 20:57:58 +0400

lib: wip optimization snapshot, <10ns div diff

Diffstat:
MREADME.md | 20++++++++++----------
Mbench/Main.hs | 6+++---
Mlib/Data/Word/Extended.hs | 48++++++++++++++++++++++++++++++++++--------------
3 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md @@ -95,18 +95,18 @@ Current benchmark figures on my mid-2020 MacBook Air look like (use variance introduced by outliers: 54% (severely inflated) benchmarking division/div - time 90.87 ns (89.74 ns .. 92.07 ns) - 0.999 R² (0.999 R² .. 0.999 R²) - mean 91.45 ns (90.43 ns .. 92.55 ns) - std dev 3.503 ns (2.899 ns .. 4.626 ns) - variance introduced by outliers: 59% (severely inflated) + time 81.94 ns (81.11 ns .. 82.84 ns) + 0.999 R² (0.999 R² .. 1.000 R²) + mean 81.97 ns (81.43 ns .. 82.89 ns) + std dev 2.422 ns (1.696 ns .. 3.578 ns) + variance introduced by outliers: 46% (moderately inflated) benchmarking division/div (baseline) - time 77.31 ns (76.25 ns .. 78.93 ns) - 0.998 R² (0.997 R² .. 0.999 R²) - mean 78.42 ns (77.56 ns .. 79.76 ns) - std dev 3.492 ns (2.790 ns .. 4.688 ns) - variance introduced by outliers: 66% (severely inflated) + time 72.08 ns (71.26 ns .. 72.89 ns) + 0.999 R² (0.999 R² .. 0.999 R²) + mean 72.35 ns (71.52 ns .. 73.15 ns) + std dev 2.566 ns (2.203 ns .. 3.044 ns) + variance introduced by outliers: 55% (severely inflated) benchmarking division/mod time 106.6 ns (105.7 ns .. 107.5 ns) diff --git a/bench/Main.hs b/bench/Main.hs @@ -42,9 +42,9 @@ division = bgroup "division" [ main :: IO () main = defaultMain [ - add_sub - , multiplication - , division + -- add_sub + --, multiplication + division ] -- addition and subtraction --------------------------------------------------- diff --git a/lib/Data/Word/Extended.hs b/lib/Data/Word/Extended.hs @@ -61,7 +61,6 @@ module Data.Word.Extended ( ) where import Control.DeepSeq -import Control.Monad (void) import Control.Monad.Primitive import Control.Monad.ST import Data.Bits ((.|.), (.&.), (.<<.), (.>>.), (.^.)) @@ -765,6 +764,7 @@ normalize_divisor (Word256 d0 d1 d2 d3) = do dn_0 <- norm (dlen - 1) d_last d_final <- PA.unsafeFreezePrimArray dn pure (d_final, shift, dn_0) +{-# INLINE normalize_divisor #-} quotrem :: PrimMonad m @@ -827,19 +827,21 @@ quotrem quo u d = do !un_0 <- PA.readPrimArray u 0 unn_rem 0 un_0 +{-# INLINE quotrem #-} quot :: PrimMonad m - => PA.MutablePrimArray (PrimState m) Word64 -- quotient (potentially large) + => PA.MutablePrimArray (PrimState m) Word64 -- quotient -> PA.MutablePrimArray (PrimState m) Word64 -- unnormalized dividend -> Word256 -- unnormalized divisor - -> m () + -> m Int -- length of quotient quot quo u d = do -- normalize divisor !(dn, shift, dn_0) <- normalize_divisor d let !dlen = PA.sizeofPrimArray dn -- get size of normalized dividend + -- XX extract this !lu <- PA.getSizeofMutablePrimArray u !ulen <- let loop !j | j < 0 = pure 0 @@ -848,9 +850,10 @@ quot quo u d = do if uj /= 0 then pure (j + 1) else loop (j - 1) in loop (lu - 2) -- don't touch the uninitialized word if ulen < dlen - then pure () + then pure 0 else do -- normalize dividend + -- XX extract this u_hi <- PA.readPrimArray u (ulen - 1) PA.writePrimArray u ulen (u_hi .>>. (64 - shift)) let normalize_u !j !uj @@ -866,9 +869,12 @@ quot quo u d = do then do -- normalized divisor is small !un <- PA.unsafeFreezePrimArray u - void (quotrem_by1 quo un dn_0) - else + _ <- quotrem_by1 quo un dn_0 + pure ulen + else do quotrem_knuth quo u (ulen + 1) dn + pure (ulen + 1 - dlen) +{-# INLINE quot #-} div :: Word256 -> Word256 -> Word256 div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _) @@ -878,7 +884,6 @@ div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _) | otherwise = runST $ do -- allocate quotient quo <- PA.newPrimArray 4 - PA.setPrimArray quo 0 4 0 -- allocate dividend, leaving enough space for normalization u_arr <- PA.newPrimArray 5 PA.writePrimArray u_arr 0 u0 @@ -886,12 +891,27 @@ div u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _) PA.writePrimArray u_arr 2 u2 PA.writePrimArray u_arr 3 u3 -- last index of u_hot intentionally unset - quot quo u_arr d - q0 <- PA.readPrimArray quo 0 - q1 <- PA.readPrimArray quo 1 - q2 <- PA.readPrimArray quo 2 - q3 <- PA.readPrimArray quo 3 - pure (Word256 q0 q1 q2 q3) + l <- quot quo u_arr d + case l of + 1 -> do + q0 <- PA.readPrimArray quo 0 + pure (Word256 q0 0 0 0) + 2 -> do + q0 <- PA.readPrimArray quo 0 + q1 <- PA.readPrimArray quo 1 + pure (Word256 q0 q1 0 0) + 3 -> do + q0 <- PA.readPrimArray quo 0 + q1 <- PA.readPrimArray quo 1 + q2 <- PA.readPrimArray quo 2 + pure (Word256 q0 q1 q2 0) + 4 -> do + q0 <- PA.readPrimArray quo 0 + q1 <- PA.readPrimArray quo 1 + q2 <- PA.readPrimArray quo 2 + q3 <- PA.readPrimArray quo 3 + pure (Word256 q0 q1 q2 q3) + _ -> error "ppad-fixed (quot): invalid quotient length" -- | Modulo operation on 'Word256' values. -- @@ -905,7 +925,7 @@ mod u@(Word256 u0 u1 u2 u3) d@(Word256 d0 _ _ _) | otherwise = runST $ do -- allocate quotient quo <- PA.newPrimArray 4 - PA.setPrimArray quo 0 4 0 + PA.setPrimArray quo 0 4 0 -- XX avoid -- allocate dividend, leaving enough space for normalization u_hot <- PA.newPrimArray 5 PA.writePrimArray u_hot 0 u0