commit e401de2b86f7470eedfbf537a6cbcb57614e1b20
parent 26bc887b09a314b71d6cb0d3831e9f9486ae4552
Author: Jared Tobin <jared@jtobin.io>
Date: Sun, 25 Jan 2026 17:53:40 +0400
Merge impl/decoding: transaction parsing
Diffstat:
| M | lib/Bitcoin/Prim/Tx.hs | | | 228 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- |
| A | plans/IMPL1.md | | | 208 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 432 insertions(+), 4 deletions(-)
diff --git a/lib/Bitcoin/Prim/Tx.hs b/lib/Bitcoin/Prim/Tx.hs
@@ -33,6 +33,7 @@ module Bitcoin.Prim.Tx (
, txid
) where
+import Data.Bits ((.|.), shiftL)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Base16 as B16
import qualified Data.ByteString.Builder as BSB
@@ -95,10 +96,6 @@ to_bytes tx@Tx {..}
<> foldMap put_witness tx_witnesses
<> put_word32_le tx_locktime
--- | Parse a transaction from bytes.
-from_bytes :: BS.ByteString -> Maybe Tx
-from_bytes = error "Bitcoin.Prim.Tx.from_bytes: not yet implemented"
-
-- | Serialise a transaction to legacy format (no witness data).
--
-- Used for txid computation.
@@ -189,6 +186,229 @@ put_witness (Witness items) =
<> BSB.byteString item
{-# INLINE put_witness #-}
+-- decoding --------------------------------------------------------------------
+
+-- | Decode a serialised transaction.
+--
+-- The first four bytes are the little-endian version. When they are
+-- immediately followed by the marker byte 0x00 and the flag byte 0x01
+-- the remainder is handed to the segwit parser; otherwise it is handed
+-- to the legacy parser. Yields 'Nothing' for inputs shorter than four
+-- bytes, or whenever the selected parser fails.
+from_bytes :: BS.ByteString -> Maybe Tx
+from_bytes !bs
+  | BS.length bs < 4 = Nothing
+  | is_segwit        = parse_segwit bs version 6
+  | otherwise        = parse_legacy bs version 4
+  where
+    -- only demanded once the length guard above has passed
+    version = get_word32_le bs 0
+    -- marker and flag sit at offsets 4 and 5, right after the version
+    is_segwit =
+         BS.length bs > 5
+      && BS.index bs 4 == 0x00
+      && BS.index bs 5 == 0x01
+
+-- Parse a legacy transaction (no witness data).
+--
+-- @off0@ is the offset just past the 4-byte version. The body is read
+-- as: input count, inputs, output count, outputs, 4-byte locktime.
+-- Yields 'Nothing' if any component fails to decode, if a declared
+-- count exceeds the buffer length, or if the locktime does not end
+-- exactly at the end of the buffer.
+parse_legacy :: BS.ByteString -> Word32 -> Int -> Maybe Tx
+parse_legacy !bs !version !off0 = do
+  let !len = BS.length bs
+      -- Every encoded input or output occupies at least one byte, so a
+      -- count above the buffer length can never be satisfied. Checking
+      -- it as a Word64 also stops 'fromIntegral' from wrapping a huge
+      -- count to a negative Int, which 'get_many' would treat as "no
+      -- items" and thereby accept a malformed transaction.
+      plausible :: Word64 -> Bool
+      plausible n = n <= fromIntegral len
+  -- input count
+  (input_count, off1) <- get_compact bs off0
+  guard (plausible input_count)
+  -- inputs
+  (inputs, off2) <- get_many get_txin bs off1 (fromIntegral input_count)
+  -- output count
+  (output_count, off3) <- get_compact bs off2
+  guard (plausible output_count)
+  -- outputs
+  (outputs, off4) <- get_many get_txout bs off3 (fromIntegral output_count)
+  -- locktime: must be exactly the final 4 bytes (no trailing data)
+  guard (off4 + 4 == len)
+  let !locktime = get_word32_le bs off4
+  pure $! Tx version inputs outputs [] locktime
+
+-- Parse a segwit transaction (with witness data).
+--
+-- @off0@ is the offset just past the marker and flag bytes. The body is
+-- read as: input count, inputs, output count, outputs, one witness
+-- stack per input, 4-byte locktime. Yields 'Nothing' if any component
+-- fails to decode, if a declared count exceeds the buffer length, or if
+-- the locktime does not end exactly at the end of the buffer.
+parse_segwit :: BS.ByteString -> Word32 -> Int -> Maybe Tx
+parse_segwit !bs !version !off0 = do
+  let !len = BS.length bs
+      -- Every encoded input, output or witness stack occupies at least
+      -- one byte, so a count above the buffer length can never be
+      -- satisfied. Checking it as a Word64 also stops 'fromIntegral'
+      -- from wrapping a huge count to a negative Int, which 'get_many'
+      -- would treat as "no items" and thereby accept a malformed
+      -- transaction.
+      plausible :: Word64 -> Bool
+      plausible n = n <= fromIntegral len
+  -- input count
+  (input_count, off1) <- get_compact bs off0
+  guard (plausible input_count)
+  let !n_inputs = fromIntegral input_count :: Int
+  -- inputs
+  (inputs, off2) <- get_many get_txin bs off1 n_inputs
+  -- output count
+  (output_count, off3) <- get_compact bs off2
+  guard (plausible output_count)
+  -- outputs
+  (outputs, off4) <- get_many get_txout bs off3 (fromIntegral output_count)
+  -- witnesses (one stack per input, in input order)
+  (witnesses, off5) <- get_many get_witness bs off4 n_inputs
+  -- locktime: must be exactly the final 4 bytes (no trailing data)
+  guard (off5 + 4 == len)
+  let !locktime = get_word32_le bs off5
+  pure $! Tx version inputs outputs witnesses locktime
+
+-- internal helpers ------------------------------------------------------------
+
+-- | Lift a condition into 'Maybe': @Just ()@ when it holds, 'Nothing'
+-- otherwise, so that a failed check short-circuits a @do@ block.
+guard :: Bool -> Maybe ()
+guard ok = if ok then Just () else Nothing
+{-# INLINE guard #-}
+
+-- | Read four bytes starting at @off@ as a little-endian 'Word32'.
+--
+-- No bounds check is performed: the caller must guarantee that offsets
+-- @off@ through @off + 3@ are valid indices into the buffer.
+get_word32_le :: BS.ByteString -> Int -> Word32
+get_word32_le !bs !off =
+      byte 0
+  .|. (byte 1 `shiftL` 8)
+  .|. (byte 2 `shiftL` 16)
+  .|. (byte 3 `shiftL` 24)
+  where
+    -- the i-th byte past the offset, widened to the result type
+    byte :: Int -> Word32
+    byte i = fromIntegral (BS.index bs (off + i))
+{-# INLINE get_word32_le #-}
+
+-- | Read eight bytes starting at @off@ as a little-endian 'Word64'.
+--
+-- No bounds check is performed: the caller must guarantee that offsets
+-- @off@ through @off + 7@ are valid indices into the buffer.
+get_word64_le :: BS.ByteString -> Int -> Word64
+get_word64_le !bs !off =
+      byte 0
+  .|. (byte 1 `shiftL` 8)
+  .|. (byte 2 `shiftL` 16)
+  .|. (byte 3 `shiftL` 24)
+  .|. (byte 4 `shiftL` 32)
+  .|. (byte 5 `shiftL` 40)
+  .|. (byte 6 `shiftL` 48)
+  .|. (byte 7 `shiftL` 56)
+  where
+    -- the i-th byte past the offset, widened to the result type
+    byte :: Int -> Word64
+    byte i = fromIntegral (BS.index bs (off + i))
+{-# INLINE get_word64_le #-}
+
+-- | Read two bytes starting at @off@ as a little-endian 16-bit value.
+--
+-- The result is returned widened to 'Word64'. No bounds check is
+-- performed: the caller must guarantee that @off@ and @off + 1@ are
+-- valid indices into the buffer.
+get_word16_le :: BS.ByteString -> Int -> Word64
+get_word16_le !bs !off = lo .|. (hi `shiftL` 8)
+  where
+    lo, hi :: Word64
+    lo = fromIntegral (BS.index bs off)
+    hi = fromIntegral (BS.index bs (off + 1))
+{-# INLINE get_word16_le #-}
+
+-- | Decode a compactSize (Bitcoin's variable-length integer) at @off@.
+--
+-- Returns @(value, new_offset)@, where @new_offset@ is the position
+-- just past the encoding. The first byte is a tag: values up to 0xfc
+-- are the integer itself; 0xfd, 0xfe and 0xff announce a 2-, 4- or
+-- 8-byte little-endian payload respectively.
+--
+-- Yields 'Nothing' when @off@ lies outside the buffer (including
+-- negative offsets, which would otherwise make 'BS.index' throw), when
+-- the payload is truncated, or when the encoding is not minimal, i.e.
+-- the value would have fit in a shorter form.
+get_compact :: BS.ByteString -> Int -> Maybe (Word64, Int)
+get_compact !bs !off
+  | off < 0 || off >= len = Nothing
+  | otherwise = case BS.index bs off of
+      0xfd -> wide 2 0xfd (get_word16_le bs (off + 1))
+      0xfe -> wide 4 0x10000 (fromIntegral (get_word32_le bs (off + 1)))
+      0xff -> wide 8 0x100000000 (get_word64_le bs (off + 1))
+      -- single byte: the tag is the value
+      tag  -> Just (fromIntegral tag, off + 1)
+  where
+    !len = BS.length bs
+
+    -- A payload of @width@ bytes following the tag. It is accepted only
+    -- if it is fully present and not below @lo@, the smallest value
+    -- that genuinely needs this width. @val@ is passed lazily and is
+    -- only demanded after the length guard, so the unchecked word
+    -- readers never run on a truncated payload.
+    wide :: Int -> Word64 -> Word64 -> Maybe (Word64, Int)
+    wide width lo val
+      | len - off <= width = Nothing  -- truncated payload
+      | val < lo           = Nothing  -- non-minimal encoding
+      | otherwise          = Just (val, off + 1 + width)
+{-# INLINE get_compact #-}
+
+-- | Decode an outpoint at @off@: a 32-byte txid followed by a 4-byte
+-- little-endian output index.
+--
+-- Returns @(OutPoint, new_offset)@ with @new_offset = off + 36@. The
+-- txid bytes are taken exactly as they appear in the buffer. Yields
+-- 'Nothing' when fewer than 36 bytes remain at @off@, or when @off@ is
+-- negative (which would otherwise pass the length check and make the
+-- unchecked index read throw).
+get_outpoint :: BS.ByteString -> Int -> Maybe (OutPoint, Int)
+get_outpoint !bs !off
+  | off < 0 || BS.length bs - off < 36 = Nothing
+  | otherwise =
+      let !txid_bytes = BS.take 32 (BS.drop off bs)
+          !vout = get_word32_le bs (off + 32)
+      in  Just (OutPoint (TxId txid_bytes) vout, off + 36)
+{-# INLINE get_outpoint #-}
+
+-- | Decode a transaction input at @off0@.
+--
+-- Layout: 36-byte outpoint, compactSize scriptSig length, scriptSig
+-- bytes, 4-byte little-endian sequence number. Returns
+-- @(TxIn, new_offset)@, or 'Nothing' if any part is truncated or
+-- malformed.
+get_txin :: BS.ByteString -> Int -> Maybe (TxIn, Int)
+get_txin !bs !off0 = do
+  let !len = BS.length bs
+  -- outpoint: 36 bytes
+  (outpoint, off1) <- get_outpoint bs off0
+  -- scriptSig length
+  (script_len, off2) <- get_compact bs off1
+  -- Compare the declared length, still as a Word64, against the bytes
+  -- actually left. Converting to Int first and testing @off2 + slen@
+  -- would let a huge length wrap negative (or overflow the sum), pass
+  -- the check, and move the offset backwards or out of range.
+  let !avail = len - off2
+  guard (avail >= 0 && script_len <= fromIntegral avail)
+  -- scriptSig bytes; the conversion is safe now that the length is bounded
+  let !slen = fromIntegral script_len
+      !script_sig = BS.take slen (BS.drop off2 bs)
+      !off3 = off2 + slen
+  -- sequence: 4 bytes
+  guard (len - off3 >= 4)
+  let !seqn = get_word32_le bs off3
+  pure (TxIn outpoint script_sig seqn, off3 + 4)
+
+-- | Decode a transaction output at @off0@.
+--
+-- Layout: 8-byte little-endian value, compactSize scriptPubKey length,
+-- scriptPubKey bytes. Returns @(TxOut, new_offset)@, or 'Nothing' if
+-- any part is truncated or malformed.
+get_txout :: BS.ByteString -> Int -> Maybe (TxOut, Int)
+get_txout !bs !off0 = do
+  let !len = BS.length bs
+  -- value: 8 bytes. A negative offset is rejected explicitly because
+  -- the word reader below does no bounds checking of its own.
+  guard (off0 >= 0 && len - off0 >= 8)
+  let !value = get_word64_le bs off0
+  -- scriptPubKey length
+  (script_len, off2) <- get_compact bs (off0 + 8)
+  -- Compare the declared length, still as a Word64, against the bytes
+  -- actually left. Converting to Int first and testing @off2 + slen@
+  -- would let a huge length wrap negative (or overflow the sum), pass
+  -- the check, and move the offset backwards or out of range.
+  let !avail = len - off2
+  guard (avail >= 0 && script_len <= fromIntegral avail)
+  -- scriptPubKey bytes; the conversion is safe now that the length is bounded
+  let !slen = fromIntegral script_len
+      !script_pk = BS.take slen (BS.drop off2 bs)
+  pure (TxOut value script_pk, off2 + slen)
+
+-- | Decode the witness stack for one input at @off0@.
+--
+-- Layout: compactSize item count, then that many length-prefixed
+-- items. Returns @(Witness, new_offset)@, or 'Nothing' if the stack is
+-- truncated or malformed.
+get_witness :: BS.ByteString -> Int -> Maybe (Witness, Int)
+get_witness !bs !off0 = do
+  -- stack item count
+  (item_count, off1) <- get_compact bs off0
+  -- Each item occupies at least one byte (its length prefix), so a
+  -- count above the remaining byte count can never be satisfied.
+  -- Checking it as a Word64 also stops 'fromIntegral' from wrapping a
+  -- huge count to a negative Int, which 'get_many' would treat as an
+  -- empty stack.
+  let !avail = BS.length bs - off1
+  guard (avail >= 0 && item_count <= fromIntegral avail)
+  -- each item: length + bytes
+  (items, off2) <- get_many get_witness_item bs off1 (fromIntegral item_count)
+  pure (Witness items, off2)
+
+-- | Decode a single witness stack item at @off0@: a compactSize length
+-- followed by that many bytes.
+--
+-- Returns @(item, new_offset)@, or 'Nothing' if the length prefix is
+-- malformed or the item runs past the end of the buffer.
+get_witness_item :: BS.ByteString -> Int -> Maybe (BS.ByteString, Int)
+get_witness_item !bs !off0 = do
+  (item_len, off1) <- get_compact bs off0
+  -- Compare the declared length, still as a Word64, against the bytes
+  -- actually left. Converting to Int first and testing @off1 + ilen@
+  -- would let a huge length wrap negative (or overflow the sum), pass
+  -- the check, and move the offset backwards or out of range.
+  let !avail = BS.length bs - off1
+  guard (avail >= 0 && item_len <= fromIntegral avail)
+  -- the conversion is safe now that the length is bounded
+  let !ilen = fromIntegral item_len
+      !item = BS.take ilen (BS.drop off1 bs)
+  pure (item, off1 + ilen)
+
+-- | Run a decoder a fixed number of times, threading the offset.
+--
+-- Starting at the given offset, applies @getter@ @n@ times, feeding
+-- each returned offset into the next call. Returns the decoded items in
+-- decoding order together with the final offset, or 'Nothing' as soon
+-- as any single decode fails. A count of zero or less decodes nothing
+-- and returns the starting offset unchanged.
+get_many :: (BS.ByteString -> Int -> Maybe (a, Int))
+         -> BS.ByteString -> Int -> Int -> Maybe ([a], Int)
+get_many getter !bs = loop []
+  where
+    -- items are accumulated newest-first and reversed once at the end
+    loop !items !pos !remaining
+      | remaining > 0 = case getter bs pos of
+          Nothing           -> Nothing
+          Just (item, pos') -> loop (item : items) pos' (remaining - 1)
+      | otherwise = Just (reverse items, pos)
+{-# INLINE get_many #-}
+
-- txid ------------------------------------------------------------------------
-- | Compute the transaction ID (double SHA256 of legacy serialisation).
diff --git a/plans/IMPL1.md b/plans/IMPL1.md
@@ -0,0 +1,208 @@
+# IMPL1 - Core Types, Serialisation, and TxId
+
+## Goal
+
+Implement core transaction types, binary serialisation (legacy and segwit
+formats), and txid computation.
+
+## Scope
+
+- `Bitcoin.Prim.Tx` module: types and serialisation
+- CompactSize (varint) encoding/decoding
+- Legacy and segwit tx formats
+- TxId computation via double SHA256
+
+## Types
+
+Types are already defined in the module skeleton. Key points:
+
+- `TxId`: 32-byte ByteString (stored as-is, displayed reversed per convention)
+- `OutPoint`: TxId + Word32 vout
+- `TxIn`: OutPoint + scriptSig + sequence
+- `TxOut`: Word64 value + scriptPubKey
+- `Witness`: list of stack items (ByteStrings)
+- `Tx`: version + inputs + outputs + witnesses + locktime
+
+## CompactSize Encoding
+
+Internal helpers for Bitcoin's variable-length integer format:
+
+```haskell
+-- | Encode a Word64 as compactSize.
+put_compact :: Word64 -> BS.ByteString
+
+-- | Decode compactSize, returning (value, bytes_consumed).
+get_compact :: BS.ByteString -> Maybe (Word64, Int)
+```
+
+Encoding rules:
+- 0x00-0xfc: 1 byte (value itself)
+- 0xfd-0xffff: 0xfd ++ 2 bytes LE
+- 0x10000-0xffffffff: 0xfe ++ 4 bytes LE
+- larger: 0xff ++ 8 bytes LE
+
+## Serialisation Implementation
+
+### Encoding (to_bytes)
+
+Build output via `Data.ByteString.Builder` or direct unsafe writes:
+
+```
+to_bytes tx:
+ if has_witnesses tx:
+ put_word32_le version
+ put_byte 0x00 -- marker
+ put_byte 0x01 -- flag
+ put_compact (length inputs)
+ for each input: put_txin
+ put_compact (length outputs)
+ for each output: put_txout
+ for each witness: put_witness
+ put_word32_le locktime
+ else:
+ put_word32_le version
+ put_compact (length inputs)
+ for each input: put_txin
+ put_compact (length outputs)
+ for each output: put_txout
+ put_word32_le locktime
+```
+
+Component encoders:
+```haskell
+put_txin :: TxIn -> Builder
+ -- outpoint (32 + 4 bytes) + scriptSig (compact + bytes) + sequence (4)
+
+put_txout :: TxOut -> Builder
+ -- value (8 bytes LE) + scriptPubKey (compact + bytes)
+
+put_witness :: Witness -> Builder
+ -- compact count + for each item: compact len + bytes
+```
+
+### Decoding (from_bytes)
+
+Parse with explicit offset tracking or a simple parser state:
+
+```
+from_bytes bs:
+ version <- get_word32_le
+ peek next byte:
+ if 0x00 and following byte is 0x01:
+ skip marker/flag
+ parse as segwit
+ else:
+ parse as legacy
+
+ -- segwit parse:
+ input_count <- get_compact
+ inputs <- replicateM input_count get_txin
+ output_count <- get_compact
+ outputs <- replicateM output_count get_txout
+ witnesses <- replicateM input_count get_witness
+ locktime <- get_word32_le
+
+ -- legacy parse:
+ input_count <- get_compact
+ inputs <- replicateM input_count get_txin
+ output_count <- get_compact
+ outputs <- replicateM output_count get_txout
+ locktime <- get_word32_le
+ witnesses = []
+```
+
+Component decoders:
+```haskell
+get_txin :: Parser TxIn
+get_txout :: Parser TxOut
+get_witness :: Parser Witness
+```
+
+### Legacy Serialisation
+
+```haskell
+to_bytes_legacy :: Tx -> BS.ByteString
+ -- Always legacy format (no marker/flag/witnesses)
+ -- Used for txid computation
+```
+
+## TxId Computation
+
+```haskell
+txid :: Tx -> TxId
+txid tx = TxId (SHA256.hash (SHA256.hash (to_bytes_legacy tx)))
+```
+
+The result is the raw 32-byte hash. Display convention (reversed hex) is
+separate from storage.
+
+## Internal Helpers
+
+Little-endian word encoding/decoding:
+
+```haskell
+put_word32_le :: Word32 -> Builder
+put_word64_le :: Word64 -> Builder
+get_word32_le :: BS.ByteString -> Int -> Maybe Word32
+get_word64_le :: BS.ByteString -> Int -> Maybe Word64
+```
+
+Use `Data.Bits` shifts or `Foreign.Storable` with explicit byte order.
+
+## Work Items
+
+### Phase 1: Encoding (independent)
+
+1. Implement `put_compact` (compactSize encoding)
+2. Implement `put_word32_le`, `put_word64_le`
+3. Implement `put_txin`, `put_txout`, `put_witness`
+4. Implement `to_bytes` and `to_bytes_legacy`
+
+### Phase 2: Decoding (independent of Phase 1)
+
+1. Implement `get_compact` (compactSize decoding)
+2. Implement `get_word32_le`, `get_word64_le`
+3. Implement `get_txin`, `get_txout`, `get_witness`
+4. Implement `from_bytes` with format detection
+
+### Phase 3: TxId (depends on Phase 1)
+
+1. Implement `txid` using ppad-sha256
+
+### Phase 4: Base16 wrappers
+
+1. `to_base16` wraps `to_bytes` with B16.encode
+2. `from_base16` decodes hex then calls `from_bytes`
+
+## Tests
+
+- Round-trip: `from_bytes (to_bytes tx) == Just tx`
+- Known vectors: parse real Bitcoin transactions, verify txid
+- Edge cases: empty inputs/outputs, max-size compactSize values
+- Legacy vs segwit format detection
+
+## Test Vectors
+
+### Simple legacy tx (1 input, 1 output)
+
+Use a known mainnet transaction, e.g., the pizza transaction or a
+simple testnet tx with known txid.
+
+### Segwit tx (P2WPKH)
+
+Parse a native segwit transaction, verify witnesses preserved, verify
+txid matches (should exclude witnesses).
+
+### Sources
+
+- BIP143 test vectors (have full tx hex + expected sighash)
+- Bitcoin Core tx_valid.json
+- Manually hex-dump transactions from block explorers
+
+## Notes
+
+- All integers are little-endian except where noted
+- TxId is stored in natural byte order (not display order)
+- Witnesses list length must equal inputs list length for segwit
+- Empty witness list indicates legacy transaction
+- CompactSize must use minimal encoding (enforced on decode)