Parser.hs (19697B)
1 {-# OPTIONS_HADDOCK prune #-} 2 {-# LANGUAGE OverloadedStrings #-} 3 4 -- | 5 -- Module: Audit.AArch64.Parser 6 -- Copyright: (c) 2025 Jared Tobin 7 -- License: MIT 8 -- Maintainer: jared@ppad.tech 9 -- 10 -- Parser for AArch64 assembly. 11 12 module Audit.AArch64.Parser ( 13 parseAsm 14 , ParseError 15 , showParseError 16 ) where 17 18 import Audit.AArch64.Types 19 import Control.Monad (void) 20 import Data.Char (isAlphaNum) 21 import Data.Text (Text) 22 import qualified Data.Text as T 23 import Data.Void (Void) 24 import Text.Megaparsec hiding (ParseError) 25 import Text.Megaparsec.Char 26 import qualified Text.Megaparsec.Char.Lexer as L 27 28 type Parser = Parsec Void Text 29 type ParseError = ParseErrorBundle Text Void 30 31 -- | Parse assembly source into a list of lines. 32 parseAsm :: Text -> Either ParseError [Line] 33 parseAsm = parse (many pLine <* eof) "<asm>" 34 35 -- | Format a parse error succinctly (no giant input dump). 36 showParseError :: ParseError -> String 37 showParseError = errorBundlePretty 38 39 -- Space consumer (skips horizontal whitespace and comments, not newlines). 40 sc :: Parser () 41 sc = L.space 42 hspace1 43 (L.skipLineComment "//") 44 (L.skipBlockComment "/*" "*/") 45 46 -- Lexeme wrapper. 47 lexeme :: Parser a -> Parser a 48 lexeme = L.lexeme sc 49 50 -- Parse a single line. 51 pLine :: Parser Line 52 pLine = do 53 pos <- getSourcePos 54 let ln = unPos (sourceLine pos) 55 hspace -- Skip leading horizontal whitespace only 56 choice 57 [ pDirectiveLine ln 58 , pLabelOrInstrLine ln 59 , pEmptyLine ln 60 ] 61 62 pEmptyLine :: Int -> Parser Line 63 pEmptyLine ln = do 64 void (optional pComment) 65 void eol -- Must consume actual newline 66 pure (Line ln Nothing Nothing) 67 68 pComment :: Parser () 69 pComment = do 70 void (choice [string "//", string ";", string "@"]) 71 void (takeWhileP Nothing (/= '\n')) 72 73 pDirectiveLine :: Int -> Parser Line 74 pDirectiveLine ln = do 75 void (char '.') 76 void (takeWhileP Nothing (\c -> c /= '\n')) 77 void (optional eol) 78 pure (Line ln Nothing Nothing) 79 80 -- Combined parser for label or instruction lines. 81 -- Parses the leading identifier once, then branches on ':'. 82 pLabelOrInstrLine :: Int -> Parser Line 83 pLabelOrInstrLine ln = do 84 name <- pIdentifier 85 mColon <- optional (char ':') 86 case mColon of 87 Just _ -> do 88 -- Label line: name is the label 89 hspace 90 mInstr <- optional pInstr 91 hspace 92 void (optional pComment) 93 void (optional eol) 94 pure (Line ln (Just name) mInstr) 95 Nothing -> do 96 -- Instruction line: name must look like a valid mnemonic. 97 -- Guard: mnemonics start with alphanumeric or '.' (not '_' etc.) 98 case T.uncons name of 99 Just (c, _) | isAlphaNum c || c == '.' -> do 100 sc 101 instr <- parseByMnemonic (T.toLower name) 102 hspace 103 void (optional pComment) 104 void (optional eol) 105 pure (Line ln Nothing (Just instr)) 106 _ -> fail "not a valid mnemonic" 107 108 pIdentifier :: Parser Text 109 pIdentifier = do 110 c <- satisfy isIdentStart 111 cs <- takeWhileP Nothing isIdentChar 112 pure (T.cons c cs) 113 where 114 isIdentStart c = c == '_' || c == '.' || c == 'L' || 115 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 116 -- GHC NCG uses many special chars in symbols (operators, types, etc.) 117 -- Note: ':' excluded here (label terminator); use pSymbolIdent for refs 118 isIdentChar c = isAlphaNum c || c `elem` ("_.$'#()[]+-*/<>=!|&^~" :: String) 119 120 -- | Like pIdentifier but allows ':' for GHC operator symbols (e.g., _:_con_info) 121 pSymbolIdent :: Parser Text 122 pSymbolIdent = do 123 c <- satisfy isIdentStart 124 cs <- takeWhileP Nothing isIdentChar 125 pure (T.cons c cs) 126 where 127 isIdentStart c = c == '_' || c == '.' || c == 'L' || 128 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 129 -- Include ':' for GHC operator symbols like _:_con_info 130 isIdentChar c = isAlphaNum c || c `elem` ("_.$'#()[]+-*/<>=!|&^~:" :: String) 131 132 pInstr :: Parser Instr 133 pInstr = do 134 mnem <- lexeme pMnemonic 135 parseByMnemonic mnem 136 137 pMnemonic :: Parser Text 138 pMnemonic = do 139 cs <- takeWhile1P Nothing isIdentChar 140 pure (T.toLower cs) 141 where 142 isIdentChar c = isAlphaNum c || c == '.' 143 144 parseByMnemonic :: Text -> Parser Instr 145 parseByMnemonic m = case m of 146 -- Data movement 147 "mov" -> Mov <$> pReg <*> (pComma *> pOperand) 148 "movz" -> Movz <$> pReg <*> (pComma *> pImm) <*> optional (pComma *> pShift) 149 "movk" -> Movk <$> pReg <*> (pComma *> pImm) <*> optional (pComma *> pShift) 150 "movn" -> Movn <$> pReg <*> (pComma *> pImm) <*> optional (pComma *> pShift) 151 152 -- Arithmetic 153 "add" -> Add <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 154 "sub" -> Sub <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 155 "adds" -> Adds <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 156 "subs" -> Subs <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 157 "adc" -> Adc <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 158 "adcs" -> Adcs <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 159 "sbc" -> Sbc <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 160 "neg" -> Neg <$> pReg <*> (pComma *> pOperand) 161 "negs" -> Negs <$> pReg <*> (pComma *> pOperand) 162 "mul" -> Mul <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 163 "mneg" -> Mneg <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 164 "madd" -> Madd <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 165 <*> (pComma *> pReg) 166 "msub" -> Msub <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 167 <*> (pComma *> pReg) 168 "umulh" -> Umulh <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 169 "smulh" -> Smulh <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 170 "udiv" -> Udiv <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 171 "sdiv" -> Sdiv <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 172 173 -- Logical 174 "and" -> And <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 175 "orr" -> Orr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 176 "eor" -> Eor <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 177 "bic" -> Bic <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 178 "mvn" -> Mvn <$> pReg <*> (pComma *> pOperand) 179 "tst" -> Tst <$> pReg <*> (pComma *> pOperand) 180 181 -- Shifts 182 "lsl" -> Lsl <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 183 "lsr" -> Lsr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 184 "asr" -> Asr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 185 "ror" -> Ror <$> pReg <*> (pComma *> pReg) <*> (pComma *> pOperand) 186 187 -- Bit manipulation 188 "ubfx" -> Ubfx <$> pReg <*> (pComma *> pReg) <*> (pComma *> pInt) 189 <*> (pComma *> pInt) 190 "sbfx" -> Sbfx <$> pReg <*> (pComma *> pReg) <*> (pComma *> pInt) 191 <*> (pComma *> pInt) 192 "bfi" -> Bfi <$> pReg <*> (pComma *> pReg) <*> (pComma *> pInt) 193 <*> (pComma *> pInt) 194 "extr" -> Extr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 195 <*> (pComma *> pInt) 196 197 -- Address generation 198 "adr" -> Adr <$> pReg <*> (pComma *> pSymbolRef) 199 "adrp" -> Adrp <$> pReg <*> (pComma *> pSymbolRef) 200 201 -- Load/Store 202 "ldr" -> Ldr <$> pReg <*> (pComma *> pAddrMode) 203 "ldrb" -> Ldrb <$> pReg <*> (pComma *> pAddrMode) 204 "ldrh" -> Ldrh <$> pReg <*> (pComma *> pAddrMode) 205 "ldrsb" -> Ldrsb <$> pReg <*> (pComma *> pAddrMode) 206 "ldrsh" -> Ldrsh <$> pReg <*> (pComma *> pAddrMode) 207 "ldrsw" -> Ldrsw <$> pReg <*> (pComma *> pAddrMode) 208 "ldur" -> Ldur <$> pReg <*> (pComma *> pAddrMode) 209 "str" -> Str <$> pReg <*> (pComma *> pAddrMode) 210 "strb" -> Strb <$> pReg <*> (pComma *> pAddrMode) 211 "strh" -> Strh <$> pReg <*> (pComma *> pAddrMode) 212 "stur" -> Stur <$> pReg <*> (pComma *> pAddrMode) 213 "ldp" -> Ldp <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 214 "stp" -> Stp <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 215 216 -- Acquire/release loads and stores 217 "ldar" -> Ldar <$> pReg <*> (pComma *> pAddrMode) 218 "ldarb" -> Ldarb <$> pReg <*> (pComma *> pAddrMode) 219 "ldarh" -> Ldarh <$> pReg <*> (pComma *> pAddrMode) 220 "stlr" -> Stlr <$> pReg <*> (pComma *> pAddrMode) 221 "stlrb" -> Stlrb <$> pReg <*> (pComma *> pAddrMode) 222 "stlrh" -> Stlrh <$> pReg <*> (pComma *> pAddrMode) 223 224 -- Exclusive loads and stores 225 "ldxr" -> Ldxr <$> pReg <*> (pComma *> pAddrMode) 226 "ldxrb" -> Ldxrb <$> pReg <*> (pComma *> pAddrMode) 227 "ldxrh" -> Ldxrh <$> pReg <*> (pComma *> pAddrMode) 228 "stxr" -> Stxr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 229 "stxrb" -> Stxrb <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 230 "stxrh" -> Stxrh <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 231 232 -- Acquire-exclusive loads and release-exclusive stores 233 "ldaxr" -> Ldaxr <$> pReg <*> (pComma *> pAddrMode) 234 "ldaxrb" -> Ldaxrb <$> pReg <*> (pComma *> pAddrMode) 235 "ldaxrh" -> Ldaxrh <$> pReg <*> (pComma *> pAddrMode) 236 "stlxr" -> Stlxr <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 237 "stlxrb" -> Stlxrb <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 238 "stlxrh" -> Stlxrh <$> pReg <*> (pComma *> pReg) <*> (pComma *> pAddrMode) 239 240 -- Compare and select 241 "cmp" -> Cmp <$> pReg <*> (pComma *> pOperand) 242 "cmn" -> Cmn <$> pReg <*> (pComma *> pOperand) 243 "csel" -> Csel <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 244 <*> (pComma *> pCondCode) 245 "csinc" -> Csinc <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 246 <*> (pComma *> pCondCode) 247 "csinv" -> Csinv <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 248 <*> (pComma *> pCondCode) 249 "csneg" -> Csneg <$> pReg <*> (pComma *> pReg) <*> (pComma *> pReg) 250 <*> (pComma *> pCondCode) 251 "cset" -> Cset <$> pReg <*> (pComma *> pCondCode) 252 "cinc" -> Cinc <$> pReg <*> (pComma *> pReg) <*> (pComma *> pCondCode) 253 254 -- Branches 255 "b" -> B <$> pSymbolRef 256 "b.eq" -> BCond "eq" <$> pSymbolRef 257 "b.ne" -> BCond "ne" <$> pSymbolRef 258 "b.lt" -> BCond "lt" <$> pSymbolRef 259 "b.le" -> BCond "le" <$> pSymbolRef 260 "b.gt" -> BCond "gt" <$> pSymbolRef 261 "b.ge" -> BCond "ge" <$> pSymbolRef 262 "b.hi" -> BCond "hi" <$> pSymbolRef 263 "b.hs" -> BCond "hs" <$> pSymbolRef 264 "b.lo" -> BCond "lo" <$> pSymbolRef 265 "b.ls" -> BCond "ls" <$> pSymbolRef 266 "b.mi" -> BCond "mi" <$> pSymbolRef 267 "b.pl" -> BCond "pl" <$> pSymbolRef 268 "b.vs" -> BCond "vs" <$> pSymbolRef 269 "b.vc" -> BCond "vc" <$> pSymbolRef 270 "b.cc" -> BCond "cc" <$> pSymbolRef 271 "b.cs" -> BCond "cs" <$> pSymbolRef 272 "bl" -> Bl <$> pSymbolRef 273 "blr" -> Blr <$> pReg 274 "br" -> Br <$> pReg 275 "ret" -> Ret <$> optional pReg 276 "cbz" -> Cbz <$> pReg <*> (pComma *> pSymbolRef) 277 "cbnz" -> Cbnz <$> pReg <*> (pComma *> pSymbolRef) 278 "tbz" -> Tbz <$> pReg <*> (pComma *> pInt) <*> (pComma *> pSymbolRef) 279 "tbnz" -> Tbnz <$> pReg <*> (pComma *> pInt) <*> (pComma *> pSymbolRef) 280 281 -- System 282 "nop" -> pure Nop 283 "svc" -> Svc <$> pImm 284 285 -- Fallback: store as Other 286 _ -> Other m <$> pRemainingOperands 287 288 pRemainingOperands :: Parser [Text] 289 pRemainingOperands = do 290 rest <- takeWhileP Nothing (\c -> c /= '\n' && c /= ';' && c /= '/') 291 let trimmed = T.strip rest 292 if T.null trimmed 293 then pure [] 294 else pure (map T.strip (T.splitOn "," trimmed)) 295 296 pComma :: Parser () 297 pComma = void (lexeme (char ',')) 298 299 -- Dispatch on first character to avoid trying all register types. 300 -- All alternatives use try to ensure pReg doesn't consume on total failure, 301 -- allowing callers to try pSymbolRef or other alternatives. 302 pReg :: Parser Reg 303 pReg = lexeme $ do 304 c <- lookAhead anySingle 305 case c of 306 'x' -> try pXReg <|> try (XZR <$ string' "xzr") 307 'X' -> try pXReg <|> try (XZR <$ string' "xzr") 308 'w' -> try pWReg <|> try (WZR <$ string' "wzr") 309 'W' -> try pWReg <|> try (WZR <$ string' "wzr") 310 'd' -> try pDReg 311 'D' -> try pDReg 312 's' -> try pSReg <|> try (SP <$ string' "sp") 313 'S' -> try pSReg <|> try (SP <$ string' "sp") 314 'q' -> try pQReg 315 'Q' -> try pQReg 316 'f' -> try (X29 <$ string' "fp") 317 'F' -> try (X29 <$ string' "fp") 318 'l' -> try (X30 <$ string' "lr") 319 'L' -> try (X30 <$ string' "lr") 320 _ -> fail "not a register" 321 322 pXReg :: Parser Reg 323 pXReg = do 324 void (char' 'x') 325 n <- L.decimal 326 case n :: Int of 327 0 -> pure X0; 1 -> pure X1; 2 -> pure X2; 3 -> pure X3 328 4 -> pure X4; 5 -> pure X5; 6 -> pure X6; 7 -> pure X7 329 8 -> pure X8; 9 -> pure X9; 10 -> pure X10; 11 -> pure X11 330 12 -> pure X12; 13 -> pure X13; 14 -> pure X14; 15 -> pure X15 331 16 -> pure X16; 17 -> pure X17; 18 -> pure X18; 19 -> pure X19 332 20 -> pure X20; 21 -> pure X21; 22 -> pure X22; 23 -> pure X23 333 24 -> pure X24; 25 -> pure X25; 26 -> pure X26; 27 -> pure X27 334 28 -> pure X28; 29 -> pure X29; 30 -> pure X30 335 _ -> fail "invalid x register number" 336 337 pWReg :: Parser Reg 338 pWReg = do 339 void (char' 'w') 340 n <- L.decimal 341 case n :: Int of 342 0 -> pure W0; 1 -> pure W1; 2 -> pure W2; 3 -> pure W3 343 4 -> pure W4; 5 -> pure W5; 6 -> pure W6; 7 -> pure W7 344 8 -> pure W8; 9 -> pure W9; 10 -> pure W10; 11 -> pure W11 345 12 -> pure W12; 13 -> pure W13; 14 -> pure W14; 15 -> pure W15 346 16 -> pure W16; 17 -> pure W17; 18 -> pure W18; 19 -> pure W19 347 20 -> pure W20; 21 -> pure W21; 22 -> pure W22; 23 -> pure W23 348 24 -> pure W24; 25 -> pure W25; 26 -> pure W26; 27 -> pure W27 349 28 -> pure W28; 29 -> pure W29; 30 -> pure W30 350 _ -> fail "invalid w register number" 351 352 pDReg :: Parser Reg 353 pDReg = do 354 void (char' 'd') 355 n <- L.decimal 356 case n :: Int of 357 0 -> pure D0; 1 -> pure D1; 2 -> pure D2; 3 -> pure D3 358 4 -> pure D4; 5 -> pure D5; 6 -> pure D6; 7 -> pure D7 359 8 -> pure D8; 9 -> pure D9; 10 -> pure D10; 11 -> pure D11 360 12 -> pure D12; 13 -> pure D13; 14 -> pure D14; 15 -> pure D15 361 16 -> pure D16; 17 -> pure D17; 18 -> pure D18; 19 -> pure D19 362 20 -> pure D20; 21 -> pure D21; 22 -> pure D22; 23 -> pure D23 363 24 -> pure D24; 25 -> pure D25; 26 -> pure D26; 27 -> pure D27 364 28 -> pure D28; 29 -> pure D29; 30 -> pure D30; 31 -> pure D31 365 _ -> fail "invalid d register number" 366 367 pSReg :: Parser Reg 368 pSReg = do 369 void (char' 's') 370 n <- L.decimal 371 case n :: Int of 372 0 -> pure S0; 1 -> pure S1; 2 -> pure S2; 3 -> pure S3 373 4 -> pure S4; 5 -> pure S5; 6 -> pure S6; 7 -> pure S7 374 8 -> pure S8; 9 -> pure S9; 10 -> pure S10; 11 -> pure S11 375 12 -> pure S12; 13 -> pure S13; 14 -> pure S14; 15 -> pure S15 376 16 -> pure S16; 17 -> pure S17; 18 -> pure S18; 19 -> pure S19 377 20 -> pure S20; 21 -> pure S21; 22 -> pure S22; 23 -> pure S23 378 24 -> pure S24; 25 -> pure S25; 26 -> pure S26; 27 -> pure S27 379 28 -> pure S28; 29 -> pure S29; 30 -> pure S30; 31 -> pure S31 380 _ -> fail "invalid s register number" 381 382 pQReg :: Parser Reg 383 pQReg = do 384 void (char' 'q') 385 n <- L.decimal 386 case n :: Int of 387 0 -> pure Q0; 1 -> pure Q1; 2 -> pure Q2; 3 -> pure Q3 388 4 -> pure Q4; 5 -> pure Q5; 6 -> pure Q6; 7 -> pure Q7 389 8 -> pure Q8; 9 -> pure Q9; 10 -> pure Q10; 11 -> pure Q11 390 12 -> pure Q12; 13 -> pure Q13; 14 -> pure Q14; 15 -> pure Q15 391 16 -> pure Q16; 17 -> pure Q17; 18 -> pure Q18; 19 -> pure Q19 392 20 -> pure Q20; 21 -> pure Q21; 22 -> pure Q22; 23 -> pure Q23 393 24 -> pure Q24; 25 -> pure Q25; 26 -> pure Q26; 27 -> pure Q27 394 28 -> pure Q28; 29 -> pure Q29; 30 -> pure Q30; 31 -> pure Q31 395 _ -> fail "invalid q register number" 396 397 pOperand :: Parser Operand 398 pOperand = choice 399 [ try (OpAddr <$> pAddrMode) 400 , try pRegOrShiftedReg 401 , OpImm <$> pImm 402 , OpLabel <$> pSymbolRef 403 ] 404 405 -- Parse register once, then optionally check for shift suffix. 406 pRegOrShiftedReg :: Parser Operand 407 pRegOrShiftedReg = do 408 r <- pReg 409 mShift <- optional (pComma *> pShift) 410 pure $ case mShift of 411 Just sh -> OpShiftedReg r sh 412 Nothing -> OpReg r 413 414 pImm :: Parser Integer 415 pImm = lexeme $ do 416 void (optional (char '#')) 417 signed pNumber 418 where 419 pNumber = try pHex <|> L.decimal 420 pHex = string' "0x" *> L.hexadecimal -- case-insensitive 421 422 signed :: Parser Integer -> Parser Integer 423 signed p = do 424 neg <- optional (char '-') 425 n <- p 426 case neg of 427 Just _ -> pure (negate n) 428 Nothing -> pure n 429 430 pInt :: Parser Int 431 pInt = lexeme $ do 432 void (optional (char '#')) 433 fromInteger <$> signed L.decimal 434 435 pShift :: Parser Shift 436 pShift = lexeme $ choice 437 [ LSL <$> (string' "lsl" *> sc *> pShiftAmt) 438 , LSR <$> (string' "lsr" *> sc *> pShiftAmt) 439 , ASR <$> (string' "asr" *> sc *> pShiftAmt) 440 ] 441 442 pShiftAmt :: Parser Int 443 pShiftAmt = do 444 void (optional (char '#')) 445 fromInteger <$> L.decimal 446 447 pSymbolRef :: Parser Text 448 pSymbolRef = lexeme $ do 449 -- Handle =literal, @PAGE, @PAGEOFF suffixes, etc. 450 hasPrefix <- option False (True <$ char '=') 451 name <- pSymbolIdent -- Use pSymbolIdent to allow ':' in GHC symbols 452 suffix <- optional pSymbolSuffix 453 let base = if hasPrefix then T.cons '=' name else name 454 pure (maybe base (base <>) suffix) 455 456 pSymbolSuffix :: Parser Text 457 pSymbolSuffix = do 458 void (char '@') 459 suf <- takeWhile1P Nothing isAlphaNum 460 -- Optional addend like +2 or -3 461 addend <- optional pAddend 462 let base = T.cons '@' suf 463 pure (maybe base (base <>) addend) 464 465 pAddend :: Parser Text 466 pAddend = do 467 sign <- satisfy (\c -> c == '+' || c == '-') 468 digits <- takeWhile1P Nothing isDigit 469 pure (T.cons sign digits) 470 where 471 isDigit c = c >= '0' && c <= '9' 472 473 pCondCode :: Parser Text 474 pCondCode = lexeme $ takeWhile1P Nothing isAlphaNum 475 476 -- Dispatch by first char: '=' for literal, '[' for bracket address. 477 -- No try needed since char '=' doesn't consume on failure. 478 pAddrMode :: Parser AddrMode 479 pAddrMode = choice 480 [ pLiteralAddr 481 , pBracketAddr 482 ] 483 484 pLiteralAddr :: Parser AddrMode 485 pLiteralAddr = Literal <$> (char '=' *> pIdentifier) 486 487 pBracketAddr :: Parser AddrMode 488 pBracketAddr = lexeme $ do 489 void (char '[') 490 sc 491 base <- pReg 492 sc 493 innerMode <- optional (pComma *> pAddrModeInner) 494 sc 495 void (char ']') 496 -- Check for pre-index ! or post-index offset 497 preIdx <- option False (True <$ char '!') 498 postIdx <- optional (try pPostIndex) 499 case (innerMode, preIdx, postIdx) of 500 (Nothing, False, Nothing) -> pure (BaseImm base 0) 501 (Nothing, False, Just imm) -> pure (PostIndex base imm) 502 (Just (Left (Left imm)), True, Nothing) -> pure (PreIndex base imm) 503 (Just (Left (Left imm)), False, Nothing) -> pure (BaseImm base imm) 504 (Just (Left (Right sym)), False, Nothing) -> pure (BaseSymbol base sym) 505 (Just (Right (r, Nothing)), False, Nothing) -> pure (BaseReg base r) 506 (Just (Right (r, Just (Left sh))), False, Nothing) -> pure (BaseRegShift base r sh) 507 (Just (Right (r, Just (Right ext))), False, Nothing) -> pure (BaseRegExtend base r ext) 508 _ -> fail "invalid addressing mode" 509 510 -- | Inner part of [base, ...] after the comma. 511 -- Left (Left imm) = immediate offset 512 -- Left (Right sym) = symbol offset (e.g., @PAGEOFF) 513 -- Right (reg, mod) = register with optional shift/extend 514 type AddrInner = Either (Either Integer Text) (Reg, Maybe (Either Shift Extend)) 515 516 -- Parse the inner part of [base, ...] after the comma 517 pAddrModeInner :: Parser AddrInner 518 pAddrModeInner = choice 519 [ Left . Left <$> pImm 520 , try (Right <$> pRegWithModifier) 521 , Left . Right <$> pSymbolRef 522 ] 523 524 pRegWithModifier :: Parser (Reg, Maybe (Either Shift Extend)) 525 pRegWithModifier = do 526 r <- pReg 527 modifier <- optional (pComma *> pShiftOrExtend) 528 pure (r, modifier) 529 530 pShiftOrExtend :: Parser (Either Shift Extend) 531 pShiftOrExtend = choice 532 [ Left <$> pShift 533 , Right <$> pExtend 534 ] 535 536 pExtend :: Parser Extend 537 pExtend = lexeme $ do 538 ext <- choice 539 [ UXTW <$ string' "uxtw" 540 , SXTW <$ string' "sxtw" 541 , UXTX <$ string' "uxtx" 542 , SXTX <$ string' "sxtx" 543 ] 544 -- Optionally consume shift amount (e.g., #3) - ignored for taint analysis 545 void $ optional (sc *> pImm) 546 pure ext 547 548 pPostIndex :: Parser Integer 549 pPostIndex = do 550 sc 551 void (char ',') 552 sc 553 pImm