base16_arm.c (4937B)
#include <stddef.h>
#include <stdint.h>

#if defined(__aarch64__)
#include <arm_neon.h>
#endif

/* lowercase ASCII hex character for each nibble value 0..15 */
static const uint8_t hex_lut[16] = {
    '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
};

/*
 * Map one ASCII hex char ('0'..'9', 'a'..'f', 'A'..'F') to its value
 * 0..15.  Any other byte yields 0x80, a sentinel whose top bit can be
 * OR-accumulated by callers to detect invalid input.
 */
static inline uint8_t scalar_nib(uint8_t c) {
    if (c >= '0' && c <= '9') return (uint8_t)(c - '0');
    if (c >= 'a' && c <= 'f') return (uint8_t)(c - 'a' + 10);
    if (c >= 'A' && c <= 'F') return (uint8_t)(c - 'A' + 10);
    return 0x80; /* invalid sentinel */
}

/*
 * Scalar encoder for input bytes [from, to): writes dst[2*i] and
 * dst[2*i + 1] for each i in that range.  Used as the NEON tail and as
 * the complete implementation on targets without the NEON kernel.
 */
static void encode_scalar(const uint8_t *src, uint8_t *dst,
                          size_t from, size_t to) {
    for (size_t i = from; i < to; i++) {
        uint8_t b = src[i];
        dst[2 * i]     = hex_lut[b >> 4];
        dst[2 * i + 1] = hex_lut[b & 0x0f];
    }
}

/*
 * Scalar decoder for output bytes [from, to): reads src[2*i] and
 * src[2*i + 1] for each i in that range.  Returns 1 if every char was
 * a valid hex digit, 0 otherwise.  The whole range is always processed;
 * bytes produced from invalid chars are unspecified.
 */
static int decode_scalar(const uint8_t *src, uint8_t *dst,
                         size_t from, size_t to) {
    uint8_t bad = 0;

    for (size_t i = from; i < to; i++) {
        uint8_t n0 = scalar_nib(src[2 * i]);
        uint8_t n1 = scalar_nib(src[2 * i + 1]);
        /* only the sentinel has bit 7 set, so this flags invalid chars */
        bad |= (uint8_t)((n0 | n1) & 0x80);
        dst[i] = (uint8_t)((n0 << 4) | (n1 & 0x0f));
    }
    return bad == 0;
}

#if defined(__aarch64__)

/*
 * Encode 'l' input bytes at 'src' into 2*l ASCII hex bytes at 'dst'.
 * Output is lowercase.
 *
 * NEON kernel processes 16 input bytes per iteration:
 *   - vshrq_n_u8 / vandq_u8 split the high and low nibbles
 *   - vqtbl1q_u8 looks up each nibble in 'hex_lut' (the tbl instr)
 *   - vst2q_u8 stores the two char vectors interleaved, producing
 *     [h0 l0 h1 l1 ... h15 l15] which is exactly the hex output
 *
 * A scalar tail finishes the final (l mod 16) bytes.
 */
void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
    uint8x16_t lut = vld1q_u8(hex_lut);
    uint8x16_t mask_lo = vdupq_n_u8(0x0f);
    size_t i = 0;

    for (; i + 16 <= l; i += 16) {
        uint8x16_t in = vld1q_u8(src + i);
        uint8x16_t hi = vshrq_n_u8(in, 4);
        uint8x16_t lo = vandq_u8(in, mask_lo);
        uint8x16x2_t pair = { { vqtbl1q_u8(lut, hi),
                                vqtbl1q_u8(lut, lo) } };
        vst2q_u8(dst + 2 * i, pair);
    }

    encode_scalar(src, dst, i, l);
}

/*
 * Convert 16 ASCII hex chars to nibble values (0..15) in 'nib'.
 * Each lane of 'bad' is set to 0xff if the corresponding input is
 * not a valid hex digit ('0'..'9', 'a'..'f', 'A'..'F'), 0x00 if it
 * is.  Case-insensitive.
 */
static inline void ascii_to_nibble(uint8x16_t c,
                                   uint8x16_t *nib,
                                   uint8x16_t *bad) {
    uint8x16_t is_digit = vandq_u8(vcgeq_u8(c, vdupq_n_u8('0')),
                                   vcleq_u8(c, vdupq_n_u8('9')));
    uint8x16_t is_lower = vandq_u8(vcgeq_u8(c, vdupq_n_u8('a')),
                                   vcleq_u8(c, vdupq_n_u8('f')));
    uint8x16_t is_upper = vandq_u8(vcgeq_u8(c, vdupq_n_u8('A')),
                                   vcleq_u8(c, vdupq_n_u8('F')));

    /* offset to subtract from c: '0' (0x30), 'a'-10 (0x57), 'A'-10
     * (0x37); zero in lanes that aren't valid hex (the resulting
     * nibble in those lanes is garbage but 'bad' flags them).  The
     * three class masks are mutually exclusive, so OR-ing the masked
     * constants selects exactly one offset per lane. */
    uint8x16_t off = vorrq_u8(
        vandq_u8(is_digit, vdupq_n_u8(0x30)),
        vorrq_u8(
            vandq_u8(is_lower, vdupq_n_u8(0x57)),
            vandq_u8(is_upper, vdupq_n_u8(0x37))));

    *nib = vsubq_u8(c, off);
    *bad = vmvnq_u8(vorrq_u8(is_digit, vorrq_u8(is_lower, is_upper)));
}

/*
 * Decode 2*outlen hex chars at 'src' into 'outlen' bytes at 'dst'.
 * Returns 1 on success, 0 if any invalid hex char was seen (in which
 * case the contents of 'dst' are unspecified).
 *
 * NEON kernel processes 32 input chars per iteration:
 *   - vld2q_u8 deinterleaves into a vector of high-nibble chars and a
 *     vector of low-nibble chars
 *   - ascii_to_nibble validates and converts each char to its nibble
 *   - vshlq_n_u8 + vorrq_u8 packs nibble pairs into output bytes
 *   - the per-iteration 'bad' masks are OR-accumulated; we reduce
 *     once at the end with vmaxvq_u8
 *
 * A scalar tail finishes the final (outlen mod 16) output bytes.
 */
int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
    uint8x16_t bad = vdupq_n_u8(0);
    size_t i = 0;

    for (; i + 16 <= outlen; i += 16) {
        uint8x16x2_t pair = vld2q_u8(src + 2 * i);
        uint8x16_t nib_hi, nib_lo, bad_hi, bad_lo;
        ascii_to_nibble(pair.val[0], &nib_hi, &bad_hi);
        ascii_to_nibble(pair.val[1], &nib_lo, &bad_lo);
        uint8x16_t byte = vorrq_u8(vshlq_n_u8(nib_hi, 4), nib_lo);
        vst1q_u8(dst + i, byte);
        bad = vorrq_u8(bad, vorrq_u8(bad_hi, bad_lo));
    }

    /* the tail is decoded even if the vector part already saw bad input */
    int tail_ok = decode_scalar(src, dst, i, outlen);

    return (vmaxvq_u8(bad) == 0) && tail_ok;
}

/* Returns 1: the NEON kernels above are compiled into this build. */
int base16_arm_available(void) {
    return 1;
}

#else

/*
 * Fallbacks for non-aarch64 builds.  Dispatch is expected to be gated
 * on 'base16_arm_available' returning 0, so these are normally not
 * called.  They still implement the full contract with the scalar
 * helpers, so that a mis-dispatched call produces correct output
 * rather than leaving 'dst' unwritten or rejecting valid input.
 */

/* Encode 'l' input bytes at 'src' into 2*l lowercase hex bytes at 'dst'. */
void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
    encode_scalar(src, dst, 0, l);
}

/*
 * Decode 2*outlen hex chars at 'src' into 'outlen' bytes at 'dst'.
 * Returns 1 on success, 0 if any invalid hex char was seen (in which
 * case the contents of 'dst' are unspecified).
 */
int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
    return decode_scalar(src, dst, 0, outlen);
}

/* Returns 0: no NEON kernel is compiled into this build. */
int base16_arm_available(void) {
    return 0;
}

#endif