base16

Fast Haskell base16 encoding/decoding (docs.ppad.tech/base16).
git clone git://git.ppad.tech/base16.git
Log | Files | Refs | README | LICENSE

base16_arm.c (4937B)


      1 #include <stddef.h>
      2 #include <stdint.h>
      3 
      4 #if defined(__aarch64__)
      5 
      6 #include <arm_neon.h>
      7 
      8 /* lowercase ASCII hex character for each nibble value 0..15 */
      9 static const uint8_t hex_lut[16] = {
     10     '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
     11 };
     12 
     13 /*
     14  * Encode 'l' input bytes at 'src' into 2*l ASCII hex bytes at 'dst'.
     15  *
     16  * NEON kernel processes 16 input bytes per iteration:
     17  *   - vshrq_n_u8 / vandq_u8 split the high and low nibbles
     18  *   - vqtbl1q_u8 looks up each nibble in 'hex_lut' (the tbl instr)
     19  *   - vst2q_u8 stores the two char vectors interleaved, producing
     20  *     [h0 l0 h1 l1 ... h15 l15] which is exactly the hex output
     21  *
     22  * A scalar tail finishes the final (l mod 16) bytes.
     23  */
     24 void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
     25     uint8x16_t lut     = vld1q_u8(hex_lut);
     26     uint8x16_t mask_lo = vdupq_n_u8(0x0f);
     27     size_t i = 0;
     28 
     29     for (; i + 16 <= l; i += 16) {
     30         uint8x16_t in     = vld1q_u8(src + i);
     31         uint8x16_t hi     = vshrq_n_u8(in, 4);
     32         uint8x16_t lo     = vandq_u8(in, mask_lo);
     33         uint8x16x2_t pair = { { vqtbl1q_u8(lut, hi),
     34                                 vqtbl1q_u8(lut, lo) } };
     35         vst2q_u8(dst + 2 * i, pair);
     36     }
     37 
     38     for (; i < l; i++) {
     39         uint8_t b = src[i];
     40         dst[2 * i]     = hex_lut[b >> 4];
     41         dst[2 * i + 1] = hex_lut[b & 0x0f];
     42     }
     43 }
     44 
     45 /*
     46  * Convert 16 ASCII hex chars to nibble values (0..15) in 'nib'.
     47  * Each lane of 'bad' is set to 0xff if the corresponding input is
     48  * not a valid hex digit ('0'..'9', 'a'..'f', 'A'..'F'), 0x00 if it
     49  * is.  Case-insensitive.
     50  */
     51 static inline void ascii_to_nibble(uint8x16_t c,
     52                                    uint8x16_t *nib,
     53                                    uint8x16_t *bad) {
     54     uint8x16_t is_digit = vandq_u8(vcgeq_u8(c, vdupq_n_u8('0')),
     55                                     vcleq_u8(c, vdupq_n_u8('9')));
     56     uint8x16_t is_lower = vandq_u8(vcgeq_u8(c, vdupq_n_u8('a')),
     57                                     vcleq_u8(c, vdupq_n_u8('f')));
     58     uint8x16_t is_upper = vandq_u8(vcgeq_u8(c, vdupq_n_u8('A')),
     59                                     vcleq_u8(c, vdupq_n_u8('F')));
     60 
     61     /* offset to subtract from c: '0' (0x30), 'a'-10 (0x57), 'A'-10
     62      * (0x37); zero in lanes that aren't valid hex (the resulting
     63      * nibble in those lanes is garbage but 'bad' flags them).      */
     64     uint8x16_t off = vorrq_u8(
     65         vandq_u8(is_digit, vdupq_n_u8(0x30)),
     66         vorrq_u8(
     67             vandq_u8(is_lower, vdupq_n_u8(0x57)),
     68             vandq_u8(is_upper, vdupq_n_u8(0x37))));
     69 
     70     *nib = vsubq_u8(c, off);
     71     *bad = vmvnq_u8(vorrq_u8(is_digit, vorrq_u8(is_lower, is_upper)));
     72 }
     73 
     74 static inline uint8_t scalar_nib(uint8_t c) {
     75     if (c >= '0' && c <= '9') return (uint8_t)(c - '0');
     76     if (c >= 'a' && c <= 'f') return (uint8_t)(c - 'a' + 10);
     77     if (c >= 'A' && c <= 'F') return (uint8_t)(c - 'A' + 10);
     78     return 0x80;  /* invalid sentinel */
     79 }
     80 
     81 /*
     82  * Decode 2*outlen hex chars at 'src' into 'outlen' bytes at 'dst'.
     83  * Returns 1 on success, 0 if any invalid hex char was seen (in which
     84  * case the contents of 'dst' are unspecified).
     85  *
     86  * NEON kernel processes 32 input chars per iteration:
     87  *   - vld2q_u8 deinterleaves into a vector of high-nibble chars and a
     88  *     vector of low-nibble chars
     89  *   - ascii_to_nibble validates and converts each char to its nibble
     90  *   - vshlq_n_u8 + vorrq_u8 packs nibble pairs into output bytes
     91  *   - the per-iteration 'bad' masks are OR-accumulated; we reduce
     92  *     once at the end with vmaxvq_u8
     93  *
     94  * A scalar tail finishes the final (outlen mod 16) output bytes.
     95  */
     96 int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
     97     uint8x16_t bad = vdupq_n_u8(0);
     98     size_t i = 0;
     99 
    100     for (; i + 16 <= outlen; i += 16) {
    101         uint8x16x2_t pair = vld2q_u8(src + 2 * i);
    102         uint8x16_t nib_hi, nib_lo, bad_hi, bad_lo;
    103         ascii_to_nibble(pair.val[0], &nib_hi, &bad_hi);
    104         ascii_to_nibble(pair.val[1], &nib_lo, &bad_lo);
    105         uint8x16_t byte = vorrq_u8(vshlq_n_u8(nib_hi, 4), nib_lo);
    106         vst1q_u8(dst + i, byte);
    107         bad = vorrq_u8(bad, vorrq_u8(bad_hi, bad_lo));
    108     }
    109 
    110     uint8_t tail_bad = 0;
    111     for (; i < outlen; i++) {
    112         uint8_t n0 = scalar_nib(src[2 * i]);
    113         uint8_t n1 = scalar_nib(src[2 * i + 1]);
    114         tail_bad |= (n0 | n1) & 0x80;
    115         dst[i] = (uint8_t)((n0 << 4) | (n1 & 0x0f));
    116     }
    117 
    118     return (vmaxvq_u8(bad) == 0) && (tail_bad == 0);
    119 }
    120 
    121 int base16_arm_available(void) {
    122     return 1;
    123 }
    124 
    125 #else
    126 
    127 /* stubs for non-aarch64 builds; never reached because dispatch is
    128  * gated on 'base16_arm_available' returning 0                     */
    129 
    130 void base16_encode_arm(const uint8_t *src, uint8_t *dst, size_t l) {
    131     (void)src; (void)dst; (void)l;
    132 }
    133 
    134 int base16_decode_arm(const uint8_t *src, uint8_t *dst, size_t outlen) {
    135     (void)src; (void)dst; (void)outlen;
    136     return 0;
    137 }
    138 
    139 int base16_arm_available(void) {
    140     return 0;
    141 }
    142 
    143 #endif