Spdk/lib/util/base64_neon.c

/*   SPDX-License-Identifier: BSD-2-Clause
 *   Copyright (c) 2005-2007, Nick Galbreath
 *   Copyright (c) 2013-2017, Alfred Klomp
 *   Copyright (c) 2015-2017, Wojciech Mula
 *   Copyright (c) 2016-2017, Matthieu Darbois
 *   All rights reserved.
 */

#ifndef __aarch64__
#error Unsupported hardware
#endif

#include "spdk/stdinc.h"
/*
 * Encoding
 * Use a 64-byte lookup to do the encoding.
 * No NEON-specific encode table is defined in this file: the 64-byte table is
 * passed in by the caller through the enc_table parameter of
 * base64_encode_neon64().
 * NOTE(review): this comment previously read "Reuse existing base64_dec_table
 * and base64_dec_table", presumably meaning the standard and URL-safe *encode*
 * tables defined elsewhere - confirm the intended names.
 *
 * Decoding
 * The input consists of five valid character sets in the Base64 alphabet,
 * which we need to map back to the 6-bit values they represent.
 * There are three ranges, two singles, and then there's the rest.
 *
 * LUT1[0-63] = base64_dec_table_neon64[0-63]
 * LUT2[0-63] = base64_dec_table_neon64[64-127]
 *
 * LUT1 is indexed directly by the input byte. LUT2 is indexed by the input
 * byte minus 63 (saturating at 0), so LUT2[0] is a zero placeholder used for
 * every input byte <= 63 and LUT2[1..63] correspond to input bytes 64..126.
 *
 *   #  From       To        LUT  Characters
 *   1  [0..42]    [255]      #1  invalid input
 *   2  [43]       [62]       #1  +
 *   3  [44..46]   [255]      #1  invalid input
 *   4  [47]       [63]       #1  /
 *   5  [48..57]   [52..61]   #1  0..9
 *   6  [58..63]   [255]      #1  invalid input
 *   7  [64]       [255]      #2  invalid input
 *   8  [65..90]   [0..25]    #2  A..Z
 *   9  [91..96]   [255]      #2  invalid input
 *  10  [97..122]  [26..51]   #2  a..z
 *  11  [123..126] [255]      #2  invalid input
 * (12) Everything else => invalid input
 */
static const uint8_t base64_dec_table_neon64[] = {
	/* LUT1, input bytes 0..15 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* LUT1, input bytes 16..31 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* LUT1, input bytes 32..47: '+' (43) -> 62, '/' (47) -> 63 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255, 255,  63,
	/* LUT1, input bytes 48..63: '0'..'9' -> 52..61 */
	52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,
	/* LUT2 index 0: zero placeholder; index 1: '@' (64) invalid; then 'A'..'N' -> 0..13 */
	0, 255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
	/* LUT2, 'O'..'Z' -> 14..25, then input bytes 91..94 invalid */
	14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255,
	/* LUT2, input bytes 95..96 invalid, then 'a'..'n' -> 26..39 */
	255, 255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
	/* LUT2, 'o'..'z' -> 40..51, then input bytes 123..126 invalid */
	40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255
};

/*
 * Decode table for the URL-safe alphabet ('-' and '_' take the place of
 * '+' and '/'). Layout is the same two-LUT scheme as base64_dec_table_neon64:
 * LUT1 is indexed by the input byte, LUT2 by the input byte minus 63
 * (saturating at 0), so LUT2[0] is a zero placeholder.
 *
 * LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63]
 * LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127]
 *   #  From       To        LUT  Characters
 *   1  [0..44]    [255]      #1  invalid input
 *   2  [45]       [62]       #1  -
 *   3  [46..47]   [255]      #1  invalid input
 *   4  [48..57]   [52..61]   #1  0..9
 *   5  [58..63]   [255]      #1  invalid input
 *   6  [64]       [255]      #2  invalid input
 *   7  [65..90]   [0..25]    #2  A..Z
 *   8  [91..94]   [255]      #2  invalid input
 *   9  [95]       [63]       #2  _
 *  10  [96]       [255]      #2  invalid input
 *  11  [97..122]  [26..51]   #2  a..z
 *  12  [123..126] [255]      #2  invalid input
 * (13) Everything else => invalid input
 */
static const uint8_t base64_urlsafe_dec_table_neon64[] = {
	/* LUT1, input bytes 0..15 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* LUT1, input bytes 16..31 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* LUT1, input bytes 32..47: '-' (45) -> 62 */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255,
	/* LUT1, input bytes 48..63: '0'..'9' -> 52..61 */
	52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 255, 255, 255,
	/* LUT2 index 0: zero placeholder; index 1: '@' (64) invalid; then 'A'..'N' -> 0..13 */
	0, 255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
	/* LUT2, 'O'..'Z' -> 14..25, then input bytes 91..94 invalid */
	14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255,
	/* LUT2, '_' (95) -> 63, '`' (96) invalid, then 'a'..'n' -> 26..39 */
	63, 255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
	/* LUT2, 'o'..'z' -> 40..51, then input bytes 123..126 invalid */
	40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255
};

#include <arm_neon.h>
/*
 * Lane-wise unsigned "greater than" against a scalar: broadcasts n to all 16
 * byte lanes and compares. Each result lane is all-ones where the lane of s
 * is greater than n, and zero otherwise.
 */
#define CMPGT(s,n)      vcgtq_u8((s), vdupq_n_u8(n))

/*
 * Load 64 consecutive bytes starting at p into four 16-byte NEON vectors,
 * in memory order: val[0] = p[0..15], val[1] = p[16..31], val[2] = p[32..47],
 * val[3] = p[48..63]. The result is suitable as the table operand of the
 * 4-register table lookup intrinsics.
 */
static inline uint8x16x4_t
load_64byte_table(const uint8_t *p)
{
	uint8x16x4_t tbl;
	int quarter;

	/* One plain (non-interleaving) 16-byte load per quarter of the table */
	for (quarter = 0; quarter < 4; quarter++) {
		tbl.val[quarter] = vld1q_u8(p + 16 * quarter);
	}

	return tbl;
}

/*
 * Base64-encode as many whole 48-byte groups of input as possible using NEON.
 *
 * dst       - in/out: output cursor; advanced by 64 bytes for every group encoded.
 * enc_table - 64-byte table mapping a 6-bit value (0..63) to its output character.
 * src       - in/out: input cursor; advanced by 48 bytes for every group encoded.
 * src_len   - in/out: number of input bytes remaining; reduced by 48 per group.
 *
 * On return *src_len is less than 48; any such tail is left unprocessed.
 * No padding or terminator is written, and no bounds check is performed on
 * the output buffer.
 */
static void
base64_encode_neon64(char **dst, const char *enc_table, const void **src, size_t *src_len)
{
	const uint8x16x4_t tbl_enc = load_64byte_table((const uint8_t *)enc_table);
	const uint8x16_t mask_6bit = vdupq_n_u8(0x3F);

	/*
	 * Work on typed local cursors: this keeps the input const-qualified and
	 * avoids arithmetic on void pointers, which is a compiler extension.
	 */
	const uint8_t *in = (const uint8_t *)*src;
	uint8_t *out = (uint8_t *)*dst;
	size_t remaining = *src_len;

	while (remaining >= 48) {
		uint8x16x3_t str;
		uint8x16x4_t res;

		/* Load 48 bytes and deinterleave: val[k] holds byte k of each of the 16 triplets */
		str = vld3q_u8(in);

		/* Divide bits of three input bytes over four output bytes and clear top two bits */
		res.val[0] = vshrq_n_u8(str.val[0], 2);
		res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)),
				      mask_6bit);
		res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)),
				      mask_6bit);
		res.val[3] = vandq_u8(str.val[2], mask_6bit);

		/*
		 * The bits have now been shifted to the right locations;
		 * translate their values 0..63 to the Base64 alphabet.
		 * Use a 64-byte table lookup:
		 */
		res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
		res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
		res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
		res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

		/* Interleave and store result */
		vst4q_u8(out, res);

		in += 48;        /* 3 * 16 bytes of input */
		out += 64;       /* 4 * 16 bytes of output */
		remaining -= 48;
	}

	/* Publish the progress made back to the caller */
	*src = in;
	*dst = (char *)out;
	*src_len = remaining;
}

/*
 * Base64-decode as many whole 64-character groups of input as possible using NEON.
 *
 * dst              - in/out: output cursor; advanced by 48 bytes per group decoded.
 * dec_table_neon64 - 128-byte decode table: bytes [0..63] form the first lookup
 *                    table, bytes [64..127] the second.
 * src              - in/out: input cursor; advanced by 64 bytes per group decoded.
 * src_len          - in/out: number of input bytes remaining; reduced by 64 per group.
 *
 * The loop stops when fewer than 64 input bytes remain, or as soon as a group
 * contains a byte that does not decode to a value in 0..63. In the latter case
 * nothing is stored for that group and the cursors are left at its start.
 * No bounds check is performed on the output buffer.
 */
static void
base64_decode_neon64(void **dst, const uint8_t *dec_table_neon64, const uint8_t **src,
		     size_t *src_len)
{
	/*
	 * First LUT tbl_dec1 will use VTBL instruction (out of range indices are set to 0 in destination).
	 * Second LUT tbl_dec2 will use VTBX instruction (out of range indices will be unchanged in destination).
	 * Input [64..126] will be mapped to index [1..63] in tbl_dec2. Index 0 means that value comes from tbl_dec1.
	 */
	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64);
	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64);
	const uint8x16_t offset = vdupq_n_u8(63U);

	/*
	 * Work on typed local cursors: this keeps the input const-qualified and
	 * avoids arithmetic on void pointers, which is a compiler extension.
	 */
	const uint8_t *in = *src;
	uint8_t *out = (uint8_t *)*dst;
	size_t remaining = *src_len;

	while (remaining >= 64) {

		uint8x16x4_t dec1, dec2;
		uint8x16x3_t dec;

		/* Load 64 bytes and deinterleave */
		uint8x16x4_t str = vld4q_u8(in);

		/* Get indices for 2nd LUT (saturating subtract: inputs <= 63 become index 0) */
		dec2.val[0] = vqsubq_u8(str.val[0], offset);
		dec2.val[1] = vqsubq_u8(str.val[1], offset);
		dec2.val[2] = vqsubq_u8(str.val[2], offset);
		dec2.val[3] = vqsubq_u8(str.val[3], offset);

		/* Get values from 1st LUT */
		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);

		/*
		 * Get values from 2nd LUT. Indices >= 64 (input >= 127) are out of range
		 * and keep the index itself, which is > 63 and therefore flagged below.
		 */
		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);

		/* Get final values */
		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);

		/* Check for invalid input, any value larger than 63 */
		uint8x16_t classified = CMPGT(str.val[0], 63);
		classified = vorrq_u8(classified, CMPGT(str.val[1], 63));
		classified = vorrq_u8(classified, CMPGT(str.val[2], 63));
		classified = vorrq_u8(classified, CMPGT(str.val[3], 63));

		/* check that all bits are zero */
		if (vmaxvq_u8(classified) != 0U) {
			break;
		}

		/* Compress four bytes into three */
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		/* Interleave and store decoded result */
		vst3q_u8(out, dec);

		in += 64;        /* 4 * 16 characters of input */
		out += 48;       /* 3 * 16 bytes of output */
		remaining -= 64;
	}

	/* Publish the progress made back to the caller (also on the invalid-input exit) */
	*src = in;
	*dst = out;
	*src_len = remaining;
}
Use SPDX license identifiers in remaining files. There are a few places we can replace existing license text with SPDX license identifiers, that did not match the auto-replacement script in the previous patch. Make those replacements manually in this patch instead. Signed-off-by: Jim Harris <james.r.harris@intel.com> Change-Id: I258720c03bc2153d1c56a8adf6357f224b911c0b Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12913 Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com> Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com> Reviewed-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-by: Dong Yi <dongx.yi@intel.com> Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com> 2022-06-06 18:03:13 +00:00			`/* SPDX-License-Identifier: BSD-2-Clause`
util: optimize base64 with Arm NEON Algorithm and some code from: https://github.com/aklomp/base64 Get ~2.3x speedup for encoding and ~1.7x speedup for decoding on AArch64. Signed-off-by: Richael Zhuang <richael.zhuang@arm.com> Change-Id: Ifce07299aea722337b0b4886117d1f616c5c03ef Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/465733 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> 2019-08-20 02:58:18 +00:00			`* Copyright (c) 2005-2007, Nick Galbreath`
			`* Copyright (c) 2013-2017, Alfred Klomp`
			`* Copyright (c) 2015-2017, Wojciech Mula`
			`* Copyright (c) 2016-2017, Matthieu Darbois`
			`* All rights reserved.`
			`*/`

			`#ifndef __aarch64__`
			`#error Unsupported hardware`
			`#endif`

			`#include "spdk/stdinc.h"`
			`/*`
			`* Encoding`
			`* Use a 64-byte lookup to do the encoding.`
			`* Reuse existing base64_dec_table and base64_dec_table.`

			`* Decoding`
			`* The input consists of five valid character sets in the Base64 alphabet,`
			`* which we need to map back to the 6-bit values they represent.`
			`* There are three ranges, two singles, and then there's the rest.`
			`*`
			`* LUT1[0-63] = base64_dec_table_neon64[0-63]`
			`* LUT2[0-63] = base64_dec_table_neon64[64-127]`
			`* # From To LUT Characters`
			`* 1 [0..42] [255] #1 invalid input`
			`* 2 [43] [62] #1 +`
			`* 3 [44..46] [255] #1 invalid input`
			`* 4 [47] [63] #1 /`
			`* 5 [48..57] [52..61] #1 0..9`
			`* 6 [58..63] [255] #1 invalid input`
			`* 7 [64] [255] #2 invalid input`
			`* 8 [65..90] [0..25] #2 A..Z`
			`* 9 [91..96] [255] #2 invalid input`
			`* 10 [97..122] [26..51] #2 a..z`
			`* 11 [123..126] [255] #2 invalid input`
			`* (12) Everything else => invalid input`
			`*/`
			`static const uint8_t base64_dec_table_neon64[] = {`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,`
			`52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,`
			`0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,`
			`14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255,`
			`255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,`
			`40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255`
			`};`

			`/*`
			`* LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63]`
			`* LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127]`
			`* # From To LUT Characters`
			`* 1 [0..44] [255] #1 invalid input`
			`* 2 [45] [62] #1 -`
			`* 3 [46..47] [255] #1 invalid input`
			`* 5 [48..57] [52..61] #1 0..9`
			`* 6 [58..63] [255] #1 invalid input`
			`* 7 [64] [255] #2 invalid input`
			`* 8 [65..90] [0..25] #2 A..Z`
			`* 9 [91..94] [255] #2 invalid input`
			`* 10 [95] [63] #2 _`
			`* 11 [96] [255] #2 invalid input`
			`* 12 [97..122] [26..51] #2 a..z`
			`* 13 [123..126] [255] #2 invalid input`
			`* (14) Everything else => invalid input`
			`*/`
			`static const uint8_t base64_urlsafe_dec_table_neon64[] = {`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,`
			`255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255,`
			`52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,`
			`0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,`
			`14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255,`
			`63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,`
			`40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255`
			`};`

			`#include <arm_neon.h>`
			`#define CMPGT(s,n) vcgtq_u8((s), vdupq_n_u8(n))`

			`static inline uint8x16x4_t`
			`load_64byte_table(const uint8_t *p)`
			`{`
			`uint8x16x4_t ret;`
			`ret.val[0] = vld1q_u8(p + 0);`
			`ret.val[1] = vld1q_u8(p + 16);`
			`ret.val[2] = vld1q_u8(p + 32);`
			`ret.val[3] = vld1q_u8(p + 48);`
			`return ret;`
			`}`

			`static void`
lib/util: remove _spdk prefix from functions. Signed-off-by: Seth Howell <seth.howell@intel.com> Change-Id: Id87b6eae46e7503796904676edfa22d821673a9a Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/2462 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> 2020-05-15 16:26:28 +00:00			`base64_encode_neon64(char *dst, const char enc_table, const void *src, size_t src_len)`
util: optimize base64 with Arm NEON Algorithm and some code from: https://github.com/aklomp/base64 Get ~2.3x speedup for encoding and ~1.7x speedup for decoding on AArch64. Signed-off-by: Richael Zhuang <richael.zhuang@arm.com> Change-Id: Ifce07299aea722337b0b4886117d1f616c5c03ef Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/465733 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> 2019-08-20 02:58:18 +00:00			`{`
			`const uint8x16x4_t tbl_enc = load_64byte_table(enc_table);`

			`while (*src_len >= 48) {`
			`uint8x16x3_t str;`
			`uint8x16x4_t res;`

			`/* Load 48 bytes and deinterleave */`
			`str = vld3q_u8((uint8_t )src);`

			`/* Divide bits of three input bytes over four output bytes and clear top two bits */`
			`res.val[0] = vshrq_n_u8(str.val[0], 2);`
			`res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)),`
			`vdupq_n_u8(0x3F));`
			`res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)),`
			`vdupq_n_u8(0x3F));`
			`res.val[3] = vandq_u8(str.val[2], vdupq_n_u8(0x3F));`

			`/*`
			`* The bits have now been shifted to the right locations;`
			`* translate their values 0..63 to the Base64 alphabet.`
			`* Use a 64-byte table lookup:`
			`*/`
			`res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);`
			`res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);`
			`res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);`
			`res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);`

			`/* Interleave and store result */`
			`vst4q_u8((uint8_t )dst, res);`

			`src += 48; / 3 * 16 bytes of input */`
			`dst += 64; / 4 * 16 bytes of output */`
			`*src_len -= 48;`
			`}`
			`}`

			`static void`
lib/util: remove _spdk prefix from functions. Signed-off-by: Seth Howell <seth.howell@intel.com> Change-Id: Id87b6eae46e7503796904676edfa22d821673a9a Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/2462 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com> 2020-05-15 16:26:28 +00:00			`base64_decode_neon64(void *dst, const uint8_t dec_table_neon64, const uint8_t **src,`
			`size_t *src_len)`
util: optimize base64 with Arm NEON Algorithm and some code from: https://github.com/aklomp/base64 Get ~2.3x speedup for encoding and ~1.7x speedup for decoding on AArch64. Signed-off-by: Richael Zhuang <richael.zhuang@arm.com> Change-Id: Ifce07299aea722337b0b4886117d1f616c5c03ef Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/465733 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> 2019-08-20 02:58:18 +00:00			`{`
			`/*`
			`* First LUT tbl_dec1 will use VTBL instruction (out of range indices are set to 0 in destination).`
			`* Second LUT tbl_dec2 will use VTBX instruction (out of range indices will be unchanged in destination).`
			`* Input [64..126] will be mapped to index [1..63] in tb1_dec2. Index 0 means that value comes from tb1_dec1.`
			`*/`
			`const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64);`
			`const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64);`
			`const uint8x16_t offset = vdupq_n_u8(63U);`

			`while (*src_len >= 64) {`

			`uint8x16x4_t dec1, dec2;`
			`uint8x16x3_t dec;`

			`/* Load 64 bytes and deinterleave */`
			`uint8x16x4_t str = vld4q_u8((uint8_t )src);`

			`/* Get indices for 2nd LUT */`
			`dec2.val[0] = vqsubq_u8(str.val[0], offset);`
			`dec2.val[1] = vqsubq_u8(str.val[1], offset);`
			`dec2.val[2] = vqsubq_u8(str.val[2], offset);`
			`dec2.val[3] = vqsubq_u8(str.val[3], offset);`

			`/* Get values from 1st LUT */`
			`dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);`
			`dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);`
			`dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);`
			`dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);`

			`/* Get values from 2nd LUT */`
			`dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);`
			`dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);`
			`dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);`
			`dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);`

			`/* Get final values */`
			`str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);`
			`str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);`
			`str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);`
			`str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);`

			`/* Check for invalid input, any value larger than 63 */`
			`uint8x16_t classified = CMPGT(str.val[0], 63);`
			`classified = vorrq_u8(classified, CMPGT(str.val[1], 63));`
			`classified = vorrq_u8(classified, CMPGT(str.val[2], 63));`
			`classified = vorrq_u8(classified, CMPGT(str.val[3], 63));`

			`/* check that all bits are zero */`
			`if (vmaxvq_u8(classified) != 0U) {`
			`break;`
			`}`

			`/* Compress four bytes into three */`
			`dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));`
			`dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));`
			`dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);`

			`/* Interleave and store decoded result */`
			`vst3q_u8((uint8_t )dst, dec);`

			`*src += 64;`
			`*dst += 48;`
			`*src_len -= 64;`
			`}`
			`}`