per Intel policy to include file commit date using git cmd below. The policy does not apply to non-Intel (C) notices. git log --follow -C90% --format=%ad --date default <file> | tail -1 and then pull just the 4 digit year from the result. Intel copyrights were not added to files where Intel either had no contribution or the contribution lacked substance (i.e. license header updates, formatting changes, etc). Contribution date used "--follow -C95%" to get the most accurate date. Note that several files in this patch didn't end the license/(c) block with a blank comment line so these were added as the vast majority of files do have this last blank line. Simply there for consistency. Signed-off-by: paul luse <paul.e.luse@intel.com> Change-Id: Id5b7ce4f658fe87132f14139ead58d6e285c04d4 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15192 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Community-CI: Mellanox Build Bot
474 lines
16 KiB
C
474 lines
16 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright (C) 2022 Intel Corporation.
|
|
* Copyright(c) ARM Limited. 2021 All rights reserved.
|
|
*/
|
|
|
|
#ifndef __aarch64__
|
|
#error Unsupported hardware
|
|
#endif
|
|
|
|
#include "spdk/stdinc.h"
|
|
#include <arm_sve.h>
|
|
|
|
static int
|
|
table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
|
|
svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
|
|
svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
|
|
{
|
|
svuint8_t res2, res3, res4, res5, res6, res7;
|
|
|
|
/*
|
|
* In base64 decode table, the first 32 elements are invalid value,
|
|
* so skip tbl_vec0 and tbl_vec1
|
|
*/
|
|
indices = svsub_n_u8_z(p8_in, indices, 2 * vl);
|
|
res2 = svtbl_u8(tbl_vec2, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res3 = svtbl_u8(tbl_vec3, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res4 = svtbl_u8(tbl_vec4, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res5 = svtbl_u8(tbl_vec5, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res6 = svtbl_u8(tbl_vec6, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res7 = svtbl_u8(tbl_vec7, indices);
|
|
|
|
*output = svdup_n_u8(0);
|
|
*output = svadd_u8_z(p8_in, res2, *output);
|
|
*output = svadd_u8_z(p8_in, res3, *output);
|
|
*output = svadd_u8_z(p8_in, res4, *output);
|
|
*output = svadd_u8_z(p8_in, res5, *output);
|
|
*output = svadd_u8_z(p8_in, res6, *output);
|
|
*output = svadd_u8_z(p8_in, res7, *output);
|
|
|
|
if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
|
|
svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
|
|
{
|
|
svuint8_t res0, res1, res2, res3;
|
|
|
|
res0 = svtbl_u8(tbl_vec0, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res1 = svtbl_u8(tbl_vec1, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res2 = svtbl_u8(tbl_vec2, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res3 = svtbl_u8(tbl_vec3, indices);
|
|
|
|
*output = svdup_n_u8(0);
|
|
|
|
*output = svadd_u8_z(p8_in, res0, *output);
|
|
*output = svadd_u8_z(p8_in, res1, *output);
|
|
*output = svadd_u8_z(p8_in, res2, *output);
|
|
*output = svadd_u8_z(p8_in, res3, *output);
|
|
|
|
if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
|
|
svuint8_t *output, svbool_t p8_in, uint64_t vl)
|
|
{
|
|
svuint8_t res0, res1, res2;
|
|
|
|
res0 = svtbl_u8(tbl_vec0, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res1 = svtbl_u8(tbl_vec1, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res2 = svtbl_u8(tbl_vec2, indices);
|
|
|
|
*output = svdup_n_u8(0);
|
|
|
|
*output = svadd_u8_z(p8_in, res0, *output);
|
|
*output = svadd_u8_z(p8_in, res1, *output);
|
|
*output = svadd_u8_z(p8_in, res2, *output);
|
|
|
|
if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
|
|
svbool_t p8_in, uint64_t vl)
|
|
{
|
|
svuint8_t res0, res1;
|
|
|
|
res0 = svtbl_u8(tbl_vec0, indices);
|
|
indices = svsub_n_u8_z(p8_in, indices, vl);
|
|
res1 = svtbl_u8(tbl_vec1, indices);
|
|
|
|
*output = svdup_n_u8(0);
|
|
|
|
*output = svadd_u8_z(p8_in, res0, *output);
|
|
*output = svadd_u8_z(p8_in, res1, *output);
|
|
|
|
if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline void
|
|
convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
|
|
svuint8_t *temp2, svuint8_t *temp3)
|
|
{
|
|
svuint8_t str0, str1, str2;
|
|
svuint8x3_t ld_enc_input;
|
|
|
|
ld_enc_input = svld3_u8(pred, src);
|
|
|
|
str0 = svget3_u8(ld_enc_input, 0);
|
|
str1 = svget3_u8(ld_enc_input, 1);
|
|
str2 = svget3_u8(ld_enc_input, 2);
|
|
|
|
|
|
*temp0 = svlsr_n_u8_z(pred, str0, 2);
|
|
*temp1 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4), svlsl_n_u8_z(pred, str0,
|
|
4)),
|
|
svdup_u8(0x3F));
|
|
*temp2 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6), svlsl_n_u8_z(pred, str1,
|
|
2)),
|
|
svdup_u8(0x3F));
|
|
*temp3 = svand_u8_z(pred, str2, svdup_u8(0x3F));
|
|
}
|
|
|
|
static inline void
|
|
convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
|
|
svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
|
|
{
|
|
*output0 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp0, 2), svlsr_n_u8_z(pred, temp1, 4));
|
|
*output1 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp1, 4), svlsr_n_u8_z(pred, temp2, 2));
|
|
*output2 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp2, 6), temp3);
|
|
}
|
|
|
|
static void
|
|
base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
|
|
{
|
|
uint64_t vl = svcntb();
|
|
svuint8_t temp0, temp1, temp2, temp3;
|
|
svuint8_t output0, output1, output2, output3;
|
|
svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3;
|
|
svuint8x4_t st_enc_output;
|
|
svbool_t p8_all = svptrue_b8();
|
|
svbool_t pred;
|
|
uint64_t i = 0;
|
|
uint64_t pred_count = 0;
|
|
uint64_t N = (*src_len / 3) * 3;
|
|
|
|
if (vl == 16) {
|
|
|
|
tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
|
|
tbl_enc1 = svld1_u8(p8_all, (uint8_t *)enc_table + 16);
|
|
tbl_enc2 = svld1_u8(p8_all, (uint8_t *)enc_table + 32);
|
|
tbl_enc3 = svld1_u8(p8_all, (uint8_t *)enc_table + 48);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 3, N / 3);
|
|
|
|
convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
|
|
|
|
table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp0, &output0, pred, vl);
|
|
table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp1, &output1, pred, vl);
|
|
table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp2, &output2, pred, vl);
|
|
table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp3, &output3, pred, vl);
|
|
|
|
st_enc_output = svcreate4_u8(output0, output1, output2, output3);
|
|
svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 3;
|
|
*dst += pred_count * 4;
|
|
*src_len -= pred_count * 3;
|
|
i += pred_count * 3;
|
|
|
|
}
|
|
} else if (vl == 32 || vl == 48) {
|
|
|
|
tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
|
|
pred = svwhilelt_b8(vl, (uint64_t)64);
|
|
tbl_enc1 = svld1_u8(pred, (uint8_t *)enc_table + vl);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 3, N / 3);
|
|
|
|
convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
|
|
|
|
table_lookup_2vec(tbl_enc0, tbl_enc1, temp0, &output0, pred, vl);
|
|
table_lookup_2vec(tbl_enc0, tbl_enc1, temp1, &output1, pred, vl);
|
|
table_lookup_2vec(tbl_enc0, tbl_enc1, temp2, &output2, pred, vl);
|
|
table_lookup_2vec(tbl_enc0, tbl_enc1, temp3, &output3, pred, vl);
|
|
|
|
st_enc_output = svcreate4_u8(output0, output1, output2, output3);
|
|
svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 3;
|
|
*dst += pred_count * 4;
|
|
*src_len -= pred_count * 3;
|
|
i += pred_count * 3;
|
|
|
|
}
|
|
} else if (vl >= 64) {
|
|
|
|
pred = svwhilelt_b8((uint64_t)0, (uint64_t)64);
|
|
tbl_enc0 = svld1_u8(pred, (uint8_t *)enc_table);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 3, N / 3);
|
|
|
|
convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
|
|
|
|
output0 = svtbl_u8(tbl_enc0, temp0);
|
|
output1 = svtbl_u8(tbl_enc0, temp1);
|
|
output2 = svtbl_u8(tbl_enc0, temp2);
|
|
output3 = svtbl_u8(tbl_enc0, temp3);
|
|
|
|
st_enc_output = svcreate4_u8(output0, output1, output2, output3);
|
|
svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 3;
|
|
*dst += pred_count * 4;
|
|
*src_len -= pred_count * 3;
|
|
i += pred_count * 3;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
|
|
{
|
|
uint64_t vl = svcntb();
|
|
svuint8_t str0, str1, str2, str3;
|
|
svuint8_t temp0, temp1, temp2, temp3;
|
|
svuint8_t output0, output1, output2;
|
|
svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7;
|
|
svuint8x3_t st_dec_output;
|
|
svbool_t p8_all = svptrue_b8();
|
|
svbool_t pred;
|
|
uint64_t i = 0;
|
|
uint64_t pred_count = 0;
|
|
uint64_t N = (*src_len / 4) * 4;
|
|
svuint8x4_t ld_dec_input;
|
|
|
|
if (vl == 16) {
|
|
tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
|
|
tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + 16);
|
|
tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + 32);
|
|
tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + 48);
|
|
tbl_dec4 = svld1_u8(p8_all, (uint8_t *)dec_table + 64);
|
|
tbl_dec5 = svld1_u8(p8_all, (uint8_t *)dec_table + 80);
|
|
tbl_dec6 = svld1_u8(p8_all, (uint8_t *)dec_table + 96);
|
|
tbl_dec7 = svld1_u8(p8_all, (uint8_t *)dec_table + 112);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 4, N / 4);
|
|
|
|
ld_dec_input = svld4_u8(pred, *src);
|
|
|
|
str0 = svget4_u8(ld_dec_input, 0);
|
|
str1 = svget4_u8(ld_dec_input, 1);
|
|
str2 = svget4_u8(ld_dec_input, 2);
|
|
str3 = svget4_u8(ld_dec_input, 3);
|
|
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
|
|
|
|
if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
|
|
tbl_dec7, str0, &temp0, pred, vl)) { return; }
|
|
if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
|
|
tbl_dec7, str1, &temp1, pred, vl)) { return; }
|
|
if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
|
|
tbl_dec7, str2, &temp2, pred, vl)) { return; }
|
|
if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
|
|
tbl_dec7, str3, &temp3, pred, vl)) { return; }
|
|
|
|
convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
|
|
|
|
st_dec_output = svcreate3_u8(output0, output1, output2);
|
|
svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 4;
|
|
*dst += pred_count * 3;
|
|
*src_len -= pred_count * 4;
|
|
i += pred_count * 4;
|
|
|
|
}
|
|
} else if (vl == 32) {
|
|
tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
|
|
tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
|
|
tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 2);
|
|
tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 3);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 4, N / 4);
|
|
|
|
ld_dec_input = svld4_u8(pred, *src);
|
|
|
|
str0 = svget4_u8(ld_dec_input, 0);
|
|
str1 = svget4_u8(ld_dec_input, 1);
|
|
str2 = svget4_u8(ld_dec_input, 2);
|
|
str3 = svget4_u8(ld_dec_input, 3);
|
|
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
|
|
|
|
if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &temp0, pred, vl)) { return; }
|
|
if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &temp1, pred, vl)) { return; }
|
|
if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &temp2, pred, vl)) { return; }
|
|
if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &temp3, pred, vl)) { return; }
|
|
|
|
convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
|
|
|
|
st_dec_output = svcreate3_u8(output0, output1, output2);
|
|
svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 4;
|
|
*dst += pred_count * 3;
|
|
*src_len -= pred_count * 4;
|
|
i += pred_count * 4;
|
|
|
|
}
|
|
|
|
} else if (vl == 48) {
|
|
tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
|
|
tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
|
|
pred = svwhilelt_b8(vl * 2, (uint64_t)128);
|
|
tbl_dec2 = svld1_u8(pred, (uint8_t *)dec_table + 2 * vl);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 4, N / 4);
|
|
|
|
ld_dec_input = svld4_u8(pred, *src);
|
|
|
|
str0 = svget4_u8(ld_dec_input, 0);
|
|
str1 = svget4_u8(ld_dec_input, 1);
|
|
str2 = svget4_u8(ld_dec_input, 2);
|
|
str3 = svget4_u8(ld_dec_input, 3);
|
|
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
|
|
|
|
if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &temp0, pred, vl)) { return; }
|
|
if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &temp1, pred, vl)) { return; }
|
|
if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &temp2, pred, vl)) { return; }
|
|
if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &temp3, pred, vl)) { return; }
|
|
|
|
convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
|
|
|
|
st_dec_output = svcreate3_u8(output0, output1, output2);
|
|
svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 4;
|
|
*dst += pred_count * 3;
|
|
*src_len -= pred_count * 4;
|
|
i += pred_count * 4;
|
|
|
|
}
|
|
} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
|
|
tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
|
|
pred = svwhilelt_b8(vl, (uint64_t)128);
|
|
tbl_dec1 = svld1_u8(pred, (uint8_t *)dec_table + vl);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 4, N / 4);
|
|
|
|
ld_dec_input = svld4_u8(pred, *src);
|
|
|
|
str0 = svget4_u8(ld_dec_input, 0);
|
|
str1 = svget4_u8(ld_dec_input, 1);
|
|
str2 = svget4_u8(ld_dec_input, 2);
|
|
str3 = svget4_u8(ld_dec_input, 3);
|
|
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
|
|
|
|
if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &temp0, pred, vl)) { return; }
|
|
if (table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &temp1, pred, vl)) { return; }
|
|
if (table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &temp2, pred, vl)) { return; }
|
|
if (table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &temp3, pred, vl)) { return; }
|
|
|
|
convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
|
|
|
|
st_dec_output = svcreate3_u8(output0, output1, output2);
|
|
svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 4;
|
|
*dst += pred_count * 3;
|
|
*src_len -= pred_count * 4;
|
|
i += pred_count * 4;
|
|
|
|
}
|
|
} else if (vl >= 128) {
|
|
pred = svwhilelt_b8((uint64_t)0, (uint64_t)128);
|
|
tbl_dec0 = svld1_u8(pred, (uint8_t *)dec_table + 0);
|
|
|
|
while (i < N) {
|
|
pred = svwhilelt_b8(i / 4, N / 4);
|
|
|
|
ld_dec_input = svld4_u8(pred, *src);
|
|
|
|
str0 = svget4_u8(ld_dec_input, 0);
|
|
str1 = svget4_u8(ld_dec_input, 1);
|
|
str2 = svget4_u8(ld_dec_input, 2);
|
|
str3 = svget4_u8(ld_dec_input, 3);
|
|
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
|
|
if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
|
|
|
|
temp0 = svtbl_u8(tbl_dec0, str0);
|
|
temp1 = svtbl_u8(tbl_dec0, str1);
|
|
temp2 = svtbl_u8(tbl_dec0, str2);
|
|
temp3 = svtbl_u8(tbl_dec0, str3);
|
|
|
|
if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp0, 255))) { return; }
|
|
if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp1, 255))) { return; }
|
|
if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp2, 255))) { return; }
|
|
if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp3, 255))) { return; }
|
|
|
|
convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
|
|
|
|
st_dec_output = svcreate3_u8(output0, output1, output2);
|
|
svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
|
|
|
|
pred_count = svcntp_b8(pred, pred);
|
|
*src += pred_count * 4;
|
|
*dst += pred_count * 3;
|
|
*src_len -= pred_count * 4;
|
|
i += pred_count * 4;
|
|
|
|
}
|
|
}
|
|
}
|