From: Heiko Stuebner <heiko.stuebner@...>
Date: Wed, 19 Oct 2022 15:36:22 +0200
Add handling for the Zbb extension and add support for using it to
provide optimized variants of the string functions.
Support for the Zbb-str-variants is limited to the GNU-assembler
for now, as LLVM has not yet acquired the functionality to
selectively change the arch option in assembler code.
This is still under review at
https://reviews.llvm.org/D123515
Co-developed-by: Christoph Muellner <christoph.muellner@...>
Signed-off-by: Christoph Muellner <christoph.muellner@...>
Signed-off-by: Heiko Stuebner <heiko.stuebner@...>
arch/riscv/Kconfig | 24 ++++++
arch/riscv/include/asm/errata_list.h | 3 +-
arch/riscv/include/asm/hwcap.h | 1 +
arch/riscv/include/asm/string.h | 2 +
arch/riscv/kernel/cpu.c | 1 +
arch/riscv/kernel/cpufeature.c | 18 +++++
arch/riscv/lib/strcmp.S | 94 ++++++++++++++++++++++
arch/riscv/lib/strlen.S | 114 +++++++++++++++++++++++++++
arch/riscv/lib/strncmp.S | 111 ++++++++++++++++++++++++++
9 files changed, 367 insertions(+), 1 deletion(-)
@@ -416,6 +416,30 @@ config RISCV_ISA_SVPBMT
If you don't know what to do here, say Y.
+# Hidden (promptless) symbol: y only when the toolchain can build the Zbb
+# code paths. RISCV_ISA_ZBB depends on it.
+config TOOLCHAIN_HAS_ZBB
+ bool
+ default y
+ # The compiler must accept a -march string containing _zbb for the
+ # configured word size (only the matching 64BIT/32BIT line is effective).
+ depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
+ depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
+ # A sufficiently recent linker (LLD or GNU ld) is required.
+ depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
+ # GNU assembler only for now: the Zbb string routines switch the arch
+ # option inside assembler code (".option arch,+zbb"), which LLVM's
+ # assembler cannot do yet.
+ depends on AS_IS_GNU
+
+# User-visible switch for run-time detection and use of the Zbb extension.
+config RISCV_ISA_ZBB
+ bool "Zbb extension support for bit manipulation instructions"
+ depends on TOOLCHAIN_HAS_ZBB
+ # NOTE(review): presumably excluded for XIP and no-MMU kernels because
+ # the Zbb paths are enabled by patching code at boot - confirm.
+ depends on !XIP_KERNEL && MMU
+ # The Zbb code paths are switched in through the alternatives mechanism.
+ select RISCV_ALTERNATIVE
+ default y
+ help
+ Adds support to dynamically detect the presence of the ZBB
+ extension (basic bit manipulation) and enable its usage.
+
+ The Zbb extension provides instructions to accelerate a number
+ of bit-specific operations (count bit population, sign extending,
+ bitrotation, etc).
+
+ If you don't know what to do here, say Y.
+
config TOOLCHAIN_HAS_ZICBOM
bool
default y
@@ -24,7 +24,8 @@
#define CPUFEATURE_SVPBMT 0
#define CPUFEATURE_ZICBOM 1
-#define CPUFEATURE_NUMBER 2
+#define CPUFEATURE_ZBB 2
+#define CPUFEATURE_NUMBER 3
#ifdef __ASSEMBLY__
@@ -59,6 +59,7 @@ enum riscv_isa_ext_id {
RISCV_ISA_EXT_ZIHINTPAUSE,
RISCV_ISA_EXT_SSTC,
RISCV_ISA_EXT_SVINVAL,
+ RISCV_ISA_EXT_ZBB,
RISCV_ISA_EXT_ID_MAX
};
static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX);
@@ -6,6 +6,8 @@
#ifndef _ASM_RISCV_STRING_H
#define _ASM_RISCV_STRING_H
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
#include <linux/types.h>
#include <linux/linkage.h>
@@ -162,6 +162,7 @@ arch_initcall(riscv_cpuinfo_init);
* extensions by an underscore.
*/
static struct riscv_isa_ext_data isa_ext_arr[] = {
+ __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
@@ -222,6 +222,7 @@ void __init riscv_fill_hwcap(void)
set_bit(nr, this_isa);
}
} else {
+ SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF);
SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
@@ -301,6 +302,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
return true;
}
+/*
+ * Decide whether the Zbb alternatives should be applied at this patching
+ * stage.
+ *
+ * Returns true only when all of the following hold, checked in this order:
+ *  - CONFIG_RISCV_ISA_ZBB is enabled,
+ *  - @stage is not RISCV_ALTERNATIVES_EARLY_BOOT,
+ *  - riscv_isa_extension_available(NULL, ZBB) reports the extension.
+ */
+static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
+{
+	return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	       stage != RISCV_ALTERNATIVES_EARLY_BOOT &&
+	       riscv_isa_extension_available(NULL, ZBB);
+}
+
/*
* Probe presence of individual extensions.
*
@@ -318,6 +333,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
if (cpufeature_probe_zicbom(stage))
cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
+ if (cpufeature_probe_zbb(stage))
+ cpu_req_feature |= BIT(CPUFEATURE_ZBB);
+
return cpu_req_feature;
}
@@ -3,9 +3,14 @@
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
/* int strcmp(const char *cs, const char *ct) */
SYM_FUNC_START(strcmp)
+
+ ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
/*
* Returns
* a0 - comparison result, value like strcmp
@@ -34,4 +39,93 @@ SYM_FUNC_START(strcmp)
bnez t1, 1b
li a0, 0
j 2b
+
+/*
+ * Variant of strcmp using the ZBB extension if available
+ *
+ * Entered through the "j variant_zbb" that the ALTERNATIVE at the top of
+ * strcmp patches in when CPUFEATURE_ZBB is set.
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+variant_zbb:
+/* Register aliases. src1 and result share a0. */
+#define src1 a0
+#define result a0
+#define src2 t5
+#define data1 t0
+#define data2 t1
+#define align t2
+#define data1_orcb t3
+#define m1 t4
+
+/* Enable Zbb instructions for this section only; undone by ".option pop". */
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4, t5
+ */
+ mv src2, a1
+
+ /*
+ * align becomes non-zero if either pointer is not SZREG-aligned; in
+ * that case use the byte-wise loop. m1 is the all-ones pattern that
+ * orc.b produces for a word without any NUL byte.
+ */
+ or align, src1, src2
+ li m1, -1
+ and align, align, SZREG-1
+ bnez align, 3f
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ /*
+ * orc.b turns every non-zero byte into 0xff and every zero byte into
+ * 0x00, so any result other than all-ones means data1 holds a NUL.
+ */
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, 2f
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+
+ /*
+ * Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence:
+ * sltu yields 1 or 0, neg turns that into -1 or 0, and or-ing in
+ * bit 0 gives -1 or 1.
+ */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ * The pointers have not been advanced past this word yet, so the
+ * simple loop re-reads it byte by byte.
+ */
+ bne data1, data2, 3f
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /*
+ * Simple loop for misaligned strings: compare one byte at a time until
+ * the bytes differ or the byte from string1 is NUL.
+ */
+ .p2align 3
+3:
+ lbu data1, 0(src1)
+ lbu data2, 0(src2)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ bne data1, data2, 4f
+ bnez data1, 3b
+
+4:
+ /* Difference of the last bytes read; zero when both were NUL. */
+ sub result, data1, data2
+ ret
+
+.option pop
+#endif
SYM_FUNC_END(strcmp)
@@ -3,9 +3,14 @@
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
/* int strlen(const char *s) */
SYM_FUNC_START(strlen)
+
+ ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
/*
* Returns
* a0 - string length
@@ -25,4 +30,113 @@ SYM_FUNC_START(strlen)
2:
addi t1, t1, 1
j 1b
+
+/*
+ * Variant of strlen using the ZBB extension if available
+ *
+ * Entered through the "j variant_zbb" that the ALTERNATIVE at the top of
+ * strlen patches in when CPUFEATURE_ZBB is set.
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+variant_zbb:
+
+/*
+ * Register aliases. Some registers serve two roles one after the other:
+ * t2 is offset, then offset_bits, then offset again; t3 is valid_bytes
+ * and later m1.
+ */
+#define src a0
+#define result a0
+#define addr t0
+#define data t1
+#define offset t2
+#define offset_bits t2
+#define valid_bytes t3
+#define m1 t3
+
+/*
+ * Pick the zero-count and shift direction that walk the bytes of a loaded
+ * word in memory order for the configured endianness.
+ */
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ clz
+# define SHIFT sll
+#else
+# define CZ ctz
+# define SHIFT srl
+#endif
+
+/* Enable Zbb instructions for this section only; undone by ".option pop". */
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - string length
+ *
+ * Parameters
+ * a0 - String to measure
+ *
+ * Clobbers
+ * t0, t1, t2, t3
+ */
+
+ /* Number of irrelevant bytes in the first word. */
+ andi offset, src, SZREG-1
+
+ /* Align pointer. */
+ andi addr, src, -SZREG
+
+ li valid_bytes, SZREG
+ sub valid_bytes, valid_bytes, offset
+
+ /*
+ * Convert the byte offset into a bit count for the shift below.
+ * A byte is 8 bits regardless of the register width, so the shift
+ * amount is the constant 3 and must not be derived from the pointer
+ * size.
+ */
+ slli offset_bits, offset, 3
+
+ /* Get the first word. */
+ REG_L data, 0(addr)
+
+ /*
+ * Shift away the partial data we loaded to remove the irrelevant bytes
+ * preceding the string with the effect of adding NUL bytes at the
+ * end of the string.
+ */
+ SHIFT data, data, offset_bits
+
+ /* Convert non-NUL into 0xff and NUL into 0x00. */
+ orc.b data, data
+
+ /* Convert non-NUL into 0x00 and NUL into 0xff. */
+ not data, data
+
+ /*
+ * Search for the first set bit (corresponding to a NUL byte in the
+ * original chunk).
+ */
+ CZ data, data
+
+ /*
+ * The first chunk is special: compare against the number
+ * of valid bytes in this chunk. If the NUL lies within the valid
+ * bytes, result already holds the final length.
+ */
+ srli result, data, 3
+ bgtu valid_bytes, result, 3f
+
+ /*
+ * Prepare for the word comparison loop. offset now holds the address
+ * of the second word; m1 is the all-ones pattern orc.b produces for a
+ * word without any NUL byte.
+ */
+ addi offset, addr, SZREG
+ li m1, -1
+
+ /*
+ * Our critical loop is 4 instructions and processes data in
+ * 4 byte or 8 byte chunks.
+ */
+ .p2align 3
+1:
+ REG_L data, SZREG(addr)
+ addi addr, addr, SZREG
+ orc.b data, data
+ beq data, m1, 1b
+2:
+ /* addr now points at the word that contains the NUL byte. */
+ not data, data
+ CZ data, data
+
+ /* Get number of processed words. */
+ sub offset, addr, offset
+
+ /* Add number of characters in the first word. */
+ add result, result, offset
+ srli data, data, 3
+
+ /* Add number of characters in the last word. */
+ add result, result, data
+3:
+ ret
+
+.option pop
+#endif
SYM_FUNC_END(strlen)
@@ -3,9 +3,14 @@
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
/* int strncmp(const char *cs, const char *ct, size_t count) */
SYM_FUNC_START(strncmp)
+
+ ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
/*
* Returns
* a0 - comparison result, value like strncmp
@@ -37,4 +42,110 @@ SYM_FUNC_START(strncmp)
4:
li a0, 0
j 2b
+
+/*
+ * Variant of strncmp using the ZBB extension if available
+ *
+ * Entered through the "j variant_zbb" that the ALTERNATIVE at the top of
+ * strncmp patches in when CPUFEATURE_ZBB is set.
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+variant_zbb:
+
+/* Register aliases. src1 and result share a0. */
+#define src1 a0
+#define result a0
+#define src2 t6
+#define len a2
+#define data1 t0
+#define data2 t1
+#define align t2
+#define data1_orcb t3
+#define limit t4
+#define m1 t5
+
+/* Enable Zbb instructions for this section only; undone by ".option pop". */
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, like strncmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ * a2 - number of characters to compare
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4, t5, t6
+ */
+ mv src2, a1
+
+ /*
+ * align becomes non-zero if either pointer is not SZREG-aligned; in
+ * that case go straight to the byte-wise loop. m1 is the all-ones
+ * pattern orc.b produces for a word without any NUL byte. limit is
+ * the address one past the last byte of string1 to compare.
+ */
+ or align, src1, src2
+ li m1, -1
+ and align, align, SZREG-1
+ add limit, src1, len
+ bnez align, 4f
+
+ /*
+ * Adjust limit for fast-path: with limit lowered by SZREG, a whole
+ * word is only loaded while src1 <= limit, i.e. while at least SZREG
+ * bytes remain to be compared.
+ */
+ addi limit, limit, -SZREG
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ /*
+ * NOTE(review): bgt here and bge in the byte loop are signed
+ * comparisons; this assumes src1 and limit do not straddle the sign
+ * boundary and that src1 + len did not wrap - confirm.
+ */
+ bgt src1, limit, 3f
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ /*
+ * orc.b turns every non-zero byte into 0xff and every zero byte into
+ * 0x00, so any result other than all-ones means data1 holds a NUL.
+ */
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, 2f
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+
+ /*
+ * Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence:
+ * sltu yields 1 or 0, neg turns that into -1 or 0, and or-ing in
+ * bit 0 gives -1 or 1.
+ */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ * The pointers have not been advanced past this word yet, so the
+ * simple loop re-reads it byte by byte.
+ */
+ bne data1, data2, 3f
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /*
+ * Simple loop for misaligned strings, also used for the tail of the
+ * fast path. Label 3 is the entry from the fast path, label 4 the
+ * entry for misaligned input (limit was never lowered there).
+ */
+3:
+ /* Restore limit for slow-path. */
+ addi limit, limit, SZREG
+ .p2align 3
+4:
+ /* Stop with "equal" once the requested number of bytes is consumed. */
+ bge src1, limit, 6f
+ lbu data1, 0(src1)
+ lbu data2, 0(src2)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ bne data1, data2, 5f
+ bnez data1, 4b
+
+5:
+ /* Difference of the last bytes read; zero when both were NUL. */
+ sub result, data1, data2
+ ret
+
+6:
+ li result, 0
+ ret
+
+.option pop
+#endif
SYM_FUNC_END(strncmp)