Preliminary work on using ZBB to optimize some string functions.
The ZBB-optimized variants themselves do not work correctly for me yet,
so I'm using copies of the generic functions to test the call-offset
adaptations.
Signed-off-by: Heiko Stuebner <heiko@...>
arch/riscv/Kconfig | 23 ++++++++
arch/riscv/include/asm/errata_list.h | 3 +-
arch/riscv/include/asm/hwcap.h | 1 +
arch/riscv/include/asm/string.h | 27 +++++++--
arch/riscv/kernel/cpu.c | 1 +
arch/riscv/kernel/cpufeature.c | 18 ++++++
arch/riscv/lib/Makefile | 6 ++
arch/riscv/lib/strcmp_tmp.S | 25 ++++++++
arch/riscv/lib/strcmp_zbb.S | 70 +++++++++++++++++++++++
arch/riscv/lib/strlen_tmp.S | 17 ++++++
arch/riscv/lib/strlen_zbb.S | 80 ++++++++++++++++++++++++++
arch/riscv/lib/strncmp_tmp.S | 27 +++++++++
arch/riscv/lib/strncmp_zbb.S | 85 ++++++++++++++++++++++++++++
13 files changed, 378 insertions(+), 5 deletions(-)
create mode 100644 arch/riscv/lib/strcmp_tmp.S
create mode 100644 arch/riscv/lib/strcmp_zbb.S
create mode 100644 arch/riscv/lib/strlen_tmp.S
create mode 100644 arch/riscv/lib/strlen_zbb.S
create mode 100644 arch/riscv/lib/strncmp_tmp.S
create mode 100644 arch/riscv/lib/strncmp_zbb.S
@@ -411,6 +411,29 @@ config RISCV_ISA_SVPBMT
If you don't know what to do here, say Y.
+config TOOLCHAIN_HAS_ZBB
+ bool
+ default y
+ depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
+ depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
+ depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
+
+config RISCV_ISA_ZBB
+ bool "Zbb extension support for bit manipulation instructions"
+ depends on TOOLCHAIN_HAS_ZBB
+ depends on !XIP_KERNEL && MMU
+ select RISCV_ALTERNATIVE
+ default y
+ help
+ Adds support to dynamically detect the presence of the ZBB
+ extension (basic bit manipulation) and enable its
+ usage.
+
+ The Zbb extension can be used to optimize string functions
+ such as strcmp, strncmp and strlen.
+
+ If you don't know what to do here, say Y.
+
config TOOLCHAIN_HAS_ZICBOM
bool
default y
@@ -24,7 +24,8 @@
#define CPUFEATURE_SVPBMT 0
#define CPUFEATURE_ZICBOM 1
-#define CPUFEATURE_NUMBER 2
+#define CPUFEATURE_ZBB 2
+#define CPUFEATURE_NUMBER 3
#ifdef __ASSEMBLY__
@@ -59,6 +59,7 @@ enum riscv_isa_ext_id {
RISCV_ISA_EXT_ZIHINTPAUSE,
RISCV_ISA_EXT_SSTC,
RISCV_ISA_EXT_SVINVAL,
+ RISCV_ISA_EXT_ZBB,
RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX,
};
@@ -6,6 +6,8 @@
#ifndef _ASM_RISCV_STRING_H
#define _ASM_RISCV_STRING_H
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
#include <linux/types.h>
#include <linux/linkage.h>
@@ -21,6 +23,7 @@ extern asmlinkage void *__memmove(void *, const void *, size_t);
#define __HAVE_ARCH_STRCMP
extern asmlinkage int __strcmp_generic(const char *cs, const char *ct);
+extern asmlinkage int __strcmp_tmp(const char *cs, const char *ct);
static __always_inline int strcmp(const char *cs, const char *ct)
{
@@ -31,7 +34,11 @@ static __always_inline int strcmp(const char *cs, const char *ct)
register const char* a1 asm("a1") = ct;
register int a0_out asm("a0");
- asm volatile("call __strcmp_generic\n\t"
+ asm volatile(
+ ALTERNATIVE(
+ "call __strcmp_generic\n\t",
+ "call __strcmp_tmp\n\t",
+ 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
: "=r"(a0_out)
: "r"(a0), "r"(a1)
: "ra", "t0", "t1", "t2");
@@ -43,7 +50,7 @@ static __always_inline int strcmp(const char *cs, const char *ct)
#define __HAVE_ARCH_STRNCMP
extern asmlinkage int __strncmp_generic(const char *cs,
const char *ct, size_t count);
-extern asmlinkage int __strncmp_zbb(const char *cs,
+extern asmlinkage int __strncmp_tmp(const char *cs,
const char *ct, size_t count);
static __always_inline int strncmp(const char *cs, const char *ct, size_t count)
@@ -56,7 +63,11 @@ static __always_inline int strncmp(const char *cs, const char *ct, size_t count)
register size_t a2 asm("a2") = count;
register int a0_out asm("a0");
- asm volatile("call __strncmp_generic\n\t"
+ asm volatile(
+ ALTERNATIVE(
+ "call __strncmp_generic\n\t",
+ "call __strncmp_tmp\n\t",
+ 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
: "=r"(a0_out)
: "r"(a0), "r"(a1), "r"(a2)
: "ra", "t0", "t1", "t2");
@@ -67,19 +78,27 @@ static __always_inline int strncmp(const char *cs, const char *ct, size_t count)
#define __HAVE_ARCH_STRLEN
extern asmlinkage __kernel_size_t __strlen_generic(const char *);
+extern asmlinkage __kernel_size_t __strlen_tmp(const char *);
static __always_inline __kernel_size_t strlen(const char *s)
{
+#ifdef RISCV_EFISTUB
+ return __strlen_generic(s);
+#else
register const char* a0 asm("a0") = s;
register int a0_out asm("a0");
asm volatile(
- "call __strlen_generic\n\t"
+ ALTERNATIVE(
+ "call __strlen_generic\n\t",
+ "call __strlen_tmp\n\t",
+ 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
: "=r"(a0_out)
: "r"(a0)
: "ra", "t0", "t1");
return a0_out;
+#endif
}
#define __HAVE_ARCH_STRSTARTS
@@ -166,6 +166,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
+ __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
@@ -201,6 +201,7 @@ void __init riscv_fill_hwcap(void)
} else {
SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF);
SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
+ SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
@@ -278,6 +279,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
return true;
}
+static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
+{
+ if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB))
+ return false;
+
+ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+ return false;
+
+ if (!riscv_isa_extension_available(NULL, ZBB))
+ return false;
+
+ return true;
+}
+
/*
* Probe presence of individual extensions.
*
@@ -295,6 +310,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
if (cpufeature_probe_zicbom(stage))
cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
+ if (cpufeature_probe_zbb(stage))
+ cpu_req_feature |= BIT(CPUFEATURE_ZBB);
+
return cpu_req_feature;
}
@@ -5,8 +5,14 @@ lib-y += memset.o
lib-y += memmove.o
lib-y += string.o
lib-y += strcmp.o
+lib-y += strcmp_tmp.o
+lib-$(CONFIG_RISCV_ISA_ZBB) += strcmp_zbb.o
lib-y += strlen.o
+lib-y += strlen_tmp.o
+lib-$(CONFIG_RISCV_ISA_ZBB) += strlen_zbb.o
lib-y += strncmp.o
+lib-y += strncmp_tmp.o
+lib-$(CONFIG_RISCV_ISA_ZBB) += strncmp_zbb.o
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
@@ -0,0 +1,25 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int __strcmp_tmp(const char *cs, const char *ct)
+ *
+ * Byte-at-a-time strcmp, a copy of __strcmp_generic used to test the
+ * ALTERNATIVE call-offset patching.  Returns 0 for equal strings,
+ * otherwise 1 / -1 depending on which string compares greater (sign
+ * only, not a byte difference).  Clobbers only t0-t2, matching the
+ * inline-asm clobber list in <asm/string.h>; a1 is saved in t2 and
+ * restored before returning.
+ */
+ENTRY(__strcmp_tmp)
+ mv t2, a1 /* preserve a1: the caller does not declare it clobbered */
+1:
+ lbu t1, 0(a0)
+ lbu t0, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ beq t1, t0, 3f /* bytes equal: check for NUL below */
+ li a0, 1 /* assume *cs > *ct */
+ bgeu t1, t0, 2f
+ li a0, -1 /* *cs < *ct */
+2:
+ mv a1, t2 /* restore a1 */
+ ret
+3:
+ bnez t1, 1b /* not at NUL yet: compare next byte */
+ li a0, 0 /* both strings ended: equal */
+ j 2b
+END(__strcmp_tmp)
+EXPORT_SYMBOL(__strcmp_tmp)
@@ -0,0 +1,70 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int __strcmp_zbb(const char *cs, const char *ct)
+ *
+ * Word-at-a-time strcmp using the Zbb orc.b instruction to detect NUL
+ * bytes: orc.b maps every non-zero byte to 0xff, so a word equal to -1
+ * contains no NUL.  Misaligned inputs take the byte loop at label 3.
+ * The fast path returns +/-1; the byte loop returns the byte
+ * difference -- both are valid strcmp() results.
+ *
+ * NOTE(review): this routine clobbers a2-a4, t0 and t2, but the inline
+ * asm caller in <asm/string.h> only lists "ra", "t0", "t1", "t2" as
+ * clobbered -- confirm a2-a4 are covered before enabling this variant.
+ */
+
+#define src1 a0
+#define result a0
+#define src2 a1
+#define data1 a2
+#define data2 a3
+#define align a4
+#define data1_orcb t0
+#define m1 t2
+
+.option push
+.option arch,+zbb
+
+ENTRY(__strcmp_zbb)
+ or align, src1, src2
+ li m1, -1
+ and align, align, SZREG-1
+ bnez align, 3f /* either pointer misaligned: take the byte loop */
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, 2f /* some byte in data1 is NUL */
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, 1b
+
+ /* Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+2:
+ /* Found a null byte.
+ * If words don't match, fall back to simple loop. */
+ bne data1, data2, 3f
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+3:
+ lbu data1, 0(src1)
+ lbu data2, 0(src2)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ bne data1, data2, 4f
+ bnez data1, 3b /* equal and not NUL: keep scanning */
+
+4:
+ sub result, data1, data2 /* byte difference (0 when both hit NUL) */
+ ret
+END(__strcmp_zbb)
+EXPORT_SYMBOL(__strcmp_zbb)
+
+.option pop
@@ -0,0 +1,17 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* __kernel_size_t __strlen_tmp(const char *s)
+ *
+ * Byte-at-a-time strlen, a copy of __strlen_generic used to test the
+ * ALTERNATIVE call-offset patching.  Clobbers only t0 and t1,
+ * matching the inline-asm clobber list in <asm/string.h>.
+ */
+ENTRY(__strlen_tmp)
+ mv t1, a0 /* t1 walks the string; a0 keeps the start address */
+1:
+ lbu t0, 0(t1)
+ bnez t0, 2f
+ sub a0, t1, a0 /* NUL found: length = cursor - start */
+ ret
+2:
+ addi t1, t1, 1
+ j 1b
+END(__strlen_tmp)
+EXPORT_SYMBOL(__strlen_tmp)
@@ -0,0 +1,80 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* __kernel_size_t __strlen_zbb(const char *s)
+ *
+ * Word-at-a-time strlen using the Zbb orc.b instruction: orc.b maps
+ * every non-zero byte to 0xff, so after inverting, ctz/clz yields the
+ * bit position of the first NUL byte in the word.
+ *
+ * NOTE(review): this routine clobbers a1-a4, but the inline asm
+ * caller in <asm/string.h> only lists "ra", "t0", "t1" as clobbered
+ * -- confirm a1-a4 are covered before enabling this variant.
+ */
+
+#define src a0
+#define result a0
+#define addr a1
+#define data a2
+#define offset a3
+#define offset_bits a3
+#define valid_bytes a4
+#define m1 a4
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# define CZ clz
+# define SHIFT sll
+#else
+# define CZ ctz
+# define SHIFT srl
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY(__strlen_zbb)
+ /* Number of irrelevant bytes in the first word. */
+ andi offset, src, SZREG-1
+ /* Align pointer. */
+ andi addr, src, -SZREG
+
+ li valid_bytes, SZREG
+ sub valid_bytes, valid_bytes, offset
+ /* Convert the byte offset into a bit shift amount (x8).  The
+ * constant must be 3 (log2 of bits per byte); RISCV_LGPTR is 2 on
+ * rv32 and would strip the wrong number of bytes there. */
+ slli offset_bits, offset, 3
+
+ /* Get the first word. */
+ REG_L data, 0(addr)
+ /* Shift away the partial data we loaded to remove the irrelevant bytes
+ * preceding the string with the effect of adding NUL bytes at the
+ * end of the string. */
+ SHIFT data, data, offset_bits
+ /* Convert non-NUL into 0xff and NUL into 0x00. */
+ orc.b data, data
+ /* Convert non-NUL into 0x00 and NUL into 0xff. */
+ not data, data
+ /* Search for the first set bit (corresponding to a NUL byte in the
+ * original chunk). */
+ CZ data, data
+ /* The first chunk is special: compare against the number
+ * of valid bytes in this chunk. */
+ srli result, data, 3
+ bgtu valid_bytes, result, 3f
+
+ /* Prepare for the word comparison loop. */
+ addi offset, addr, SZREG
+ li m1, -1
+
+ /* Our critical loop is 4 instructions and processes data in
+ * 4 byte or 8 byte chunks. */
+ .p2align 3
+1:
+ REG_L data, SZREG(addr)
+ addi addr, addr, SZREG
+ orc.b data, data
+ beq data, m1, 1b
+2:
+ not data, data
+ CZ data, data
+ /* Get number of bytes processed in the fully-scanned words. */
+ sub offset, addr, offset
+ /* Add number of characters in the first word. */
+ add result, result, offset
+ srli data, data, 3
+ /* Add number of characters in the last word. */
+ add result, result, data
+3:
+ ret
+END(__strlen_zbb)
+EXPORT_SYMBOL(__strlen_zbb)
+
+.option pop
@@ -0,0 +1,27 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int __strncmp_tmp(const char *cs, const char *ct, size_t count)
+ *
+ * Byte-at-a-time strncmp, a copy of __strncmp_generic used to test
+ * the ALTERNATIVE call-offset patching.  Returns 0 when the first
+ * count bytes match (or both strings end before that), otherwise
+ * 1 / -1 (sign only).  Clobbers only t0-t2, matching the inline-asm
+ * clobber list in <asm/string.h>.
+ */
+ENTRY(__strncmp_tmp)
+ li t0, 0 /* t0 = byte index */
+1:
+ beq a2, t0, 4f /* compared count bytes without difference: equal */
+ add t1, a0, t0
+ add t2, a1, t0
+ lbu t1, 0(t1)
+ lbu t2, 0(t2)
+ beq t1, t2, 3f
+ li a0, 1 /* assume *cs > *ct */
+ bgeu t1, t2, 2f
+ li a0, -1 /* *cs < *ct */
+2:
+ ret
+3:
+ addi t0, t0, 1
+ bnez t1, 1b /* NUL reached in both strings: equal */
+4:
+ li a0, 0
+ j 2b
+END(__strncmp_tmp)
+EXPORT_SYMBOL(__strncmp_tmp)
@@ -0,0 +1,85 @@
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int __strncmp_zbb(const char *cs, const char *ct, size_t count)
+ *
+ * Word-at-a-time strncmp using the Zbb orc.b instruction to detect
+ * NUL bytes.  The fast path only runs while a whole word fits within
+ * count; misaligned inputs and the tail take the byte loop at 4.
+ * Note len and data1 share a2: len is consumed into limit before
+ * data1 is first written.
+ *
+ * NOTE(review): this routine clobbers a2-a5, t0 and t1, but the
+ * inline asm caller in <asm/string.h> only lists "ra", "t0", "t1",
+ * "t2" as clobbered -- confirm a3-a5 are covered before enabling
+ * this variant.
+ */
+
+#define src1 a0
+#define result a0
+#define src2 a1
+#define len a2
+#define data1 a2
+#define data2 a3
+#define align a4
+#define data1_orcb a5
+#define limit t0
+#define m1 t1
+
+.option push
+.option arch,+zbb
+
+ENTRY(__strncmp_zbb)
+ or align, src1, src2
+ li m1, -1
+ and align, align, SZREG-1
+ add limit, src1, len /* limit = one past the last byte to compare */
+ bnez align, 4f /* either pointer misaligned: byte loop */
+
+ /* Adjust limit for fast-path. */
+ addi limit, limit, -SZREG
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ bgt src1, limit, 3f /* less than a full word left: byte loop */
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, 2f /* some byte in data1 is NUL */
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, 1b
+
+ /* Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+2:
+ /* Found a null byte.
+ * If words don't match, fall back to simple loop. */
+ bne data1, data2, 3f
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+3:
+ /* Restore limit for slow-path. */
+ addi limit, limit, SZREG
+ .p2align 3
+4:
+ bge src1, limit, 6f /* count exhausted: equal */
+ lbu data1, 0(src1)
+ lbu data2, 0(src2)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ bne data1, data2, 5f
+ bnez data1, 4b /* equal and not NUL: keep scanning */
+
+5:
+ sub result, data1, data2 /* byte difference (0 when both hit NUL) */
+ ret
+
+6:
+ li result, 0
+ ret
+END(__strncmp_zbb)
+EXPORT_SYMBOL(__strncmp_zbb)
+
+.option pop