RISC-V: add strcmp variant using zbb and fast-unaligned

A patch from "Zbb + fast-unaligned string optimization", in state Obsolete, for linux-kernel

From: Heiko Stuebner <heiko.stuebner@...>
Date: Fri, 23 Dec 2022 00:12:12 +0100

Commit-Message

On cores that can do unaligned access fast in hardware, there are some more optimizations possible, so add a second strcmp variant for that case.

Signed-off-by: Heiko Stuebner <heiko.stuebner@...>
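
For orientation only (not part of the patch): the new variant walks both strings one register-sized word at a time, loading the second string with unaligned accesses and using the Zbb orc.b instruction to spot a NUL byte inside a word. A rough C sketch of that word-stepping idea follows; the names word_has_nul() and strcmp_wordwise() are made up for illustration, a 64-bit little-endian machine is assumed, memcpy() stands in for the unaligned register loads, and a portable bit trick stands in for orc.b.

#include <stdint.h>
#include <string.h>

/* True when any byte of w is zero; Zbb's orc.b reduces this test to a
 * single instruction followed by a compare against all-ones. */
static int word_has_nul(uint64_t w)
{
	return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}

/* Illustration only: compare word by word, then finish byte by byte once
 * a word differs or contains a NUL.  Unlike the patch, this sketch does
 * not guard against reading past the end of either string. */
static int strcmp_wordwise(const char *cs, const char *ct)
{
	uint64_t w1, w2;

	for (;;) {
		memcpy(&w1, cs, sizeof(w1));	/* possibly unaligned load */
		memcpy(&w2, ct, sizeof(w2));
		if (word_has_nul(w1) || w1 != w2)
			break;
		cs += sizeof(w1);
		ct += sizeof(w2);
	}
	while (*cs && *cs == *ct) {
		cs++;
		ct++;
	}
	return (unsigned char)*cs - (unsigned char)*ct;
}

The assembly below replaces the trailing byte loop with a branch-free extraction of the first differing or NUL byte (ctz plus shifts) and adds the alignment and page-boundary handling the sketch omits.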

Patch-Comment

arch/riscv/lib/strcmp.S | 170 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 169 insertions(+), 1 deletion(-)

Statistics

  • 169 lines added
  • 1 line removed

Changes

--------------------------- arch/riscv/lib/strcmp.S ----------------------------
index ce85bbbee4b9..53f41d032aae 100644
@@ -9,7 +9,13 @@
/* int strcmp(const char *cs, const char *ct) */
SYM_FUNC_START(strcmp)
- ALTERNATIVE_2("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, 0, CONFIG_RISCV_ISA_ZBB)
+ ALTERNATIVE_2("nop",
+ "j strcmp_zbb_unaligned", 0,
+ CPUFEATURE_ZBB | CPUFEATURE_FAST_UNALIGNED, 0,
+ CONFIG_RISCV_ISA_ZBB,
+ "j strcmp_zbb", 0,
+ CPUFEATURE_ZBB, CPUFEATURE_FAST_UNALIGNED,
+ CONFIG_RISCV_ISA_ZBB)
/*
* Returns
@@ -116,6 +122,168 @@ strcmp_zbb:
sub a0, t0, t1
ret
+strcmp_zbb_unaligned:
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * a3, a4, a5, a6, a7, t0, t1, t2, t3, t4, t5
+ */
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# error big endian is untested!
+# define CZ clz
+# define SHIFT srl
+# define SHIFT2 sll
+#else
+# define CZ ctz
+# define SHIFT sll
+# define SHIFT2 srl
+#endif
+
+ /* a3...delta from a0 to a1. */
+ sub a3, a1, a0
+ li a4, -1
+ andi a7, a3, SZREG-1
+ andi a5, a0, SZREG-1
+ bnez a7, 7f
+ bnez a5, 6f
+
+ .p2align 4
+1:
+ REG_L t0, 0(a0)
+ add a7, a0, a3
+ addi a0, a0, SZREG
+ REG_L t1, 0(a7)
+
+2:
+ orc.b t3, t0
+ bne t3, a4, 4f
+ beq t0, t1, 1b
+
+ /* Words don't match, and no NUL byte in one word.
+ Get bytes in big-endian order and compare as words. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+ sltu a0, t0, t1
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+3:
+ orc.b t3, t0
+4:
+ /* Words don't match or NUL byte in at least one word.
+ t3 holds orc.b value of t0. */
+ xor a7, t0, t1
+ orc.b a7, a7
+
+ orn a7, a7, t3
+ CZ t5, a7
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+ sll t0, t0, t5
+ sll t1, t1, t5
+ srl t0, t0, SZREG*8-8
+ srl t1, t1, SZREG*8-8
+
+5:
+ sub a0, t0, t1
+ ret
+
+ .p2align 4
+6:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ andi a0, a0, -SZREG
+ add a7, a0, a3
+ REG_L t0, 0(a0)
+ addi a0, a0, SZREG
+ REG_L t1, 0(a7)
+ /* Get number of bits to mask. */
+ sll t5, a1, 3
+ /* Bits to mask are now 0, others are 1. */
+ SHIFT a7, a4, t5
+ /* Or with inverted value -> masked bits become 1. */
+ orn t0, t0, a7
+ orn t1, t1, a7
+ j 2b
+
+7:
+ /* Skip slow loop if a0 is aligned. */
+ beqz a5, 9f
+8:
+ /* Align a0 to 8 bytes. */
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ beqz t0, 5b
+ bne t0, t1, 5b
+ addi a0, a0, 1
+ addi a1, a1, 1
+ andi a5, a0, SZREG-1
+ bnez a5, 8b
+
+9:
+ /* a0 is aligned. Align a1 down and check for NUL there.
+ * If there is no NUL, we may read the next word from a1.
+ * If there is a NUL, we must not read a complete word from a1
+ * because we might cross a page boundary. */
+ /* Get number of bits to mask (upper bits are ignored by shifts). */
+ sll t5, a1, 3
+ /* a6 := align_down (a1) */
+ andi a6, a1, -SZREG
+ REG_L t2, 0(a6)
+ addi a6, a6, SZREG
+
+ /* Bits to mask are now 0, others are 1. */
+ SHIFT a7, a4, t5
+ /* Or with inverted value -> masked bits become 1. */
+ orn t4, t2, a7
+ /* Check for NUL in next aligned word. */
+ orc.b t4, t4
+ bne t4, a4, 11f
+
+ .p2align 4
+10:
+ /* Read the (aligned) t0 and the unaligned t1. */
+ REG_L t0, 0(a0)
+ addi a0, a0, SZREG
+ REG_L t1, 0(a1)
+ addi a1, a1, SZREG
+ orc.b t3, t0
+ bne t3, a4, 4b
+ bne t0, t1, 4b
+
+ /* Read the next aligned-down word. */
+ REG_L t2, 0(a6)
+ addi a6, a6, SZREG
+ orc.b t4, t2
+ beq t4, a4, 10b
+
+11:
+ /* a0 points to unread word (only first bytes relevant).
+ * t2 holds next aligned-down word with NUL.
+ * Compare the first bytes of t0 with the last bytes of t2. */
+ REG_L t0, 0(a0)
+ /* Shift NUL bytes into t2 to become t1. */
+ SHIFT2 t1, t2, t5
+ bne t0, t1, 3b
+ li a0, 0
+ ret
+
.option pop
#endif
SYM_FUNC_END(strcmp)
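
For reference, the branchless "Synthesize (t0 >= t1) ? 1 : -1" sequence above corresponds to the following C; word_order() is a made-up name and a 64-bit register width is assumed.

#include <stdint.h>

/* Mirrors sltu a0, t0, t1 ; neg a0, a0 ; ori a0, a0, 1.  After rev8 the
 * first differing byte sits in the most significant position, so an
 * unsigned word compare already yields the sign strcmp must return. */
static int word_order(uint64_t t0, uint64_t t1)
{
	int less = t0 < t1;	/* sltu */

	return -less | 1;	/* neg ; ori  ->  -1 if t0 < t1, else 1 */
}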
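
Likewise, the final step of the unaligned path (label 11) can be pictured in C: rather than doing one more unaligned load from a1, which could cross into an unmapped page, the code shifts the already-loaded, NUL-containing aligned word so the remaining bytes line up and zero bytes fill in from above. last_unaligned_word() is a made-up name; little-endian and a 64-bit register are assumed.

#include <stdint.h>

/* Mirrors "SHIFT2 t1, t2, t5" at label 11: aligned_word was read from
 * align_down(s2) and is known to contain the terminating NUL, so the
 * bytes of s2 from its current position onward are moved down to byte 0
 * while zero (NUL) bytes shift in from the top. */
static uint64_t last_unaligned_word(uint64_t aligned_word, uintptr_t s2)
{
	unsigned int shift = (s2 % sizeof(uint64_t)) * 8;	/* t5 = a1 << 3 */

	return aligned_word >> shift;	/* SHIFT2 is srl on little-endian */
}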
 
 
