Add the BTI instructions and the associated note to make the AArch64 asm
objects compatible with BTI enforcement.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Leif Lindholm <quic_llindhol@quicinc.com>
Reviewed-by: Oliver Smith-Denny <osd@smith-denny.com>
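For context, BTI enforcement needs two things from every assembly object: a BTI landing pad (the `bti c` hint, which executes as a NOP on cores without FEAT_BTI) at each indirect-call target such as InternalMemScanMem8 below, and a .note.gnu.property section advertising GNU_PROPERTY_AARCH64_FEATURE_1_BTI so the toolchain can mark the whole image as BTI-compatible. A hand-written sketch of both, for illustration only; the patch itself uses the AARCH64_BTI() macro visible below, and the note is presumably emitted by shared EDK2 assembly infrastructure rather than open-coded in each file:

        .section ".note.gnu.property", "a"
        .balign 8
        .long   4                       // namesz: "GNU" plus NUL terminator
        .long   0x10                    // descsz: one property, padded to 8 bytes
        .long   5                       // type: NT_GNU_PROPERTY_TYPE_0
        .asciz  "GNU"
        .long   0xc0000000              // GNU_PROPERTY_AARCH64_FEATURE_1_AND
        .long   4                       // pr_datasz
        .long   1                       // GNU_PROPERTY_AARCH64_FEATURE_1_BTI
        .long   0                       // pad pr_data to 8-byte alignment

        .text
entry_point:                            // hypothetical indirect-call target
        bti     c                       // landing pad; a NOP where BTI is absent
        ret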
//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32-bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros allows us to
// identify exactly which byte has matched.
//
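//
// Worked example: a match in byte 5 of a chunk leaves 0x04 in lane 5 of
// vhas_chr1 after the AND with vrepmask (the repeated mask 0x40100401 gives
// bytes 0..3 of every word the values 0x01, 0x04, 0x10, 0x40). The two ADDP
// reductions then fold the 32 masked lanes into a 64-bit syndrome in which
// bit 2 * 5 = 10 is set, so the rbit/clz/lsr #1 sequence at .Ltail recovers
// the byte offset 5.
//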

ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    AARCH64_BTI(c)
    // Do not dereference srcin if no bytes to compare.
    cbz  cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
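    // (soff is srcin's offset within its aligned 32-byte block; cntrem is
    // cntin modulo 32, used by .Lmasklast to trim the final block)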
    b.eq    .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask the irrelevant part.
    //
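    // (the adds below leaves cntin = cntin + soff - 32: the 32 bytes just
    // loaded include the soff bytes preceding srcin; its flags are reused
    // by the b.ls further down to detect that this first block is also
    // the last)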

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128
    addp    vend.16b, vend.16b, vend.16b                  // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
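    // (orr merges the two raw compare results and the 64-bit pairwise add
    // collapses them; a nonzero value means some lane matched somewhere in
    // this chunk, and the exact position is recomputed at .Lend)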
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
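    // (tmp was negative, so this gives 2 * (32 - ((cntrem + soff) % 32)),
    // the number of syndrome bits past the end of the buffer; a shift
    // amount of 64 acts as 0 in the variable shifts below, so a fully
    // valid block is left untouched)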
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
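    // (the syndrome carries two bits per byte, so halving the bit index
    // gives the byte offset within the block)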
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov   result, #0
    ret