Add the BTI instructions and the associated note to make the AArch64 asm objects compatible with BTI enforcement. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Leif Lindholm <quic_llindhol@quicinc.com> Reviewed-by: Oliver Smith-Denny <osd@smith-denny.com>
		
			
				
	
	
		
			205 lines
		
	
	
		
			5.1 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			205 lines
		
	
	
		
			5.1 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
// SPDX-License-Identifier: BSD-2-Clause-Patent
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

| #define dstin     x0
 | |
| #define count     x1
 | |
| #define val       x2
 | |
| #define valw      w2
 | |
| #define dst       x3
 | |
| #define dstend    x4
 | |
| #define tmp1      x5
 | |
| #define tmp1w     w5
 | |
| #define tmp2      x6
 | |
| #define tmp2w     w6
 | |
| #define zva_len   x7
 | |
| #define zva_lenw  w7
 | |
| 
 | |
| #define L(l) .L ## l
 | |
| 
 | |
| ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
 | |
| ASM_PFX(InternalMemSetMem16):
 | |
|     AARCH64_BTI(c)
 | |
|     dup     v0.8H, valw
 | |
|     lsl     count, count, #1
 | |
|     b       0f
 | |
| 
 | |
| ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
 | |
| ASM_PFX(InternalMemSetMem32):
 | |
|     AARCH64_BTI(c)
 | |
|     dup     v0.4S, valw
 | |
|     lsl     count, count, #2
 | |
|     b       0f
 | |
| 
 | |
| ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
 | |
| ASM_PFX(InternalMemSetMem64):
 | |
|     AARCH64_BTI(c)
 | |
|     dup     v0.2D, val
 | |
|     lsl     count, count, #3
 | |
|     b       0f
 | |
| 
 | |
| ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
 | |
| ASM_PFX(InternalMemZeroMem):
 | |
|     AARCH64_BTI(c)
 | |
|     movi    v0.16B, #0
 | |
|     b       0f
 | |
| 
 | |
| ASM_GLOBAL ASM_PFX(InternalMemSetMem)
 | |
| ASM_PFX(InternalMemSetMem):
 | |
|     AARCH64_BTI(c)
 | |
|     dup     v0.16B, valw
 | |
| 0:  add     dstend, dstin, count
 | |
|     mov     val, v0.D[0]
 | |
| 
 | |
|     cmp     count, 96
 | |
|     b.hi    L(set_long)
 | |
|     cmp     count, 16
 | |
|     b.hs    L(set_medium)
 | |
| 
 | |
|     // Set 0..15 bytes.
 | |
|     tbz     count, 3, 1f
 | |
|     str     val, [dstin]
 | |
|     str     val, [dstend, -8]
 | |
|     ret
 | |
|     nop
 | |
| 1:  tbz     count, 2, 2f
 | |
|     str     valw, [dstin]
 | |
|     str     valw, [dstend, -4]
 | |
|     ret
 | |
| 2:  cbz     count, 3f
 | |
|     strb    valw, [dstin]
 | |
|     tbz     count, 1, 3f
 | |
|     strh    valw, [dstend, -2]
 | |
| 3:  ret
 | |
| 
 | |
|     // Set 17..96 bytes.
 | |
| L(set_medium):
 | |
|     str     q0, [dstin]
 | |
|     tbnz    count, 6, L(set96)
 | |
|     str     q0, [dstend, -16]
 | |
|     tbz     count, 5, 1f
 | |
|     str     q0, [dstin, 16]
 | |
|     str     q0, [dstend, -32]
 | |
| 1:  ret
 | |
| 
 | |
|     .p2align 4
 | |
|     // Set 64..96 bytes.  Write 64 bytes from the start and
 | |
|     // 32 bytes from the end.
 | |
| L(set96):
 | |
|     str     q0, [dstin, 16]
 | |
|     stp     q0, q0, [dstin, 32]
 | |
|     stp     q0, q0, [dstend, -32]
 | |
|     ret
 | |
| 
 | |
|     .p2align 3
 | |
|     nop
 | |
| L(set_long):
 | |
|     bic     dst, dstin, 15
 | |
|     str     q0, [dstin]
 | |
|     cmp     count, 256
 | |
|     ccmp    val, 0, 0, cs
 | |
|     b.eq    L(try_zva)
 | |
| L(no_zva):
 | |
|     sub     count, dstend, dst        // Count is 16 too large.
 | |
|     add     dst, dst, 16
 | |
|     sub     count, count, 64 + 16     // Adjust count and bias for loop.
 | |
| 1:  stp     q0, q0, [dst], 64
 | |
|     stp     q0, q0, [dst, -32]
 | |
| L(tail64):
 | |
|     subs    count, count, 64
 | |
|     b.hi    1b
 | |
| 2:  stp     q0, q0, [dstend, -64]
 | |
|     stp     q0, q0, [dstend, -32]
 | |
|     ret
 | |
| 
 | |
|     .p2align 3
 | |
| L(try_zva):
 | |
|     mrs     tmp1, dczid_el0
 | |
|     tbnz    tmp1w, 4, L(no_zva)
 | |
|     and     tmp1w, tmp1w, 15
 | |
|     cmp     tmp1w, 4                  // ZVA size is 64 bytes.
 | |
|     b.ne    L(zva_128)
 | |
| 
 | |
|     // Write the first and last 64 byte aligned block using stp rather
 | |
|     // than using DC ZVA.  This is faster on some cores.
 | |
| L(zva_64):
 | |
|     str     q0, [dst, 16]
 | |
|     stp     q0, q0, [dst, 32]
 | |
|     bic     dst, dst, 63
 | |
|     stp     q0, q0, [dst, 64]
 | |
|     stp     q0, q0, [dst, 96]
 | |
|     sub     count, dstend, dst         // Count is now 128 too large.
 | |
|     sub     count, count, 128+64+64    // Adjust count and bias for loop.
 | |
|     add     dst, dst, 128
 | |
|     nop
 | |
| 1:  dc      zva, dst
 | |
|     add     dst, dst, 64
 | |
|     subs    count, count, 64
 | |
|     b.hi    1b
 | |
|     stp     q0, q0, [dst, 0]
 | |
|     stp     q0, q0, [dst, 32]
 | |
|     stp     q0, q0, [dstend, -64]
 | |
|     stp     q0, q0, [dstend, -32]
 | |
|     ret
 | |
| 
 | |
|     .p2align 3
 | |
| L(zva_128):
 | |
|     cmp     tmp1w, 5                    // ZVA size is 128 bytes.
 | |
|     b.ne    L(zva_other)
 | |
| 
 | |
|     str     q0, [dst, 16]
 | |
|     stp     q0, q0, [dst, 32]
 | |
|     stp     q0, q0, [dst, 64]
 | |
|     stp     q0, q0, [dst, 96]
 | |
|     bic     dst, dst, 127
 | |
|     sub     count, dstend, dst          // Count is now 128 too large.
 | |
|     sub     count, count, 128+128       // Adjust count and bias for loop.
 | |
|     add     dst, dst, 128
 | |
| 1:  dc      zva, dst
 | |
|     add     dst, dst, 128
 | |
|     subs    count, count, 128
 | |
|     b.hi    1b
 | |
|     stp     q0, q0, [dstend, -128]
 | |
|     stp     q0, q0, [dstend, -96]
 | |
|     stp     q0, q0, [dstend, -64]
 | |
|     stp     q0, q0, [dstend, -32]
 | |
|     ret
 | |
| 
 | |
| L(zva_other):
 | |
|     mov     tmp2w, 4
 | |
|     lsl     zva_lenw, tmp2w, tmp1w
 | |
|     add     tmp1, zva_len, 64           // Max alignment bytes written.
 | |
|     cmp     count, tmp1
 | |
|     blo     L(no_zva)
 | |
| 
 | |
|     sub     tmp2, zva_len, 1
 | |
|     add     tmp1, dst, zva_len
 | |
|     add     dst, dst, 16
 | |
|     subs    count, tmp1, dst            // Actual alignment bytes to write.
 | |
|     bic     tmp1, tmp1, tmp2            // Aligned dc zva start address.
 | |
|     beq     2f
 | |
| 1:  stp     q0, q0, [dst], 64
 | |
|     stp     q0, q0, [dst, -32]
 | |
|     subs    count, count, 64
 | |
|     b.hi    1b
 | |
| 2:  mov     dst, tmp1
 | |
|     sub     count, dstend, tmp1         // Remaining bytes to write.
 | |
|     subs    count, count, zva_len
 | |
|     b.lo    4f
 | |
| 3:  dc      zva, dst
 | |
|     add     dst, dst, zva_len
 | |
|     subs    count, count, zva_len
 | |
|     b.hs    3b
 | |
| 4:  add     count, count, zva_len
 | |
|     b       L(tail64)
 |