//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//

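// Register aliases. dstin, src and count are the incoming arguments in
// x0..x2; all other aliases name AAPCS64 caller-saved registers, so the
// routines need no stack frame. F_l and F_h deliberately reuse srcend
// and dst, which are dead by the time L(copy96) loads into them.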
#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend
#define F_h       dst
#define tmp1      x9
#define tmp2      x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tail-calls memcpy for these cases as
// well as for non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
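    // On this path 16 < count <= 96, so bit 6 of count-1 is set exactly
    // when count >= 65 (take the 96-byte path) and bit 5 is set exactly
    // when count >= 33 (copy an extra 16 bytes from each end).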
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
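    // Copy 8..16 bytes with two possibly overlapping 8-byte accesses.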
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
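    // count is 0..7 here: bit 2 clear (count < 4) branches to the byte
    // sequence below; otherwise copy 4..7 bytes with two possibly
    // overlapping 4-byte accesses.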
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
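    // The loads are from offsets 0, count/2 and count-1; e.g. count==3
    // copies bytes 0, 1 and 2, while count==1 reads byte 0 three times.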
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 65..96 bytes.  Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
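    // F_l/F_h alias srcend/dst; both are dead after this final load.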
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align dst to 16 bytes so that stores do not cross cache-line
    // boundaries. There are at least 96 bytes to copy, so copy 16 bytes
    // unaligned and then align.  The loop copies 64 bytes per iteration,
    // with the loads running one iteration ahead of the stores.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
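    // Roughly: the 16 corrects the over-count above, and the 128 makes
    // the loop exit while the 64 bytes already loaded plus the 64-byte
    // tail stored at 2: below still remain.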
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves of up to 96 bytes are done by memcpy, as it supports
// overlaps. Larger backwards copies (dst below src) are also handled by
// memcpy. The only remaining case is large forward copies, where the
// destination overlaps the source from above; these are copied backwards
// from the end. The destination is aligned, and an unrolled loop
// processes 64 bytes per iteration.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
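    // Tail-call __memcpy unless this is a large forward overlap: ccmp
    // compares dstin - src against count when count > 96, and otherwise
    // forces C set (NZCV = 2) so that b.hs is taken. The branch thus
    // falls through only when count > 96 and (UINT64)(dstin - src) < count.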
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 bytes so that stores do not cross cache-line
    // boundaries. There are at least 96 bytes to copy, so copy 16 bytes
    // unaligned and then align. The loop copies 64 bytes per iteration,
    // with the loads running one iteration ahead of the stores.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
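    // 128 = the 64 bytes already loaded above plus the 64-byte tail
    // that the code at 2: below always stores.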
    subs    count, count, 128
    b.ls    2f
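    // The nop is presumably here to keep the copy loop entry aligned.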
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret