MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines
This adds AARCH64 support to BaseMemoryLibOptDxe, based on the
cortex-strings library. All string routines are accelerated except
ScanMem16, ScanMem32, ScanMem64 and IsZeroBuffer, which can wait for
another day (very few occurrences exist in the codebase).

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Liming Gao <liming.gao@intel.com>
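For context: the InternalMemCopyMem routine defined below is the
architecture-specific backend behind the public CopyMem API, which carries
memmove semantics in EDK II (overlapping buffers are legal). A rough sketch
of the C wrapper that dispatches to it, modeled from memory on MdePkg's
shared CopyMemWrapper.c and not part of this commit:

    VOID *
    EFIAPI
    CopyMem (
      OUT VOID        *DestinationBuffer,
      IN  CONST VOID  *SourceBuffer,
      IN  UINTN       Length
      )
    {
      if (Length == 0) {
        return DestinationBuffer;
      }
      ASSERT ((Length - 1) <= (MAX_ADDRESS - (UINTN)DestinationBuffer));
      ASSERT ((Length - 1) <= (MAX_ADDRESS - (UINTN)SourceBuffer));

      if (DestinationBuffer == SourceBuffer) {
        return DestinationBuffer;
      }
      // Hands off to the accelerated routine added by this commit.
      return InternalMemCopyMem (DestinationBuffer, SourceBuffer, Length);
    }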
MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S (new file, 284 lines)
@@ -0,0 +1,284 @@
//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//
//
#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l
// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.
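// Concretely: count <= 16 branches to L(copy16); 17..64 is copied inline
// with two or four overlapping 16-byte pairs; 65..96 (bit 6 of count - 1
// set) takes L(copy96); anything larger takes L(copy_long).
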
__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes.  Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
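    // For example, with count==3 tmp1 is 1: src[0] goes to dst[0] via A_lw,
    // src[1] to dst[1] via B_lw, and src[2] to dst[2] via A_hw (srcend - 1).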
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes.  Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
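    // For example, dstin==0x1003 gives tmp1==3 and dst==0x1000: src is
    // rewound by 3 so that [src, 16] corresponds to [dst, 16], and the
    // loop's first store re-covers the last 3 bytes of the unaligned
    // 16-byte store done at dstin.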
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
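    // The extra 16 in the subtraction undoes the over-count from the
    // alignment step, and the 128 accounts for the 64 bytes already
    // loaded plus a full 64-byte tail: e.g. count==200 with tmp1==0
    // becomes 56, so the loop runs once before the tail at 2: finishes.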
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
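    // E.g. with 1 byte left, the stores below still write a full 64 bytes
    // ending exactly at dstend, harmlessly re-covering bytes already copied.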
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret


//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

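    // The ccmp above forces C set (so b.hs is taken) when count <= 96;
    // otherwise it compares dstin - src against count, so __memcpy also
    // handles all non-overlapping and backward-overlapping copies. From
    // here on, count > 96 and dstin lies inside [src, src + count), so
    // the copy must run backwards (the cbz below catches src == dst).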
    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret