diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm index 24313cb4b3..a8744300c6 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem.nasm @@ -34,7 +34,7 @@ ASM_PFX(InternalMemSetMem): mov al, [esp + 16] ; al <- Value xor ecx, ecx sub ecx, edi - and ecx, 15 ; ecx + edi aligns on 16-byte boundary + and ecx, 63 ; ecx + edi aligns on 16-byte boundary jz .0 cmp ecx, edx cmova ecx, edx @@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem): rep stosb .0: mov ecx, edx - and edx, 15 - shr ecx, 4 ; ecx <- # of DQwords to set + and edx, 63 + shr ecx, 6 ; ecx <- # of DQwords to set jz @SetBytes mov ah, al ; ax <- Value | (Value << 8) add esp, -16 @@ -53,7 +53,10 @@ ASM_PFX(InternalMemSetMem): movlhps xmm0, xmm0 ; xmm0 <- Value repeats 16 times .1: movntdq [edi], xmm0 ; edi should be 16-byte aligned - add edi, 16 + movntdq [edi + 16], xmm0 + movntdq [edi + 32], xmm0 + movntdq [edi + 48], xmm0 + add edi, 64 loop .1 mfence movdqu xmm0, [esp] ; restore xmm0 diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm index 6e308b5594..d461ee086c 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem16.nasm @@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16): mov edi, [esp + 8] xor ecx, ecx sub ecx, edi - and ecx, 15 ; ecx + edi aligns on 16-byte boundary + and ecx, 63 ; ecx + edi aligns on 16-byte boundary mov eax, [esp + 16] jz .0 shr ecx, 1 @@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16): rep stosw .0: mov ecx, edx - and edx, 7 - shr ecx, 3 + and edx, 31 + shr ecx, 5 jz @SetWords movd xmm0, eax pshuflw xmm0, xmm0, 0 movlhps xmm0, xmm0 .1: movntdq [edi], xmm0 ; edi should be 16-byte aligned - add edi, 16 + movntdq [edi + 16], xmm0 + movntdq [edi + 32], xmm0 + movntdq [edi + 48], xmm0 + add edi, 64 loop .1 mfence @SetWords: diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm index 2cfc8cb0dd..3ffdcd07d7 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem32.nasm @@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32): rep stosd .0: mov ecx, edx - and edx, 3 - shr ecx, 2 + and edx, 15 + shr ecx, 4 jz @SetDwords movd xmm0, eax pshufd xmm0, xmm0, 0 .1: movntdq [edi], xmm0 - add edi, 16 + movntdq [edi + 16], xmm0 + movntdq [edi + 32], xmm0 + movntdq [edi + 48], xmm0 + add edi, 64 loop .1 mfence @SetDwords: diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm index e153495a68..cd000648ae 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/SetMem64.nasm @@ -38,17 +38,29 @@ ASM_PFX(InternalMemSetMem64): add edx, 8 dec ecx .0: - shr ecx, 1 + push ebx + mov ebx, ecx + and ebx, 7 + shr ecx, 3 jz @SetQwords movlhps xmm0, xmm0 .1: movntdq [edx], xmm0 - lea edx, [edx + 16] + movntdq [edx + 16], xmm0 + movntdq [edx + 32], xmm0 + movntdq [edx + 48], xmm0 + lea edx, [edx + 64] loop .1 mfence @SetQwords: - jnc .2 + test ebx, ebx + jz .3 + mov ecx, ebx +.2 movq qword [edx], xmm0 -.2: + lea edx, [edx + 8] + loop .2 +.3: + pop ebx ret diff --git a/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm index cd34006f59..0e0828551b 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/Ia32/ZeroMem.nasm @@ -33,7 +33,7 @@ ASM_PFX(InternalMemZeroMem): xor ecx, ecx sub ecx, edi xor eax, eax - and ecx, 15 + and ecx, 63 jz .0 cmp ecx, edx cmova ecx, edx @@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem): rep stosb .0: mov ecx, edx - and edx, 15 - shr ecx, 4 + and edx, 63 + shr ecx, 6 jz @ZeroBytes pxor xmm0, xmm0 .1: movntdq [edi], xmm0 - add edi, 16 + movntdq [edi + 16], xmm0 + movntdq [edi + 32], xmm0 + movntdq [edi + 48], xmm0 + add edi, 64 loop .1 mfence @ZeroBytes: diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm index 5bd1c2262d..28b11ee586 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem.nasm @@ -42,8 +42,8 @@ ASM_PFX(InternalMemSetMem): rep stosb .0: mov rcx, rdx - and rdx, 15 - shr rcx, 4 + and rdx, 63 + shr rcx, 6 jz @SetBytes mov ah, al ; ax <- Value repeats twice movdqa [rsp + 0x10], xmm0 ; save xmm0 @@ -52,7 +52,10 @@ ASM_PFX(InternalMemSetMem): movlhps xmm0, xmm0 ; xmm0 <- Value repeats 16 times .1: movntdq [rdi], xmm0 ; rdi should be 16-byte aligned - add rdi, 16 + movntdq [rdi + 16], xmm0 + movntdq [rdi + 32], xmm0 + movntdq [rdi + 48], xmm0 + add rdi, 64 loop .1 mfence movdqa xmm0, [rsp + 0x10] ; restore xmm0 diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm index 90d159820a..375be19313 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem16.nasm @@ -33,7 +33,7 @@ ASM_PFX(InternalMemSetMem16): mov r9, rdi xor rcx, rcx sub rcx, rdi - and rcx, 15 + and rcx, 63 mov rax, r8 jz .0 shr rcx, 1 @@ -43,15 +43,18 @@ ASM_PFX(InternalMemSetMem16): rep stosw .0: mov rcx, rdx - and edx, 7 - shr rcx, 3 + and edx, 31 + shr rcx, 5 jz @SetWords movd xmm0, eax pshuflw xmm0, xmm0, 0 movlhps xmm0, xmm0 .1: movntdq [rdi], xmm0 - add rdi, 16 + movntdq [rdi + 16], xmm0 + movntdq [rdi + 32], xmm0 + movntdq [rdi + 48], xmm0 + add rdi, 64 loop .1 mfence @SetWords: diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm index 928e086889..5d12beaa9a 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem32.nasm @@ -43,14 +43,17 @@ ASM_PFX(InternalMemSetMem32): rep stosd .0: mov rcx, rdx - and edx, 3 - shr rcx, 2 + and edx, 15 + shr rcx, 4 jz @SetDwords movd xmm0, eax pshufd xmm0, xmm0, 0 .1: movntdq [rdi], xmm0 - add rdi, 16 + movntdq [rdi + 16], xmm0 + movntdq [rdi + 32], xmm0 + movntdq [rdi + 48], xmm0 + add rdi, 64 loop .1 mfence @SetDwords: diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm index d771810542..485f74ddac 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/SetMem64.nasm @@ -37,17 +37,28 @@ ASM_PFX(InternalMemSetMem64): add rdx, 8 dec rcx .0: - shr rcx, 1 + push rbx + mov rbx, rcx + and rbx, 7 + shr rcx, 3 jz @SetQwords movlhps xmm0, xmm0 .1: movntdq [rdx], xmm0 - lea rdx, [rdx + 16] + movntdq [rdx + 16], xmm0 + movntdq [rdx + 32], xmm0 + movntdq [rdx + 48], xmm0 + lea rdx, [rdx + 64] loop .1 mfence @SetQwords: - jnc .2 - mov [rdx], r8 + push rdi + mov rcx, rbx + mov rax, r8 + mov rdi, rdx + rep stosq + pop rdi .2: + pop rbx ret diff --git a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm index 5ddcae9ca5..21f504e3b7 100644 --- a/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm +++ b/MdePkg/Library/BaseMemoryLibSse2/X64/ZeroMem.nasm @@ -32,7 +32,7 @@ ASM_PFX(InternalMemZeroMem): xor rcx, rcx xor eax, eax sub rcx, rdi - and rcx, 15 + and rcx, 63 mov r8, rdi jz .0 cmp rcx, rdx @@ -41,13 +41,16 @@ ASM_PFX(InternalMemZeroMem): rep stosb .0: mov rcx, rdx - and edx, 15 - shr rcx, 4 + and edx, 63 + shr rcx, 6 jz @ZeroBytes pxor xmm0, xmm0 .1: - movntdq [rdi], xmm0 ; rdi should be 16-byte aligned - add rdi, 16 + movntdq [rdi], xmm0 + movntdq [rdi + 16], xmm0 + movntdq [rdi + 32], xmm0 + movntdq [rdi + 48], xmm0 + add rdi, 64 loop .1 mfence @ZeroBytes: