This patch adds a bit of optimized assembly to the ipchksum() algorithm for x86 targets in order to take advantage of larger load sizes and the add-with-carry instruction. The same assembly (with one minor manual tweak) works in both 32- and 64-bit mode, with most of the work done by GCC, which automatically substitutes `rax` or `eax` in the inline assembly depending on the build target.

Change-Id: I484620dc14679ff5ca02b2ced2f84650730a6efc
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/80255
Reviewed-by: Arthur Heymans <arthur@aheymans.xyz>
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
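For intuition, here is a portable C sketch of what the add-with-carry loops below compute (illustrative only, not part of the patch; the helper name wide_sum_portable is made up): summing the buffer in 64-bit words and folding every carry-out back into the running total, which is exactly a ones'-complement accumulation.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference version of the wide summation loops below. */
static uint64_t wide_sum_portable(const uint64_t *p8, size_t nwords)
{
	uint64_t wide_sum = 0;
	for (size_t n = 0; n < nwords; n++) {
		uint64_t prev = wide_sum;
		wide_sum += p8[n];
		if (wide_sum < prev)	/* unsigned wraparound == carry out */
			wide_sum++;	/* end-around carry */
	}
	return wide_sum;
}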
/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

#include <commonlib/bsd/ipchksum.h>

/* See RFC 1071 for mathematical explanations of why we can first sum in a larger register and
   then narrow down, why we don't need to worry about endianness, etc. */
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;

#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;
	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
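		/*
		 * Each ldp pulls in a 16-byte pair, and the adcs chain folds
		 * every carry-out back into the running sum, matching the
		 * ones'-complement addition the checksum requires.
		 */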
		asm (
			"adds xzr, xzr, xzr\n\t"	/* init carry flag for addition */
			"1:\n\t"
			"ldp %[v1], %[v2], [%[p8]], #16\n\t"
			"adcs %[wsum], %[wsum], %[v1]\n\t"
			"adcs %[wsum], %[wsum], %[v2]\n\t"
			"sub %[size16], %[size16], #1\n\t"
			"cbnz %[size16], 1b\n\t"
			"adcs %[wsum], %[wsum], xzr\n\t"	/* use up last carry */
			: [v1] "=r" (tmp1),
			  [v2] "=r" (tmp2),
			  [wsum] "+r" (wide_sum),
			  [p8] "+r" (p8),
			  [size16] "+r" (size16)
			:: "cc"
		);
	}
#elif defined(__i386__) || defined(__x86_64__)
	size_t size8 = size / 8;
	const uint64_t *p8 = data;
	i = size8 * 8;
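	/*
	 * One adc per 8-byte chunk in 64-bit mode; in 32-bit mode a second
	 * adc picks up the upper half of each chunk. The carry flag is
	 * threaded through the whole loop and flushed at the end via setc.
	 */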
	asm (
		"clc\n\t"
		"1:\n\t"
		"jecxz 2f\n\t"	/* technically RCX on 64, but not gonna be that big */
		"adc (%[p8]), %[wsum]\n\t"
#if defined(__i386__)
		"adc 4(%[p8]), %[wsum]\n\t"
#endif /* __i386__ */
		"lea -1(%[size8]), %[size8]\n\t"	/* Use LEA as a makeshift ADD that */
		"lea 8(%[p8]), %[p8]\n\t"		/* doesn't modify the carry flag. */
		"jmp 1b\n\t"
		"2:\n\t"
		"setc %b[size8]\n\t"	/* reuse size register to save last carry */
		"add %[size8], %[wsum]\n\t"
		: [wsum] "+r" (wide_sum),
		  [p8] "+r" (p8),
		  [size8] "+c" (size8)	/* put size in ECX so we can JECXZ */
		:: "cc"
	);
#endif /* __i386__ || __x86_64__ */

	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}
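
	/*
	 * e.g. wide_sum == 0x2FFFF: the loop above adds 0xFFFF and 0x0002
	 * into sum (0x10001), and the fold below reduces that to
	 * 0x0001 + 0x0001 = 0x0002, the end-around-carry result.
	 */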
	sum = (sum & 0xFFFF) + (sum >> 16);

	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;
		sum += v;

		/* Doing this unconditionally seems to be faster. */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}

uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	first = ~first;
	second = ~second;

	/*
	 * Since the checksum is calculated in 16-bit chunks, if the offset at which
	 * the data covered by the second checksum would start (if both data streams
	 * came one after the other) is odd, that means the second stream starts in
	 * the middle of a 16-bit chunk. This means the second checksum is byte
	 * swapped compared to what we need it to be, and we must swap it back.
	 */
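	/* e.g. a byte at an even index within the second stream lands at an
	   odd index in the combined stream when offset is odd, so a second
	   checksum of 0xAABB must be folded in as 0xBBAA. */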
	if (offset % 2)
		second = (second >> 8) | (second << 8);

	uint32_t sum = first + second;
	sum = (sum & 0xFFFF) + (sum >> 16);

	return (uint16_t)~sum;
}
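
A hypothetical usage sketch (not part of the file): checksumming a buffer in two pieces and stitching the halves together with ipchksum_add() should match a single pass over the whole buffer. The function check_split() and its arguments are made up for illustration.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <commonlib/bsd/ipchksum.h>

static void check_split(const void *buf, size_t len, size_t split)
{
	const uint8_t *p = buf;
	uint16_t whole = ipchksum(p, len);
	uint16_t head = ipchksum(p, split);
	uint16_t tail = ipchksum(p + split, len - split);

	/* Combining the piecewise checksums must reproduce the full one. */
	assert(ipchksum_add(split, head, tail) == whole);
}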