/*
* include/asm-x86_64/xor.h
*
* Optimized RAID-5 checksumming functions for MMX and SSE.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* You should have received a copy of the GNU General Public License
* (for example /usr/src/linux/COPYING); if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
*/
/*
* Based on
* High-speed RAID5 checksumming functions utilizing SSE instructions.
* Copyright (C) 1998 Ingo Molnar.
*/
/*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
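/* One 16-byte-aligned slot, large enough to hold a single XMM register. */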
typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
/* The XMM registers are saved by hand rather than through gcc, because
   there is no easy way to make gcc issue the clts (which clears CR0.TS,
   so that the SSE instructions below cannot fault) before the register
   saving. */
#define XMMS_SAVE \
asm volatile ( \
"movq %%cr0,%0 ;\n\t" \
"clts ;\n\t" \
"movups %%xmm0,(%1) ;\n\t" \
"movups %%xmm1,0x10(%1) ;\n\t" \
"movups %%xmm2,0x20(%1) ;\n\t" \
"movups %%xmm3,0x30(%1) ;\n\t" \
: "=&r" (cr0) \
: "r" (xmm_save) \
: "memory")
#define XMMS_RESTORE \
asm volatile ( \
"sfence ;\n\t" \
"movups (%1),%%xmm0 ;\n\t" \
"movups 0x10(%1),%%xmm1 ;\n\t" \
"movups 0x20(%1),%%xmm2 ;\n\t" \
"movups 0x30(%1),%%xmm3 ;\n\t" \
"movq %0,%%cr0 ;\n\t" \
: \
: "r" (cr0), "r" (xmm_save) \
: "memory")
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "320+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x,y) " movntdq %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
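/*
 * XOR p2 into p1, 128 bytes per loop iteration, using %xmm0-%xmm3.
 * bytes is assumed to be a non-zero multiple of 128 and the buffers
 * 16-byte aligned (movaps/movntdq fault on unaligned addresses).
 */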
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned int lines = bytes >> 7;
unsigned long cr0;
xmm_store_t xmm_save[4];
XMMS_SAVE;
asm volatile (
#undef BLOCK
#define BLOCK(i) \
LD(i,0) \
LD(i+1,1) \
PF1(i) \
LD(i+2,2) \
LD(i+3,3) \
PF0(i+4) \
XO1(i,0) \
XO1(i+1,1) \
ST(i,0) \
ST(i+1,1) \
XO1(i+2,2) \
XO1(i+3,3) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
" .p2align 4 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
" decl %[cnt]\n"
" leaq 128(%[p1]),%[p1]\n"
" leaq 128(%[p2]),%[p2]\n"
" jnz 1b\n"
: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
:
: "memory");
XMMS_RESTORE;
}
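/* Like xor_sse_2, but XORs two source buffers (p2, p3) into p1. */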
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned int lines = bytes >> 7;
xmm_store_t xmm_save[4];
unsigned long cr0;
XMMS_SAVE;
asm volatile (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
LD(i,0) \
LD(i+1,1) \
XO1(i,0) \
XO1(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
PF0(i+4) \
XO1(i+2,2) \
XO1(i+3,3) \
XO2(i,0) \
XO2(i+1,1) \
ST(i,0) \
ST(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
" .p2align 4 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
" decl %[cnt]\n"
" leaq 128(%[p1]),%[p1]\n"
" leaq 128(%[p2]),%[p2]\n"
" leaq 128(%[p3]),%[p3]\n"
" jnz 1b"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
:
: "memory");
XMMS_RESTORE;
}
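/* Like xor_sse_2, but XORs three source buffers (p2..p4) into p1. */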
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned int lines = bytes >> 7;
xmm_store_t xmm_save[4];
unsigned long cr0;
XMMS_SAVE;
asm volatile (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
LD(i,0) \
LD(i+1,1) \
XO1(i,0) \
XO1(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
PF0(i+4) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
XO3(i,0) \
XO3(i+1,1) \
ST(i,0) \
ST(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
" .align 32 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
" decl %[cnt]\n"
" leaq 128(%[p1]),%[p1]\n"
" leaq 128(%[p2]),%[p2]\n"
" leaq 128(%[p3]),%[p3]\n"
" leaq 128(%[p4]),%[p4]\n"
" jnz 1b"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
:
: "memory" );
XMMS_RESTORE;
}
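/* Like xor_sse_2, but XORs four source buffers (p2..p5) into p1. */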
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned int lines = bytes >> 7;
xmm_store_t xmm_save[4];
unsigned long cr0;
XMMS_SAVE;
asm volatile (
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
LD(i,0) \
LD(i+1,1) \
XO1(i,0) \
XO1(i+1,1) \
LD(i+2,2) \
LD(i+3,3) \
PF2(i) \
XO1(i+2,2) \
XO1(i+3,3) \
PF3(i) \
XO2(i,0) \
XO2(i+1,1) \
XO2(i+2,2) \
XO2(i+3,3) \
PF4(i) \
PF0(i+4) \
XO3(i,0) \
XO3(i+1,1) \
XO3(i+2,2) \
XO3(i+3,3) \
XO4(i,0) \
XO4(i+1,1) \
ST(i,0) \
ST(i+1,1) \
XO4(i+2,2) \
XO4(i+3,3) \
ST(i+2,2) \
ST(i+3,3) \
PF0(0)
" .p2align 4 ;\n"
" 1: ;\n"
BLOCK(0)
BLOCK(4)
" decl %[cnt]\n"
" leaq 128(%[p1]),%[p1]\n"
" leaq 128(%[p2]),%[p2]\n"
" leaq 128(%[p3]),%[p3]\n"
" leaq 128(%[p4]),%[p4]\n"
" leaq 128(%[p5]),%[p5]\n"
" jnz 1b"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
[p5] "+r" (p5)
:
: "memory");
XMMS_RESTORE;
}
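/*
 * STORE_NTI(x, mem) stores x into mem with a non-temporal movnti, so the
 * freshly computed checksum does not displace useful cache lines.  gcc 3.3
 * and later provide a builtin for this; older compilers get inline asm.
 */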
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem) asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif
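/*
 * Integer-register fallback: XOR p2 into p1, 64 bytes (eight longs) per
 * loop iteration, prefetching ahead of the working position and writing
 * the results back with non-temporal stores.  bytes is assumed to be a
 * non-zero multiple of 64.
 */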
static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
__builtin_prefetch(p1 + 5*64, 0, 0);
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
__builtin_prefetch(p2 + 5*64, 0, 0);
STORE_NTI(d0, p1[0]);
STORE_NTI(d1, p1[1]);
STORE_NTI(d2, p1[2]);
STORE_NTI(d3, p1[3]);
STORE_NTI(d4, p1[4]);
STORE_NTI(d5, p1[5]);
STORE_NTI(d6, p1[6]);
STORE_NTI(d7, p1[7]);
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
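/* Like xor_64regs_stream_2, but XORs two source buffers (p2, p3) into p1. */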
static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
__builtin_prefetch(p1 + 5*64, 0, 0);
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
__builtin_prefetch(p2 + 5*64, 0, 0);
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
__builtin_prefetch(p3 + 5*64, 0, 0);
STORE_NTI(d0, p1[0]);
STORE_NTI(d1, p1[1]);
STORE_NTI(d2, p1[2]);
STORE_NTI(d3, p1[3]);
STORE_NTI(d4, p1[4]);
STORE_NTI(d5, p1[5]);
STORE_NTI(d6, p1[6]);
STORE_NTI(d7, p1[7]);
p1 += 8;
p2 += 8;
p3 += 8;
} while (--lines > 0);
}
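/* Like xor_64regs_stream_2, but XORs three source buffers (p2..p4) into p1. */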
static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
__builtin_prefetch(p1 + 5*64, 0, 0);
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
__builtin_prefetch(p2 + 5*64, 0, 0);
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
__builtin_prefetch(p3 + 5*64, 0, 0);
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
__builtin_prefetch(p4 + 5*64, 0, 0);
STORE_NTI(d0, p1[0]);
STORE_NTI(d1, p1[1]);
STORE_NTI(d2, p1[2]);
STORE_NTI(d3, p1[3]);
STORE_NTI(d4, p1[4]);
STORE_NTI(d5, p1[5]);
STORE_NTI(d6, p1[6]);
STORE_NTI(d7, p1[7]);
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
} while (--lines > 0);
}
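/* Like xor_64regs_stream_2, but XORs four source buffers (p2..p5) into p1. */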
static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
long lines = bytes / (sizeof (long)) / 8;
do {
register long d0, d1, d2, d3, d4, d5, d6, d7;
d0 = p1[0]; /* Pull the stuff into registers */
d1 = p1[1]; /* ... in bursts, if possible. */
d2 = p1[2];
d3 = p1[3];
d4 = p1[4];
d5 = p1[5];
d6 = p1[6];
d7 = p1[7];
__builtin_prefetch(p1 + 5*64, 0, 0);
d0 ^= p2[0];
d1 ^= p2[1];
d2 ^= p2[2];
d3 ^= p2[3];
d4 ^= p2[4];
d5 ^= p2[5];
d6 ^= p2[6];
d7 ^= p2[7];
__builtin_prefetch(p2 + 5*64, 0, 0);
d0 ^= p3[0];
d1 ^= p3[1];
d2 ^= p3[2];
d3 ^= p3[3];
d4 ^= p3[4];
d5 ^= p3[5];
d6 ^= p3[6];
d7 ^= p3[7];
__builtin_prefetch(p3 + 5*64, 0, 0);
d0 ^= p4[0];
d1 ^= p4[1];
d2 ^= p4[2];
d3 ^= p4[3];
d4 ^= p4[4];
d5 ^= p4[5];
d6 ^= p4[6];
d7 ^= p4[7];
__builtin_prefetch(p4 + 5*64, 0, 0);
d0 ^= p5[0];
d1 ^= p5[1];
d2 ^= p5[2];
d3 ^= p5[3];
d4 ^= p5[4];
d5 ^= p5[5];
d6 ^= p5[6];
d7 ^= p5[7];
__builtin_prefetch(p5 + 5*64, 0, 0);
STORE_NTI(d0, p1[0]);
STORE_NTI(d1, p1[1]);
STORE_NTI(d2, p1[2]);
STORE_NTI(d3, p1[3]);
STORE_NTI(d4, p1[4]);
STORE_NTI(d5, p1[5]);
STORE_NTI(d6, p1[6]);
STORE_NTI(d7, p1[7]);
p1 += 8;
p2 += 8;
p3 += 8;
p4 += 8;
p5 += 8;
} while (--lines > 0);
}
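/*
 * Templates that plug the routines above into the generic RAID xor
 * framework; xor_speed() benchmarks each of them and the fastest one
 * is used for checksumming.
 */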
static struct xor_block_template xor_block_sse = {
	.name = "128byte sse streaming",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
static struct xor_block_template xor_block_64regs_stream = {
	.name = "64byte int streaming",
	.do_2 = xor_64regs_stream_2,
	.do_3 = xor_64regs_stream_3,
	.do_4 = xor_64regs_stream_4,
	.do_5 = xor_64regs_stream_5,
};
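/*
 * Illustrative only: the RAID code normally reaches these routines through
 * the generic xor template framework rather than calling them directly,
 * but a direct call would look like this (dst and src are hypothetical
 * buffers, 16-byte aligned, length a non-zero multiple of 128):
 *
 *	unsigned long *dst, *src;
 *	...
 *	xor_block_sse.do_2(PAGE_SIZE, dst, src);	// dst[i] ^= src[i]
 */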
/* AK: the speed test is useless: it only measures the cache-hot case. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_sse); \
xor_speed(&xor_block_64regs_stream); \
} while (0)
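/* Simply use whichever template the benchmark reported as fastest. */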
#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)