/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 *
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 *
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */
/* One 16-byte, 16-byte-aligned slot; an xmm_store_t[4] array provides the
   scratch space needed to spill xmm0-xmm3. */
typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;
/* We don't let gcc save the XMM registers itself, because there is no easy
   way to tell it to execute clts before the register saving. */
39 "movq %%cr0,%0 ;\n\t" \
41 "movups %%xmm0,(%1) ;\n\t" \
42 "movups %%xmm1,0x10(%1) ;\n\t" \
43 "movups %%xmm2,0x20(%1) ;\n\t" \
44 "movups %%xmm3,0x30(%1) ;\n\t" \
#define XMMS_RESTORE \
	asm volatile ( \
		"sfence			;\n\t" \
		"movups (%1),%%xmm0	;\n\t" \
		"movups 0x10(%1),%%xmm1	;\n\t" \
		"movups 0x20(%1),%%xmm2	;\n\t" \
		"movups 0x30(%1),%%xmm3	;\n\t" \
		"movq %0,%%cr0		;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory")
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "320+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x,y) " movntdq %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
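/*
 * A minimal sketch (guarded out, for illustration only) of how the string
 * macros above compose: one 64-byte step of a two-source XOR loads four
 * 16-byte chunks of p1, xors in the matching chunks of p2, and streams the
 * result back to p1 with non-temporal stores.  The real loops below unroll
 * two such steps per 128-byte line and interleave the PF0..PF5 prefetches:
 */
#if 0
	asm volatile (
		LD(0,0) LD(1,1) LD(2,2) LD(3,3)		/* xmm0-3 = p1[0..63]  */
		XO1(0,0) XO1(1,1) XO1(2,2) XO1(3,3)	/* xmm0-3 ^= p2[0..63] */
		ST(0,0) ST(1,1) ST(2,2) ST(3,3)		/* p1[0..63] = xmm0-3  */
		: : [p1] "r" (p1), [p2] "r" (p2)
		: "memory");	/* xmm0-3 are covered by XMMS_SAVE/RESTORE */
#endif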
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 7;	/* 128 bytes per iteration */
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;
	__asm__ __volatile__ (
	/* ... "1:" label and unrolled PF0/PF1/LD/XO1/ST blocks ... */
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	/* ... decrement [cnt], jnz back to "1:" ... */
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: : "memory");
	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 7;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;
	__asm__ __volatile__ (
	/* ... unrolled PF/LD/XO1/XO2/ST blocks ... */
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: : "memory");
	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 7;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;
	__asm__ __volatile__ (
	/* ... unrolled PF/LD/XO1/XO2/XO3/ST blocks ... */
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	" leaq 128(%[p4]),%[p4]\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: : "memory");
	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 7;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;
	__asm__ __volatile__ (
	/* ... unrolled PF/LD/XO1..XO4/ST blocks ... */
	" leaq 128(%[p1]),%[p1]\n"
	" leaq 128(%[p2]),%[p2]\n"
	" leaq 128(%[p3]),%[p3]\n"
	" leaq 128(%[p4]),%[p4]\n"
	" leaq 128(%[p5]),%[p5]\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: : "memory");
	XMMS_RESTORE;
}
/* gcc 3.3+ provides a builtin for the movnti non-temporal store;
   fall back to inline assembly on older compilers. */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem) asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif
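/*
 * Either way, STORE_NTI(x, mem) performs "mem = x" with a non-temporal
 * hint, so storing the checksum result does not evict useful data from
 * the cache.  Illustrative use (as in the functions below):
 *
 *	STORE_NTI(d0, p1[0]);	// p1[0] = d0, bypassing the cache
 */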
static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	long lines = bytes / sizeof(long) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/* ... in bursts, if possible.		*/
		d2 = p1[2];
		d3 = p1[3];
		d4 = p1[4];
		d5 = p1[5];
		d6 = p1[6];
		d7 = p1[7];
		__builtin_prefetch(p1 + 5*64, 0, 0);
		d0 ^= p2[0];
		d1 ^= p2[1];
		d2 ^= p2[2];
		d3 ^= p2[3];
		d4 ^= p2[4];
		d5 ^= p2[5];
		d6 ^= p2[6];
		d7 ^= p2[7];
		__builtin_prefetch(p2 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
	} while (--lines > 0);
}

static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3)
{
	long lines = bytes / sizeof(long) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/* ... in bursts, if possible.		*/
		/* ... d2..d7 = p1[2..7], as in xor_64regs_stream_2 ... */
		__builtin_prefetch(p1 + 5*64, 0, 0);
		/* ... d0..d7 ^= p2[0..7] ... */
		__builtin_prefetch(p2 + 5*64, 0, 0);
		/* ... d0..d7 ^= p3[0..7] ... */
		__builtin_prefetch(p3 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
	} while (--lines > 0);
}
static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3, unsigned long *p4)
{
	long lines = bytes / sizeof(long) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/* ... in bursts, if possible.		*/
		/* ... d2..d7 = p1[2..7], as in xor_64regs_stream_2 ... */
		__builtin_prefetch(p1 + 5*64, 0, 0);
		/* ... d0..d7 ^= p2[0..7] ... */
		__builtin_prefetch(p2 + 5*64, 0, 0);
		/* ... d0..d7 ^= p3[0..7] ... */
		__builtin_prefetch(p3 + 5*64, 0, 0);
		/* ... d0..d7 ^= p4[0..7] ... */
		__builtin_prefetch(p4 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
	} while (--lines > 0);
}
static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
		    unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	long lines = bytes / sizeof(long) / 8;

	do {
		register long d0, d1, d2, d3, d4, d5, d6, d7;
		d0 = p1[0];	/* Pull the stuff into registers	*/
		d1 = p1[1];	/* ... in bursts, if possible.		*/
		/* ... d2..d7 = p1[2..7], as in xor_64regs_stream_2 ... */
		__builtin_prefetch(p1 + 5*64, 0, 0);
		/* ... d0..d7 ^= p2[0..7] ... */
		__builtin_prefetch(p2 + 5*64, 0, 0);
		/* ... d0..d7 ^= p3[0..7] ... */
		__builtin_prefetch(p3 + 5*64, 0, 0);
		/* ... d0..d7 ^= p4[0..7] ... */
		__builtin_prefetch(p4 + 5*64, 0, 0);
		/* ... d0..d7 ^= p5[0..7] ... */
		__builtin_prefetch(p5 + 5*64, 0, 0);
		STORE_NTI(d0, p1[0]);
		STORE_NTI(d1, p1[1]);
		STORE_NTI(d2, p1[2]);
		STORE_NTI(d3, p1[3]);
		STORE_NTI(d4, p1[4]);
		STORE_NTI(d5, p1[5]);
		STORE_NTI(d6, p1[6]);
		STORE_NTI(d7, p1[7]);
		p1 += 8;
		p2 += 8;
		p3 += 8;
		p4 += 8;
		p5 += 8;
	} while (--lines > 0);
}
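/*
 * For reference, a plain-C sketch (illustrative only, not part of the
 * kernel interface) of the two-source operation that every routine in
 * this file implements: p1 ^= p2, one machine word at a time.  The SSE
 * and integer-register variants above differ only in unrolling,
 * prefetching, and the cache-bypassing stores.
 */
#if 0
static void
xor_2_reference(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(long); i++)
		p1[i] ^= p2[i];
}
#endif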
static struct xor_block_template xor_block_sse = {
	.name = "128byte sse streaming",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

static struct xor_block_template xor_block_64regs_stream = {
	.name = "64byte int streaming",
	.do_2 = xor_64regs_stream_2,
	.do_3 = xor_64regs_stream_3,
	.do_4 = xor_64regs_stream_4,
	.do_5 = xor_64regs_stream_5,
};
/* AK: the speed test is of little value: it only measures the cache-hot
   case, which the non-temporal functions above deliberately do not
   optimize for. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_sse);		\
		xor_speed(&xor_block_64regs_stream);	\
	} while (0)

#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
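/*
 * Selection flow (assuming the generic driver of this era,
 * drivers/md/xor.c): calibrate_xor_block() expands XOR_TRY_TEMPLATES to
 * benchmark each template via xor_speed(), then filters the winner
 * through XOR_SELECT_TEMPLATE().  Here that macro is the identity, so
 * whichever template benchmarks fastest is used as-is.
 */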