/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
#define FPU_SAVE \
        do { \
                if (!(current->flags & PF_USEDFPU)) \
                        __asm__ __volatile__ (" clts;\n"); \
                __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \
        } while (0)

#define FPU_RESTORE \
        do { \
                __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \
                if (!(current->flags & PF_USEDFPU)) \
                        __asm__ __volatile__ ("stts;\n"); \
        } while (0)
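
/*
 * Why the save/restore dance: the MMX registers alias the x87 FPU
 * register stack, so kernel-mode MMX use would otherwise corrupt the
 * current task's FPU state.  The PF_USEDFPU check decides whether the
 * TS bit in %cr0 must be cleared first (clts) and set again afterwards
 * (stts), keeping lazy FPU context switching intact.
 */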
#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
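
/*
 * Example: LD(0,0) expands to " movq 8*(0)(%1), %%mm0 ;\n" and
 * XO1(0,0) to " pxor 8*(0)(%2), %%mm0 ;\n".  Operand %1 is the
 * destination buffer p1 (which is also the first source), %2..%5 are
 * the remaining sources, and x picks one of the eight quadwords of the
 * 64-byte chunk held in %%mm0..%%mm7.
 */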
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        __asm__ __volatile__ (
        /* ... */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : : "memory");
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        : : "memory");
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        /* need to save/restore p4/p5 manually, otherwise gcc's
           10-argument limit gets exceeded (a "+" read-write operand
           counts as two arguments) */
        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");
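
/*
 * For reference: a plain-C sketch (not from the original file) of the
 * operation all of these routines implement, shown for the two-source
 * case.  The MMX and SSE versions are unrolled, register-blocked
 * equivalents of this loop and assume bytes is a multiple of the
 * block size.
 */
static inline void
xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long i;

        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];         /* p1 accumulates the parity */
}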
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        __asm__ __volatile__ (
        " .align 32,0x90 ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"
        /* ... */
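
/*
 * Note the instruction ordering above: loads, pxors and stores are
 * interleaved so that adjacent instructions touch different %%mm
 * registers.  The p5 variants are hand-scheduled this way for the
 * original Pentium's dual in-order (U/V) pipelines and work in 64-byte
 * lines, whereas the pII variants rely on out-of-order execution and
 * process 128 bytes per loop iteration.
 */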
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        __asm__ __volatile__ (
        " .align 32,0x90 ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : : "memory");
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        __asm__ __volatile__ (
        " .align 32,0x90 ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        : : "memory");
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        /* need to save p4/p5 manually to not exceed gcc's 10-argument limit */
        __asm__ __volatile__ (
        /* ... */
        " .align 32,0x90 ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor (%5), %%mm0 ;\n"
        " pxor 8(%5), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%5), %%mm2 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%5), %%mm3 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 32(%5), %%mm4 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " pxor 40(%5), %%mm5 ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%5), %%mm6 ;\n"
        " pxor 56(%5), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");
static struct xor_block_template xor_block_pII_mmx = {
        /* ... */
};

static struct xor_block_template xor_block_p5_mmx = {
        /* ... */
};
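
/*
 * The elided initializers bind the routines above to the benchmarking
 * framework.  A sketch of their shape (field names per the 2.4-era
 * struct xor_block_template in <linux/raid/xor.h>; the name string is
 * illustrative):
 *
 *      static struct xor_block_template xor_block_pII_mmx = {
 *              name: "pII_mmx",
 *              do_2: xor_pII_mmx_2,
 *              do_3: xor_pII_mmx_3,
 *              do_4: xor_pII_mmx_4,
 *              do_5: xor_pII_mmx_5,
 *      };
 */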
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
#define XMMS_SAVE \
        __asm__ __volatile__ ( \
                "movl %%cr0,%0 ;\n\t" \
                "clts ;\n\t" \
                "movups %%xmm0,(%1) ;\n\t" \
                "movups %%xmm1,0x10(%1) ;\n\t" \
                "movups %%xmm2,0x20(%1) ;\n\t" \
                "movups %%xmm3,0x30(%1) ;\n\t" \
                : "=&r" (cr0) \
                : "r" (xmm_save) \
                : "memory")
#define XMMS_RESTORE \
        __asm__ __volatile__ ( \
                "sfence ;\n\t" \
                "movups (%1),%%xmm0 ;\n\t" \
                "movups 0x10(%1),%%xmm1 ;\n\t" \
                "movups 0x20(%1),%%xmm2 ;\n\t" \
                "movups 0x30(%1),%%xmm3 ;\n\t" \
                "movl %0,%%cr0 ;\n\t" \
                : \
                : "r" (cr0), "r" (xmm_save) \
                : "memory")
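
/*
 * Unlike the MMX paths, the SSE paths leave the x87 state alone, so a
 * full fsave/frstor is unnecessary: only %xmm0-%xmm3 are used, and they
 * are spilled to a 16-byte-aligned stack buffer instead.  %cr0 is read
 * before clts and written back afterwards so that the task's TS bit
 * (and with it lazy FPU context switching) is preserved.
 */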
#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
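
/*
 * Example: PF0(0) expands to " prefetchnta 256+16*(0)(%1) ;\n".  Each
 * prefetch runs 256 bytes (one full loop iteration) ahead of the
 * current 16-byte-per-register working set, and prefetchnta pulls the
 * line in as non-temporal data to avoid polluting the outer caches.
 */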
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
        /* ... */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : : "memory");
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        : : "memory");
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10-argument limit */
        __asm__ __volatile__ (
        /* ... */
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");
static struct xor_block_template xor_block_pIII_sse = {
        /* ... */
};

/* Also try the generic routines. */
#include <asm-generic/xor.h>
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
        do { \
                xor_speed(&xor_block_8regs); \
                xor_speed(&xor_block_32regs); \
                if (cpu_has_xmm) \
                        xor_speed(&xor_block_pIII_sse); \
                if (md_cpu_has_mmx()) { \
                        xor_speed(&xor_block_pII_mmx); \
                        xor_speed(&xor_block_p5_mmx); \
                } \
        } while (0)
/* We force the use of the SSE xor block because it can write around
   the L2 cache.  We may also be able to load into the L1 cache only,
   depending on how the CPU deals with a load to a line that is being
   prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
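
/*
 * How this fits together (a sketch, not part of this header): the
 * generic xor calibration code expands XOR_TRY_TEMPLATES to benchmark
 * every template the CPU supports via xor_speed(), picks the highest
 * ->speed, and then applies XOR_SELECT_TEMPLATE to the winner.
 * Roughly:
 *
 *      struct xor_block_template *fastest = NULL;
 *
 *      XOR_TRY_TEMPLATES;
 *      fastest = ...;                          // highest ->speed seen
 *      fastest = XOR_SELECT_TEMPLATE(fastest);
 *      fastest->do_2(bytes, dest, src);        // dest ^= src
 */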