/*
 * Import of upstream Linux 2.4.34.4 from kernel.org:
 * [linux-2.4.git] / include / asm-x86_64 / xor.h
 */
1 /*
2  * include/asm-x86_64/xor.h
3  *
4  * Optimized RAID-5 checksumming functions for MMX and SSE.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2, or (at your option)
9  * any later version.
10  *
11  * You should have received a copy of the GNU General Public License
12  * (for example /usr/src/linux/COPYING); if not, write to the Free
13  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14  */
15
16
17 /*
18  * Cache avoiding checksumming functions utilizing KNI instructions
19  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
20  */
21
22 /*
23  * Based on
24  * High-speed RAID5 checksumming functions utilizing SSE instructions.
25  * Copyright (C) 1998 Ingo Molnar.
26  */
27
28 /* 
29  * x86-64 changes / gcc fixes from Andi Kleen. 
30  * Copyright 2002 Andi Kleen, SuSE Labs.
31  */
32
33 typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
34
35 /* Doesn't use gcc to save the XMM registers, because there is no easy way to 
36    tell it to do a clts before the register saving. */
37 #define XMMS_SAVE                               \
38         asm volatile (                  \
39                 "movq %%cr0,%0          ;\n\t"  \
40                 "clts                   ;\n\t"  \
41                 "movups %%xmm0,(%1)     ;\n\t"  \
42                 "movups %%xmm1,0x10(%1) ;\n\t"  \
43                 "movups %%xmm2,0x20(%1) ;\n\t"  \
44                 "movups %%xmm3,0x30(%1) ;\n\t"  \
45                 : "=&r" (cr0)                   \
46                 : "r" (xmm_save)                \
47                 : "memory")
48
49 #define XMMS_RESTORE                            \
50         asm volatile (                  \
51                 "sfence                 ;\n\t"  \
52                 "movups (%1),%%xmm0     ;\n\t"  \
53                 "movups 0x10(%1),%%xmm1 ;\n\t"  \
54                 "movups 0x20(%1),%%xmm2 ;\n\t"  \
55                 "movups 0x30(%1),%%xmm3 ;\n\t"  \
56                 "movq   %0,%%cr0        ;\n\t"  \
57                 :                               \
58                 : "r" (cr0), "r" (xmm_save)     \
59                 : "memory")
60
61 #define OFFS(x)         "16*("#x")"
62 #define PF_OFFS(x)      "320+16*("#x")"
63 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
64 #define LD(x,y)         "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
65 #define ST(x,y)         "       movntdq %%xmm"#y",   "OFFS(x)"(%[p1])   ;\n"
66 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
67 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
68 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
69 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
70 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
71 #define XO1(x,y)        "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
72 #define XO2(x,y)        "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
73 #define XO3(x,y)        "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
74 #define XO4(x,y)        "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
75 #define XO5(x,y)        "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
76
77 static void
78 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
79 {
80         unsigned int lines = bytes >> 7;
81         unsigned long cr0;
82         xmm_store_t xmm_save[4];
83
84         XMMS_SAVE;
85
86         asm volatile (
87 #undef BLOCK
88 #define BLOCK(i) \
89                 LD(i,0)                                 \
90                         LD(i+1,1)                       \
91                 PF1(i)                                  \
92                                 LD(i+2,2)               \
93                                         LD(i+3,3)       \
94                 PF0(i+4)                                \
95                 XO1(i,0)                                \
96                         XO1(i+1,1)                      \
97                 ST(i,0)                                 \
98                         ST(i+1,1)                       \
99                                 XO1(i+2,2)              \
100                                         XO1(i+3,3)      \
101                                 ST(i+2,2)               \
102                                         ST(i+3,3)       \
103
104
105                 PF0(0)
106
107         " .p2align 4                    ;\n"
108         " 1:                            ;\n"
109
110                 BLOCK(0)
111                 BLOCK(4)
112
113         "       decl %[cnt]\n"
114         "       leaq 128(%[p1]),%[p1]\n"
115         "       leaq 128(%[p2]),%[p2]\n"
116         "       jnz 1b\n"
117         : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
118         :
119         : "memory");
120
121         XMMS_RESTORE;
122 }
123
124 static void
125 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
126           unsigned long *p3)
127 {
128         unsigned int lines = bytes >> 7;
129         xmm_store_t xmm_save[4];
130         unsigned long cr0;
131
132         XMMS_SAVE;
133
134         __asm__ __volatile__ (
135 #undef BLOCK
136 #define BLOCK(i) \
137                 PF1(i)                                  \
138                 LD(i,0)                                 \
139                         LD(i+1,1)                       \
140                 XO1(i,0)                                \
141                         XO1(i+1,1)                      \
142                                 LD(i+2,2)               \
143                                         LD(i+3,3)       \
144                 PF2(i)                                  \
145                 PF0(i+4)                                \
146                                 XO1(i+2,2)              \
147                                         XO1(i+3,3)      \
148                 XO2(i,0)                                \
149                         XO2(i+1,1)                      \
150                 ST(i,0)                                 \
151                         ST(i+1,1)                       \
152                                 XO2(i+2,2)              \
153                                         XO2(i+3,3)      \
154                                 ST(i+2,2)               \
155                                         ST(i+3,3)       \
156
157
158                 PF0(0)
159
160         " .p2align 4                    ;\n"
161         " 1:                            ;\n"
162
163                 BLOCK(0)
164                 BLOCK(4)
165
166         "       decl %[cnt]\n"  
167         "       leaq 128(%[p1]),%[p1]\n" 
168         "       leaq 128(%[p2]),%[p2]\n" 
169         "       leaq 128(%[p3]),%[p3]\n" 
170         "       jnz  1b"
171         : [cnt] "+r" (lines),
172           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
173         :
174         : "memory"); 
175         XMMS_RESTORE;
176 }
177
178 static void
179 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180           unsigned long *p3, unsigned long *p4)
181 {
182         unsigned int lines = bytes >> 7;
183         xmm_store_t xmm_save[4]; 
184         unsigned long cr0;
185
186         XMMS_SAVE;
187
188         __asm__ __volatile__ (
189 #undef BLOCK
190 #define BLOCK(i) \
191                 PF1(i)                                  \
192                 LD(i,0)                                 \
193                         LD(i+1,1)                       \
194                 XO1(i,0)                                \
195                         XO1(i+1,1)                      \
196                                 LD(i+2,2)               \
197                                         LD(i+3,3)       \
198                 PF2(i)                                  \
199                                 XO1(i+2,2)              \
200                                         XO1(i+3,3)      \
201                 PF3(i)                                  \
202                 PF0(i+4)                                \
203                 XO2(i,0)                                \
204                         XO2(i+1,1)                      \
205                                 XO2(i+2,2)              \
206                                         XO2(i+3,3)      \
207                 XO3(i,0)                                \
208                         XO3(i+1,1)                      \
209                 ST(i,0)                                 \
210                         ST(i+1,1)                       \
211                                 XO3(i+2,2)              \
212                                         XO3(i+3,3)      \
213                                 ST(i+2,2)               \
214                                         ST(i+3,3)       \
215
216
217                 PF0(0)
218
219         " .align 32                     ;\n"
220         " 1:                            ;\n"
221
222                 BLOCK(0)
223                 BLOCK(4)
224
225         "       decl %[cnt]\n"  
226         "       leaq 128(%[p1]),%[p1]\n" 
227         "       leaq 128(%[p2]),%[p2]\n" 
228         "       leaq 128(%[p3]),%[p3]\n" 
229         "       leaq 128(%[p4]),%[p4]\n" 
230         "       jnz  1b"        
231         : [cnt] "+r" (lines),
232           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
233         : 
234         : "memory" );
235
236         XMMS_RESTORE;
237 }
238
239 static void
240 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
241           unsigned long *p3, unsigned long *p4, unsigned long *p5)
242 {
243         unsigned int lines = bytes >> 7;
244         xmm_store_t xmm_save[4];
245         unsigned long cr0;
246
247         XMMS_SAVE;
248
249         __asm__ __volatile__ (
250 #undef BLOCK
251 #define BLOCK(i) \
252                 PF1(i)                                  \
253                 LD(i,0)                                 \
254                         LD(i+1,1)                       \
255                 XO1(i,0)                                \
256                         XO1(i+1,1)                      \
257                                 LD(i+2,2)               \
258                                         LD(i+3,3)       \
259                 PF2(i)                                  \
260                                 XO1(i+2,2)              \
261                                         XO1(i+3,3)      \
262                 PF3(i)                                  \
263                 XO2(i,0)                                \
264                         XO2(i+1,1)                      \
265                                 XO2(i+2,2)              \
266                                         XO2(i+3,3)      \
267                 PF4(i)                                  \
268                 PF0(i+4)                                \
269                 XO3(i,0)                                \
270                         XO3(i+1,1)                      \
271                                 XO3(i+2,2)              \
272                                         XO3(i+3,3)      \
273                 XO4(i,0)                                \
274                         XO4(i+1,1)                      \
275                 ST(i,0)                                 \
276                         ST(i+1,1)                       \
277                                 XO4(i+2,2)              \
278                                         XO4(i+3,3)      \
279                                 ST(i+2,2)               \
280                                         ST(i+3,3)       \
281
282
283                 PF0(0)
284
285         " .p2align 4                    ;\n"
286         " 1:                            ;\n"
287
288                 BLOCK(0)
289                 BLOCK(4)
290
291         "       decl %[cnt]\n"  
292         "       leaq 128(%[p1]),%[p1]\n" 
293         "       leaq 128(%[p2]),%[p2]\n" 
294         "       leaq 128(%[p3]),%[p3]\n" 
295         "       leaq 128(%[p4]),%[p4]\n" 
296         "       leaq 128(%[p5]),%[p5]\n" 
297         "       jnz  1b"        
298         : [cnt] "+r" (lines),
299           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
300           [p5] "+r" (p5)
301         : 
302         : "memory");
303
304         XMMS_RESTORE;
305 }
306
/*
 * STORE_NTI(x, mem): store the 64-bit value x to mem with a
 * non-temporal hint (movnti), bypassing the caches.  gcc 3.3+ has a
 * builtin for movnti; older compilers get inline asm.
 *
 * Fix: the version check used "__GNUC__MINOR__", which is not a
 * predefined macro (the real one is __GNUC_MINOR__); an undefined
 * identifier evaluates to 0 in #if, so gcc 3.3/3.4 silently fell back
 * to the inline-asm form.
 */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem)  asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif
312
313
/*
 * p1[i] ^= p2[i] over 'bytes' bytes using plain 64-bit integer
 * registers: eight words (64 bytes) per iteration, burst loads, then
 * non-temporal movnti stores so the result bypasses the caches.
 * bytes must be a non-zero multiple of 64 (8 longs).
 * NOTE(review): the prefetch distance is pN + 5*64 *longs* = 2560
 * bytes ahead (the SSE loops prefetch 320 bytes ahead) -- confirm
 * that distance is intentional.
 */
static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        long todo = bytes / (sizeof (long)) / 8;

        for (;;) {
                register long x0, x1, x2, x3, x4, x5, x6, x7;

                /* burst-load one 64-byte group from the destination */
                x0 = p1[0];
                x1 = p1[1];
                x2 = p1[2];
                x3 = p1[3];
                x4 = p1[4];
                x5 = p1[5];
                x6 = p1[6];
                x7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* fold in the source stream */
                x0 ^= p2[0];
                x1 ^= p2[1];
                x2 ^= p2[2];
                x3 ^= p2[3];
                x4 ^= p2[4];
                x5 ^= p2[5];
                x6 ^= p2[6];
                x7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* stream the result back without polluting the cache */
                STORE_NTI(x0, p1[0]);
                STORE_NTI(x1, p1[1]);
                STORE_NTI(x2, p1[2]);
                STORE_NTI(x3, p1[3]);
                STORE_NTI(x4, p1[4]);
                STORE_NTI(x5, p1[5]);
                STORE_NTI(x6, p1[6]);
                STORE_NTI(x7, p1[7]);
                p1 += 8;
                p2 += 8;
                if (--todo <= 0)
                        break;
        }
}
351
/*
 * p1[i] ^= p2[i] ^ p3[i] over 'bytes' bytes with 64-bit integer
 * registers and non-temporal movnti stores; 64 bytes per iteration.
 * bytes must be a non-zero multiple of 64.
 */
static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3)
{
        long todo = bytes / (sizeof (long)) / 8;

        for (;;) {
                register long x0, x1, x2, x3, x4, x5, x6, x7;

                /* burst-load one 64-byte group from the destination */
                x0 = p1[0];
                x1 = p1[1];
                x2 = p1[2];
                x3 = p1[3];
                x4 = p1[4];
                x5 = p1[5];
                x6 = p1[6];
                x7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* fold in the first source */
                x0 ^= p2[0];
                x1 ^= p2[1];
                x2 ^= p2[2];
                x3 ^= p2[3];
                x4 ^= p2[4];
                x5 ^= p2[5];
                x6 ^= p2[6];
                x7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* fold in the second source */
                x0 ^= p3[0];
                x1 ^= p3[1];
                x2 ^= p3[2];
                x3 ^= p3[3];
                x4 ^= p3[4];
                x5 ^= p3[5];
                x6 ^= p3[6];
                x7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                /* stream the result back without polluting the cache */
                STORE_NTI(x0, p1[0]);
                STORE_NTI(x1, p1[1]);
                STORE_NTI(x2, p1[2]);
                STORE_NTI(x3, p1[3]);
                STORE_NTI(x4, p1[4]);
                STORE_NTI(x5, p1[5]);
                STORE_NTI(x6, p1[6]);
                STORE_NTI(x7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
                if (--todo <= 0)
                        break;
        }
}
400
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over 'bytes' bytes with 64-bit
 * integer registers and non-temporal movnti stores; 64 bytes per
 * iteration.  bytes must be a non-zero multiple of 64.
 */
static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3, unsigned long *p4)
{
        long todo = bytes / (sizeof (long)) / 8;

        for (;;) {
                register long x0, x1, x2, x3, x4, x5, x6, x7;

                /* burst-load one 64-byte group from the destination */
                x0 = p1[0];
                x1 = p1[1];
                x2 = p1[2];
                x3 = p1[3];
                x4 = p1[4];
                x5 = p1[5];
                x6 = p1[6];
                x7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* fold in each source stream in turn */
                x0 ^= p2[0];
                x1 ^= p2[1];
                x2 ^= p2[2];
                x3 ^= p2[3];
                x4 ^= p2[4];
                x5 ^= p2[5];
                x6 ^= p2[6];
                x7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                x0 ^= p3[0];
                x1 ^= p3[1];
                x2 ^= p3[2];
                x3 ^= p3[3];
                x4 ^= p3[4];
                x5 ^= p3[5];
                x6 ^= p3[6];
                x7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                x0 ^= p4[0];
                x1 ^= p4[1];
                x2 ^= p4[2];
                x3 ^= p4[3];
                x4 ^= p4[4];
                x5 ^= p4[5];
                x6 ^= p4[6];
                x7 ^= p4[7];
                __builtin_prefetch(p4 + 5*64, 0, 0);
                /* stream the result back without polluting the cache */
                STORE_NTI(x0, p1[0]);
                STORE_NTI(x1, p1[1]);
                STORE_NTI(x2, p1[2]);
                STORE_NTI(x3, p1[3]);
                STORE_NTI(x4, p1[4]);
                STORE_NTI(x5, p1[5]);
                STORE_NTI(x6, p1[6]);
                STORE_NTI(x7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
                if (--todo <= 0)
                        break;
        }
}
459
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over 'bytes' bytes with
 * 64-bit integer registers and non-temporal movnti stores; 64 bytes
 * per iteration.  bytes must be a non-zero multiple of 64.
 */
static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        long todo = bytes / (sizeof (long)) / 8;

        for (;;) {
                register long x0, x1, x2, x3, x4, x5, x6, x7;

                /* burst-load one 64-byte group from the destination */
                x0 = p1[0];
                x1 = p1[1];
                x2 = p1[2];
                x3 = p1[3];
                x4 = p1[4];
                x5 = p1[5];
                x6 = p1[6];
                x7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* fold in each source stream in turn */
                x0 ^= p2[0];
                x1 ^= p2[1];
                x2 ^= p2[2];
                x3 ^= p2[3];
                x4 ^= p2[4];
                x5 ^= p2[5];
                x6 ^= p2[6];
                x7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                x0 ^= p3[0];
                x1 ^= p3[1];
                x2 ^= p3[2];
                x3 ^= p3[3];
                x4 ^= p3[4];
                x5 ^= p3[5];
                x6 ^= p3[6];
                x7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                x0 ^= p4[0];
                x1 ^= p4[1];
                x2 ^= p4[2];
                x3 ^= p4[3];
                x4 ^= p4[4];
                x5 ^= p4[5];
                x6 ^= p4[6];
                x7 ^= p4[7];
                __builtin_prefetch(p4 + 5*64, 0, 0);
                x0 ^= p5[0];
                x1 ^= p5[1];
                x2 ^= p5[2];
                x3 ^= p5[3];
                x4 ^= p5[4];
                x5 ^= p5[5];
                x6 ^= p5[6];
                x7 ^= p5[7];
                __builtin_prefetch(p5 + 5*64, 0, 0);
                /* stream the result back without polluting the cache */
                STORE_NTI(x0, p1[0]);
                STORE_NTI(x1, p1[1]);
                STORE_NTI(x2, p1[2]);
                STORE_NTI(x3, p1[3]);
                STORE_NTI(x4, p1[4]);
                STORE_NTI(x5, p1[5]);
                STORE_NTI(x6, p1[6]);
                STORE_NTI(x7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
                p5 += 8;
                if (--todo <= 0)
                        break;
        }
}
528
529
530 static struct xor_block_template xor_block_sse = {
531         name: "128byte sse streaming",
532         do_2: xor_sse_2,
533         do_3: xor_sse_3,
534         do_4: xor_sse_4,
535         do_5: xor_sse_5,
536 };
537
538 static struct xor_block_template xor_block_64regs_stream = {
539         name: "64byte int streaming",
540         do_2: xor_64regs_stream_2,
541         do_3: xor_64regs_stream_3,
542         do_4: xor_64regs_stream_4,
543         do_5: xor_64regs_stream_5,
544 };
545
546 /* AK: the speed test is useless: it only tests cache hot */
547 #undef XOR_TRY_TEMPLATES
548 #define XOR_TRY_TEMPLATES                               \
549         do {                                            \
550                 xor_speed(&xor_block_sse);      \
551                 xor_speed(&xor_block_64regs_stream);    \
552         } while (0)
553
554 #define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)