/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

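/*
 * The MMX registers alias the x87 FPU register stack, so in-kernel MMX
 * use has to preserve the current task's FPU state by hand: clear CR0.TS
 * (clts) if the task does not own live FPU state, dump the x87 state into
 * a 108-byte fsave area, and undo both steps (frstor, stts) afterwards.
 * PF_USEDFPU tells us whether the task has touched the FPU.
 */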
#define FPU_SAVE                                                        \
  do {                                                                  \
        if (!(current->flags & PF_USEDFPU))                             \
                __asm__ __volatile__ (" clts;\n");                      \
        __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));    \
  } while (0)

#define FPU_RESTORE                                                     \
  do {                                                                  \
        __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));         \
        if (!(current->flags & PF_USEDFPU))                             \
                stts();                                                 \
  } while (0)

#define LD(x,y)         "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x,y)         "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x,y)        "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x,y)        "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x,y)        "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x,y)        "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"


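/*
 * For reference, each xor_*_N() routine in this file computes the same
 * thing as the plain C loop below: the destination block p1 is XORed
 * with the N-1 source blocks, 'bytes' bytes at a time.  This is a minimal
 * sketch only (the 3-source case, never compiled or used anywhere); the
 * assembly versions differ purely in scheduling and memory behaviour.
 */
#if 0
static void
xor_reference_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
                unsigned long *p3)
{
        unsigned long words = bytes / sizeof(unsigned long);
        unsigned long i;

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i] ^ p3[i];
}
#endif
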
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                 \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                                \
        ST(i,0)                                 \
                XO1(i+1,1)                      \
                ST(i+1,1)                       \
                        XO1(i+2,2)              \
                        ST(i+2,2)               \
                                XO1(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        FPU_RESTORE;
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                 \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                                \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                                \
        ST(i,0)                                 \
                XO2(i+1,1)                      \
                ST(i+1,1)                       \
                        XO2(i+2,2)              \
                        ST(i+2,2)               \
                                XO2(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        FPU_RESTORE;
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                 \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                                \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                                \
                XO2(i+1,1)                      \
                        XO2(i+2,2)              \
                                XO2(i+3,3)      \
        XO3(i,0)                                \
        ST(i,0)                                 \
                XO3(i+1,1)                      \
                ST(i+1,1)                       \
                        XO3(i+2,2)              \
                        ST(i+2,2)               \
                                XO3(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        FPU_RESTORE;
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;
        char fpu_save[108];

        FPU_SAVE;

        /* Need to save/restore p4/p5 by hand, otherwise gcc's limit of
           10 asm arguments would be exceeded (a "+" operand counts as
           both an input and an output). */
        __asm__ __volatile__ (
                "  pushl %4\n"
                "  pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                 \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                                \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                                \
                XO2(i+1,1)                      \
                        XO2(i+2,2)              \
                                XO2(i+3,3)      \
        XO3(i,0)                                \
                XO3(i+1,1)                      \
                        XO3(i+2,2)              \
                                XO3(i+3,3)      \
        XO4(i,0)                                \
        ST(i,0)                                 \
                XO4(i+1,1)                      \
                ST(i+1,1)                       \
                        XO4(i+2,2)              \
                        ST(i+2,2)               \
                                XO4(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        FPU_RESTORE;
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

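/*
 * The p5_mmx_* variants below do the same job with a hand-interleaved,
 * fully unrolled movq/pxor schedule over 64 bytes per loop iteration
 * (instead of the 128-byte macro blocks above), a layout aimed at the
 * in-order Pentium/Pentium-MMX pipeline.
 */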
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        FPU_RESTORE;
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory" );

        FPU_RESTORE;
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;
        char fpu_save[108];

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        FPU_RESTORE;
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;
        char fpu_save[108];

        FPU_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
        "       pushl %4\n"
        "       pushl %5\n"
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+g" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        FPU_RESTORE;
}

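/*
 * A template bundles a human-readable name with the 2- to 5-source XOR
 * routines, so the RAID code can benchmark each candidate via xor_speed()
 * (see XOR_TRY_TEMPLATES at the end of this file) and pick the fastest.
 */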
static struct xor_block_template xor_block_pII_mmx = {
        name: "pII_mmx",
        do_2: xor_pII_mmx_2,
        do_3: xor_pII_mmx_3,
        do_4: xor_pII_mmx_4,
        do_5: xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        name: "p5_mmx",
        do_2: xor_p5_mmx_2,
        do_3: xor_p5_mmx_3,
        do_4: xor_p5_mmx_4,
        do_5: xor_p5_mmx_5,
};

#undef FPU_SAVE
#undef FPU_RESTORE

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

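/*
 * "KNI" (Katmai New Instructions) was the pre-release name for SSE.
 * The kernel does not own the SSE register state, so XMMS_SAVE stashes
 * CR0, clears CR0.TS (clts) and spills %xmm0-%xmm3 into a 16-byte aligned
 * scratch buffer; XMMS_RESTORE fences outstanding stores (sfence), reloads
 * the registers and writes CR0 back.
 */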
#define XMMS_SAVE                               \
        __asm__ __volatile__ (                  \
                "movl %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory")

#define XMMS_RESTORE                            \
        __asm__ __volatile__ (                  \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movl   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory")

#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
#define LD(x,y)         "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
#define ST(x,y)         "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
#define XO1(x,y)        "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
#define XO2(x,y)        "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
#define XO3(x,y)        "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
#define XO4(x,y)        "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
#define XO5(x,y)        "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"


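/*
 * Each BLOCK() handles 64 bytes through %xmm0-%xmm3 with 16-byte movaps
 * accesses (so the buffers must be 16-byte aligned), and the unrolled loop
 * covers 256 bytes per pass.  prefetchnta fetches each operand 256 bytes
 * ahead with a non-temporal hint to keep cache pollution down.  For
 * example, LD(0,0) expands to "movaps 16*(0)(%1), %%xmm0" and PF1(2) to
 * "prefetchnta 256+16*(2)(%2)".
 */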
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                PF1(i)                                  \
                                PF1(i+2)                \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r"(p2), "+r"(p3)
        :
        : "memory" );

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                XO3(i,0)                                \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory" );

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
                " pushl %4\n"
                " pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                PF4(i)                                  \
                                PF4(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO3(i,0)                                \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                XO4(i,0)                                \
                        XO4(i+1,1)                      \
                                XO4(i+2,2)              \
                                        XO4(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       addl $256, %5           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_pIII_sse = {
        name: "pIII_sse",
        do_2: xor_sse_2,
        do_3: xor_sse_3,
        do_4: xor_sse_4,
        do_5: xor_sse_5,
};

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
        do {                                            \
                xor_speed(&xor_block_8regs);            \
                xor_speed(&xor_block_32regs);           \
                if (cpu_has_xmm)                        \
                        xor_speed(&xor_block_pIII_sse); \
                if (md_cpu_has_mmx()) {                 \
                        xor_speed(&xor_block_pII_mmx);  \
                        xor_speed(&xor_block_p5_mmx);   \
                }                                       \
        } while (0)

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load into the L1 cache only, depending
   on how the CPU handles a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
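
/*
 * The RAID code that includes this header supplies its own xor_speed()
 * helper, expands XOR_TRY_TEMPLATES to benchmark every candidate above,
 * and then passes the fastest one through XOR_SELECT_TEMPLATE(), which
 * lets this architecture override the measured winner (here: always
 * prefer the SSE block when the CPU has XMM support).
 */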