/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache-avoiding checksumming functions utilizing KNI (SSE) instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer yet, but there are likely
 * no additional advantages to be gotten from x86-64 here anyway.
 */

typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;

/* We don't let gcc save the XMM registers itself, because there is no
   easy way to make it execute a clts before the register saving. */
#define XMMS_SAVE                               \
        asm volatile (                          \
                "movq %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory")

#define XMMS_RESTORE                            \
        asm volatile (                          \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movq   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory")
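
/*
 * Note: clts clears the TS (task-switched) flag in CR0 so that the SSE
 * instructions below don't fault; the caller's CR0 value is saved first
 * and restored afterwards.  The sfence in XMMS_RESTORE orders all prior
 * stores before the register state is put back.
 */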
63
64 #define OFFS(x)         "16*("#x")"
65 #define PF_OFFS(x)      "256+16*("#x")"
66 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
67 #define LD(x,y)         "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
68 #define ST(x,y)         "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
69 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
70 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
71 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
72 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
73 #define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
74 #define XO1(x,y)        "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
75 #define XO2(x,y)        "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
76 #define XO3(x,y)        "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
77 #define XO4(x,y)        "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
78 #define XO5(x,y)        "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
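
/*
 * For example, LD(2,1) expands to "movaps 16*(2)(%[p1]), %%xmm1" and
 * XO1(2,1) to "xorps 16*(2)(%[p2]), %%xmm1": the first argument selects
 * a 16-byte slot within the current 256-byte stride, the second an XMM
 * register number.  The PFn macros prefetch 256 bytes (one full loop
 * iteration) ahead of the current position.
 */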


static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

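        /*
         * Each BLOCK(i) handles 64 bytes: four 16-byte movaps loads,
         * four xorps against p2, and four stores back to p1, with
         * prefetches interleaved.  Four BLOCKs per loop iteration give
         * the 256-byte stride that matches lines = bytes >> 8.
         */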
        asm volatile (
#undef BLOCK
#define BLOCK(i) \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                PF1(i)                                  \
                                PF1(i+2)                \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}
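
/*
 * For reference, what xor_sse_2 computes is equivalent to the portable
 * C sketch below (xor_ref_2 is a hypothetical name for illustration
 * only, not part of this interface); the assembly above merely does it
 * 256 bytes per iteration, with non-temporal prefetch and without
 * letting gcc touch the XMM registers.
 */
#if 0
static void
xor_ref_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long i;

        /* XOR the second buffer into the first, word by word. */
        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}
#endif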

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                XO3(i,0)                                \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                 \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                                \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                XO2(i,0)                                \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                PF4(i)                                  \
                                PF4(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO3(i,0)                                \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                XO4(i,0)                                \
                        XO4(i+1,1)                      \
                                XO4(i+2,2)              \
                                        XO4(i+3,3)      \
                ST(i,0)                                 \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       addq %[inc], %[p5]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
        name: "generic_sse",
        do_2: xor_sse_2,
        do_3: xor_sse_3,
        do_4: xor_sse_4,
        do_5: xor_sse_5,
};
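
/*
 * A minimal usage sketch (hypothetical caller, for illustration only):
 * the do_N hooks XOR N-1 source blocks into the first buffer, so RAID-5
 * parity over two data blocks d0 and d1 could be built as
 *
 *      memcpy(parity, d0, bytes);
 *      xor_block_sse.do_2(bytes, parity, d1);
 *
 * bytes must be a multiple of 256 and all buffers 16-byte aligned,
 * since the loops above consume 256 bytes per iteration using movaps.
 */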

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
        do {                                            \
                xor_speed(&xor_block_sse);              \
        } while (0)

/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to keep loads within the L1 cache only,
   depending on how the CPU deals with a load to a line that is being
   prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)