import of upstream 2.4.34.4 from kernel.org
[linux-2.4.git] / arch / sparc64 / lib / VIScsumcopy.S
1 /* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $
2  * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
3  *            copying utilizing the UltraSparc Visual Instruction Set.
4  *
5  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6  *
7  * Based on older sparc32/sparc64 checksum.S, which is:
8  *
9  *      Copyright(C) 1995 Linus Torvalds
10  *      Copyright(C) 1995 Miguel de Icaza
11  *      Copyright(C) 1996,1997 David S. Miller
12  *    derived from:
13  *        Linux/Alpha checksum c-code
14  *        Linux/ix86 inline checksum assembly
15  *        RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
16  *        David Mosberger-Tang for optimized reference c-code
17  *        BSD4.4 portable checksum routine
18  */
19
/* STACKOFF: offset from %sp of the 8-byte scratch slot that END_THE_TRICK2
 * uses to move a VIS register into an integer register (std, then ldx).
 * NOTE(review): 0x7ff+128 looks like the V9 stack bias plus the 128-byte
 * register-window save area, and 64 like the 32-bit save area -- confirm
 * against the SPARC ABI.  The value is not parenthesized; the visible code
 * only ever uses it as "%sp + STACKOFF".
 */
20 #ifdef __sparc_v9__
21 #define STACKOFF        0x7ff+128
22 #else
23 #define STACKOFF        64
24 #endif
25
/* Address-space (ASI) and FPRS constants.
 * The kernel branch uses ASI_P and ASI_BLK_P without defining them, so they
 * presumably come from the included headers; the stand-alone branch defines
 * them locally.
 * ASI_BLK_XOR is the second operand of "wr %g2, ASI_BLK_XOR, %asi" at vis0s:
 * 0 in the kernel build, where ASI_BLK_OR has already been ORed into %g2,
 * and ASI_BLK_P ^ ASI_P in the stand-alone build, where %g2 is the plain
 * value read from %asi.
 */
26 #ifdef __KERNEL__
27 #include <asm/head.h>
28 #include <asm/asi.h>
29 #include <asm/page.h>
30 #include <asm/visasm.h>
31 #define ASI_BLK_XOR     0
32 #define ASI_BLK_XOR1    (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
33 #define ASI_BLK_OR      (ASI_BLK_P & ~ASI_P)
34 #else
35 #define ASI_P           0x80
36 #define ASI_BLK_P       0xf0
/* FPRS_FEF is the correctly spelled name, matching FPRS_DU and FPRS_DL.
 * The original misspelling FRPS_FEF is kept below so that any existing
 * user of that name still assembles.
 */
#define FPRS_FEF        0x04
37 #define FRPS_FEF        0x04
38 #define FPRS_DU         0x02
39 #define FPRS_DL         0x01
40 #define ASI_BLK_XOR     (ASI_BLK_P ^ ASI_P)
41 #endif
42
/* Symbolic register names.  They are written after a '%' in the code
 * (%src, %x1, ...) so that cpp substitutes the real register name.
 *   src, dst, len, sum  -- source pointer, destination pointer, remaining
 *                          byte count and running checksum
 *   x1..x8              -- integer scratch registers that receive the
 *                          fcmpgt32 masks in the *_THE_TRICK macros
 * Several of these overlap registers that csum_partial_copy_vis also uses
 * under their raw names (for example x3 is %o4 and x5 is %g5).
 */
43 #define src             o0
44 #define dst             o1
45 #define len             o2
46 #define sum             o3
47 #define x1              g1
48 #define x2              g2
49 #define x3              o4
50 #define x4              g4
51 #define x5              g5
52 #define x6              g7
53 #define x7              g3
54 #define x8              o5
55
56 /* Dobrou noc, SunSoft engineers. Spete sladce. (Czech: "Good night ... Sleep sweetly.")
57  * This has a couple of tricks in it, and those
58  * tricks are UltraLinux trade secrets :))
59  * Once AGAIN, the SunSoft engineers are caught
60  * asleep at the keyboard :)).
61  * The main loop does about 20 superscalar cycles
62  * per 64bytes checksummed/copied.
63  */
64
/* LDBLK(O0): load from [%src] through the ASI currently held in %asi into
 * the FP register named by O0 (given without the '%').  The visible callers
 * pass it as the LOAD argument of DO_THE_TRICK, after vis0s has rewritten
 * %asi; presumably that selects a block-transfer ASI -- confirm.
 */
65 #define LDBLK(O0)                                                                       \
66         ldda            [%src] %asi, %O0        /*  Load        Group           */
67
/* STBLK: store starting at %f48 to [%dst] using the fixed ASI_BLK_P ASI
 * (unlike LDBLK, this does not go through the %asi register).  In the
 * visible DO_THE_TRICK invocations %f48..%f62 are the A0..A14 output
 * registers.
 */
68 #define STBLK                                                                           \
69         stda            %f48, [%dst] ASI_BLK_P  /*  Store                       */
70
/* ST(fx,off): store the doubleword FP register fx (given without the '%')
 * to [%dst + off].
 */
71 #define ST(fx,off)                                                                      \
72         std             %fx, [%dst + off]       /*  Store                       */
73
/* SYNC: emit a "membar #Sync" barrier. */
74 #define SYNC                                                                            \
75         membar          #Sync
76
77
/* DO_THE_TRICK: one 64-byte step of the software-pipelined checksum+copy loop.
 *
 *   f0..f14   eight registers holding the running fpadd32 sums so far
 *   F0..F14   eight registers holding the source data consumed by this step;
 *             each is replaced by F + f (fpadd32), so on exit they hold the
 *             running sums
 *   A0..A14   receive the faligndata-realigned output doublewords
 *   B14       receives a copy of F14.  The macro starts with
 *             "faligndata A14, F0, A14", which merges the A14 left by the
 *             previous step with this step's F0; the visible invocations
 *             pass the same register (f62) for A14 and B14.
 *   LOAD      load instruction issued first, before %src is advanced by 64
 *   STORE1..8 optional store instructions (may be empty).  STORE1 and
 *             STORE2 run before %dst is advanced by 64, STORE3..STORE8
 *             after it.
 *   BRANCH... branch placed right after "subcc %len, 64, %len"; the last
 *             fcmpgt32 sits in its delay slot.  It is a GNU named variadic
 *             parameter so that the branch text may contain commas.
 *   DUMMY1..3 unused; they let an invocation end one line with a comma and
 *             start the next line with a comma.
 *
 * Carry accounting: each fpadd32 is followed by an fcmpgt32 of the old value
 * against the new sum, which leaves a per-lane mask in one of %x1..%x8.
 * "inc; srl 1" turns that mask into a count ((m+1)>>1 maps 0,1,2,3 to
 * 0,1,1,2), which is then added to %sum.
 *
 * The accounting is pipelined across invocations: the top of the macro
 * finishes %x4..%x8 left over from the preceding code, and on exit %x4 is
 * already converted but not yet added, while %x5..%x8 still hold raw masks.
 *
 * Do not reorder instructions: the per-line comments record the intended
 * functional unit and issue group for each one.
 */
78 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)  \
79         LOAD                                    /*  Load        (Group)         */;     \
80         faligndata      %A14, %F0, %A14         /*  FPA         Group           */;     \
81         inc             %x5                     /*  IEU0                        */;     \
82         STORE1                                  /*  Store (optional)            */;     \
83         faligndata      %F0, %F2, %A0           /*  FPA         Group           */;     \
84         srl             %x5, 1, %x5             /*  IEU0                        */;     \
85         add             %sum, %x4, %sum         /*  IEU1                        */;     \
86         fpadd32         %F0, %f0, %F0           /*  FPA         Group           */;     \
87         inc             %x6                     /*  IEU0                        */;     \
88         STORE2                                  /*  Store (optional)            */;     \
89         faligndata      %F2, %F4, %A2           /*  FPA         Group           */;     \
90         srl             %x6, 1, %x6             /*  IEU0                        */;     \
91         add             %sum, %x5, %sum         /*  IEU1                        */;     \
92         fpadd32         %F2, %f2, %F2           /*  FPA         Group           */;     \
93         add             %src, 64, %src          /*  IEU0                        */;     \
94         fcmpgt32        %f0, %F0, %x1           /*  FPM                         */;     \
95         add             %dst, 64, %dst          /*  IEU1        Group           */;     \
96         inc             %x7                     /*  IEU0                        */;     \
97         STORE3                                  /*  Store (optional)            */;     \
98         faligndata      %F4, %F6, %A4           /*  FPA                         */;     \
99         fpadd32         %F4, %f4, %F4           /*  FPA         Group           */;     \
100         add             %sum, %x6, %sum         /*  IEU1                        */;     \
101         fcmpgt32        %f2, %F2, %x2           /*  FPM                         */;     \
102         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
103         inc             %x8                     /*  IEU1                        */;     \
104         STORE4                                  /*  Store (optional)            */;     \
105         faligndata      %F6, %F8, %A6           /*  FPA                         */;     \
106         fpadd32         %F6, %f6, %F6           /*  FPA         Group           */;     \
107         srl             %x8, 1, %x8             /*  IEU0                        */;     \
108         fcmpgt32        %f4, %F4, %x3           /*  FPM                         */;     \
109         add             %sum, %x7, %sum         /*  IEU0        Group           */;     \
110         inc             %x1                     /*  IEU1                        */;     \
111         STORE5                                  /*  Store (optional)            */;     \
112         faligndata      %F8, %F10, %A8          /*  FPA                         */;     \
113         fpadd32         %F8, %f8, %F8           /*  FPA         Group           */;     \
114         srl             %x1, 1, %x1             /*  IEU0                        */;     \
115         fcmpgt32        %f6, %F6, %x4           /*  FPM                         */;     \
116         add             %sum, %x8, %sum         /*  IEU0        Group           */;     \
117         inc             %x2                     /*  IEU1                        */;     \
118         STORE6                                  /*  Store (optional)            */;     \
119         faligndata      %F10, %F12, %A10        /*  FPA                         */;     \
120         fpadd32         %F10, %f10, %F10        /*  FPA         Group           */;     \
121         srl             %x2, 1, %x2             /*  IEU0                        */;     \
122         fcmpgt32        %f8, %F8, %x5           /*  FPM                         */;     \
123         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
124         inc             %x3                     /*  IEU1                        */;     \
125         STORE7                                  /*  Store (optional)            */;     \
126         faligndata      %F12, %F14, %A12        /*  FPA                         */;     \
127         fpadd32         %F12, %f12, %F12        /*  FPA         Group           */;     \
128         srl             %x3, 1, %x3             /*  IEU0                        */;     \
129         fcmpgt32        %f10, %F10, %x6         /*  FPM                         */;     \
130         add             %sum, %x2, %sum         /*  IEU0        Group           */;     \
131         inc             %x4                     /*  IEU1                        */;     \
132         STORE8                                  /*  Store (optional)            */;     \
133         fmovd           %F14, %B14              /*  FPA                         */;     \
134         fpadd32         %F14, %f14, %F14        /*  FPA         Group           */;     \
135         srl             %x4, 1, %x4             /*  IEU0                        */;     \
136         fcmpgt32        %f12, %F12, %x7         /*  FPM                         */;     \
137         add             %sum, %x3, %sum         /*  IEU0        Group           */;     \
138         subcc           %len, 64, %len          /*  IEU1                        */;     \
139         BRANCH                                  /*  CTI                         */;     \
140         fcmpgt32        %f14, %F14, %x8         /*  FPM         Group           */;
141
/* END_THE_TRICK: drain the carry pipeline left by the last DO_THE_TRICK step
 * and start folding the eight running sums together.
 *
 *   f0..f14   the eight running-sum registers
 *   FA, FB    FA is copied to FB (fmovd) just before the final branch
 *   S0..S3    receive the pairwise sums S0=f2+f0, S1=f6+f4, S2=f10+f8,
 *             S3=f14+f12
 *   T0, T1    receive T0=S0+S1 and T1=S2+S3
 *   U0        receives U0=T0+T1
 *   fz        cleared with fzero, then used as the zero operand of fcmpgt32
 *
 * The macro first adds the pending counts in %x4..%x8 to %sum.  It then
 * accounts for the carries of the S and T additions with the same
 * fcmpgt32 / inc / srl / add sequence that DO_THE_TRICK uses.
 *
 * It also produces "fcmpgt32 %fz, %fN" masks for f2, f6, f10 and f14.
 * These are not consumed here.
 *
 * Exit state: %x6 (carry of S2+S3) is converted but not yet added, %x7 has
 * been incremented in the branch delay slot but not shifted, and %x8, %x1,
 * %x2 hold raw masks.  END_THE_TRICK2 begins by finishing exactly these
 * registers, so it is presumably the code at "ett" (the label is not visible
 * in this chunk -- confirm).
 */
142 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
143         inc             %x5                     /*  IEU0        Group           */;     \
144         fpadd32         %f2, %f0, %S0           /*  FPA                         */;     \
145         add             %sum, %x4, %sum         /*  IEU1                        */;     \
146         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
147         fpadd32         %f6, %f4, %S1           /*  FPA                         */;     \
148         inc             %x6                     /*  IEU1                        */;     \
149         fpadd32         %f10, %f8, %S2          /*  FPA         Group           */;     \
150         add             %sum, %x5, %sum         /*  IEU0                        */;     \
151         fcmpgt32        %f0, %S0, %x1           /*  FPM                         */;     \
152         fpadd32         %f14, %f12, %S3         /*  FPA         Group           */;     \
153         srl             %x6, 1, %x6             /*  IEU0                        */;     \
154         fcmpgt32        %f4, %S1, %x2           /*  FPM                         */;     \
155         add             %sum, %x6, %sum         /*  IEU0        Group           */;     \
156         fzero           %fz                     /*  FPA                         */;     \
157         fcmpgt32        %f8, %S2, %x3           /*  FPM                         */;     \
158         inc             %x7                     /*  IEU0        Group           */;     \
159         inc             %x8                     /*  IEU1                        */;     \
160         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
161         inc             %x1                     /*  IEU1                        */;     \
162         fpadd32         %S0, %S1, %T0           /*  FPA                         */;     \
163         fpadd32         %S2, %S3, %T1           /*  FPA         Group           */;     \
164         add             %sum, %x7, %sum         /*  IEU0                        */;     \
165         fcmpgt32        %f12, %S3, %x4          /*  FPM                         */;     \
166         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
167         inc             %x2                     /*  IEU1                        */;     \
168         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
169         add             %sum, %x8, %sum         /*  IEU1                        */;     \
170         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
171         fcmpgt32        %S0, %T0, %x5           /*  FPM                         */;     \
172         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
173         fcmpgt32        %S2, %T1, %x6           /*  FPM                         */;     \
174         inc             %x3                     /*  IEU0        Group           */;     \
175         add             %sum, %x2, %sum         /*  IEU1                        */;     \
176         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
177         inc             %x4                     /*  IEU1                        */;     \
178         fpadd32         %T0, %T1, %U0           /*  FPA         Group           */;     \
179         add             %sum, %x3, %sum         /*  IEU0                        */;     \
180         fcmpgt32        %fz, %f2, %x7           /*  FPM                         */;     \
181         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
182         fcmpgt32        %fz, %f6, %x8           /*  FPM                         */;     \
183         inc             %x5                     /*  IEU0        Group           */;     \
184         add             %sum, %x4, %sum         /*  IEU1                        */;     \
185         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
186         fcmpgt32        %fz, %f10, %x1          /*  FPM                         */;     \
187         inc             %x6                     /*  IEU0        Group           */;     \
188         add             %sum, %x5, %sum         /*  IEU1                        */;     \
189         fmovd           %FA, %FB                /*  FPA         Group           */;     \
190         fcmpgt32        %fz, %f14, %x2          /*  FPM                         */;     \
191         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
192         ba,pt           %xcc, ett               /*  CTI                         */;     \
193          inc            %x7                     /*  IEU1                        */;
194
/* END_THE_TRICK1: END_THE_TRICK with the scratch registers fixed to
 * S0..S3 = f48,f50,f52,f54, T0,T1 = f56,f58, U0 = f60 and fz = f62.
 * The caller supplies only the eight sum registers and the FA/FB pair.
 */
195 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)                                \
196         END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
197
/* END_THE_TRICK2: final reduction of the VIS partial sums into %sum.
 *
 *   U0, U1   partial sums; V0 = U0 + U1 (fpadd32)
 *   V0       result register.  It is spilled to [%sp + STACKOFF] with std
 *            and reloaded into %x8 with ldx so it can be added to %sum.
 *   S1, S3, T1, U1, V0
 *            each is compared against fz ("fcmpgt32 %fz, reg"); the
 *            resulting counts are subtracted from %sum
 *   T0       compared against U0, and U0 against V0; those two counts are
 *            added to %sum
 *   S0, S2   accepted but not referenced in the body
 *   fz       used as the zero operand; it is not written here, so it must
 *            already hold zero
 *
 * Entry state expected: %x6 converted but not yet added, %x7 incremented
 * but not shifted, and raw masks in %x8, %x1 and %x2.  The first half of
 * the macro adds %x6 and subtracts the counts derived from %x7, %x8, %x1
 * and %x2.
 *
 * The last three instructions add the 64-bit %x8 to %sum, and add 1 more
 * only if that addition carries (bcs on %xcc with an annulled delay slot).
 * The numeric local label 33 marks the end of the macro.
 */
198 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)                                   \
199         fpadd32         %U0, %U1, %V0           /*  FPA         Group           */;     \
200         srl             %x7, 1, %x7             /*  IEU0                        */;     \
201         add             %sum, %x6, %sum         /*  IEU1                        */;     \
202         std             %V0, [%sp + STACKOFF]   /*  Store       Group           */;     \
203         inc             %x8                     /*  IEU0                        */;     \
204         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
205         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
206         fcmpgt32        %fz, %S1, %x3           /*  FPM                         */;     \
207         inc             %x1                     /*  IEU0        Group           */;     \
208         fcmpgt32        %fz, %S3, %x4           /*  FPM                         */;     \
209         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
210         sub             %sum, %x8, %sum         /*  IEU1                        */;     \
211         ldx             [%sp + STACKOFF], %x8   /*  Load        Group           */;     \
212         inc             %x2                     /*  IEU0                        */;     \
213         sub             %sum, %x1, %sum         /*  IEU1                        */;     \
214         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
215         fcmpgt32        %fz, %T1, %x5           /*  FPM                         */;     \
216         inc             %x3                     /*  IEU0        Group           */;     \
217         fcmpgt32        %T0, %U0, %x6           /*  FPM                         */;     \
218         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
219         sub             %sum, %x2, %sum         /*  IEU1                        */;     \
220         inc             %x4                     /*  IEU0        Group           */;     \
221         sub             %sum, %x3, %sum         /*  IEU1                        */;     \
222         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
223         fcmpgt32        %fz, %U1, %x7           /*  FPM                         */;     \
224         inc             %x5                     /*  IEU0        Group           */;     \
225         fcmpgt32        %U0, %V0, %x1           /*  FPM                         */;     \
226         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
227         sub             %sum, %x4, %sum         /*  IEU1                        */;     \
228         sub             %sum, %x5, %sum         /*  IEU0        Group           */;     \
229         fcmpgt32        %fz, %V0, %x2           /*  FPM                         */;     \
230         inc             %x6                     /*  IEU0        Group           */;     \
231         inc             %x7                     /*  IEU1                        */;     \
232         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
233         inc             %x1                     /*  IEU1                        */;     \
234         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
235         add             %sum, %x6, %sum         /*  IEU1                        */;     \
236         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
237         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
238         inc             %x2                     /*  IEU0        Group           */;     \
239         add             %sum, %x1, %sum         /*  IEU1                        */;     \
240         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
241         sub             %sum, %x2, %sum         /*  IEU0        Group           */;     \
242         addcc           %sum, %x8, %sum         /*  IEU1        Group           */;     \
243         bcs,a,pn        %xcc, 33f               /*  CTI                         */;     \
244          add            %sum, 1, %sum           /*  IEU0        (Group)         */;     \
245 33:                                             /*  That's it                   */;
246
247         .text
248         .globl          csum_partial_copy_vis
249         .align          32
250 /* %asi should be either ASI_P (for csum_partial_copy) or ASI_AIUS (for
251  * csum_partial_copy_from_user).
252  * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
253  */
254 csum_partial_copy_vis:
255         andcc           %dst, 7, %g0            /*  IEU1        Group           */
256         be,pt           %icc, 4f                /*  CTI                         */
257          and            %dst, 0x38, %o4         /*  IEU0                        */
258         mov             1, %g5                  /*  IEU0        Group           */
259         andcc           %dst, 2, %g0            /*  IEU1                        */
260         be,pt           %icc, 1f                /*  CTI                         */
261          and            %dst, 4, %g7            /*  IEU0        Group           */
262         lduha           [%src] %asi, %g2        /*  Load                        */
263         sub             %len, 2, %len           /*  IEU0        Group           */
264         add             %dst, 2, %dst           /*  IEU1                        */
265         andcc           %dst, 4, %g7            /*  IEU1        Group           */
266         sll             %g5, 16, %g5            /*  IEU0                        */
267         sth             %g2, [%dst - 2]         /*  Store       Group           */
268         sll             %g2, 16, %g2            /*  IEU0                        */
269         add             %src, 2, %src           /*  IEU1                        */
270         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
271         bcs,a,pn        %icc, 1f                /*  CTI                         */
272          add            %sum, %g5, %sum         /*  IEU0                        */
273 1:      lduwa           [%src] %asi, %g2        /*  Load                        */
274         brz,a,pn        %g7, 4f                 /*  CTI+IEU1    Group           */
275          and            %dst, 0x38, %o4         /*  IEU0                        */
276         add             %dst, 4, %dst           /*  IEU0        Group           */
277         sub             %len, 4, %len           /*  IEU1                        */
278         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
279         bcs,a,pn        %icc, 1f                /*  CTI                         */
280          add            %sum, 1, %sum           /*  IEU0                        */
281 1:      and             %dst, 0x38, %o4         /*  IEU0        Group           */
282         stw             %g2, [%dst - 4]         /*  Store                       */
283         add             %src, 4, %src           /*  IEU1                        */
284 4:
285 #ifdef __KERNEL__
286         VISEntry
287 #endif
288         mov             %src, %g7               /*  IEU1        Group           */
289         fzero           %f48                    /*  FPA                         */
290         alignaddr       %src, %g0, %src         /*  Single      Group           */
291         subcc           %g7, %src, %g7          /*  IEU1        Group           */
292         be,pt           %xcc, 1f                /*  CTI                         */
293          mov            0x40, %g1               /*  IEU0                        */
294         lduwa           [%src] %asi, %g2        /*  Load        Group           */
295         subcc           %sum, %g2, %sum         /*  IEU1        Group+load stall*/
296         bcs,a,pn        %icc, 1f                /*  CTI                         */
297          sub            %sum, 1, %sum           /*  IEU0                        */
298 1:      srl             %sum, 0, %sum           /*  IEU0        Group           */
299         clr             %g5                     /*  IEU1                        */
300         brz,pn          %o4, 3f                 /*  CTI+IEU1    Group           */
301          sub            %g1, %o4, %g1           /*  IEU0                        */
302         ldda            [%src] %asi, %f0        /*  Load                        */
303         clr             %o4                     /*  IEU0        Group           */
304         andcc           %dst, 8, %g0            /*  IEU1                        */
305         be,pn           %icc, 1f                /*  CTI                         */
306          ldda           [%src + 8] %asi, %f2    /*  Load        Group           */
307         add             %src, 8, %src           /*  IEU0                        */
308         sub             %len, 8, %len           /*  IEU1                        */
309         fpadd32         %f0, %f48, %f50         /*  FPA                         */
310         addcc           %dst, 8, %dst           /*  IEU1        Group           */
311         faligndata      %f0, %f2, %f16          /*  FPA                         */
312         fcmpgt32        %f48, %f50, %o4         /*  FPM         Group           */
313         fmovd           %f2, %f0                /*  FPA         Group           */
314         ldda            [%src + 8] %asi, %f2    /*  Load                        */
315         std             %f16, [%dst - 8]        /*  Store                       */
316         fmovd           %f50, %f48              /*  FPA                         */
317 1:      andcc           %g1, 0x10, %g0          /*  IEU1        Group           */
318         be,pn           %icc, 1f                /*  CTI                         */
319          and            %g1, 0x20, %g1          /*  IEU0                        */
320         fpadd32         %f0, %f48, %f50         /*  FPA                         */
321         ldda            [%src + 16] %asi, %f4   /*  Load        Group           */
322         add             %src, 16, %src          /*  IEU0                        */
323         add             %dst, 16, %dst          /*  IEU1                        */
324         faligndata      %f0, %f2, %f16          /*  FPA                         */
325         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
326         sub             %len, 16, %len          /*  IEU0                        */
327         inc             %o4                     /*  IEU1                        */
328         std             %f16, [%dst - 16]       /*  Store       Group           */
329         fpadd32         %f2, %f50, %f48         /*  FPA                         */
330         srl             %o4, 1, %o5             /*  IEU0                        */
331         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
332         std             %f18, [%dst - 8]        /*  Store                       */
333         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
334         add             %o5, %sum, %sum         /*  IEU0                        */
335         ldda            [%src + 8] %asi, %f2    /*  Load                        */
336         fmovd           %f4, %f0                /*  FPA                         */
337 1:      brz,a,pn        %g1, 4f                 /*  CTI+IEU1    Group           */
338          rd             %asi, %g2               /*  LSU         Group + 4 bubbles*/
339         inc             %g5                     /*  IEU0                        */
340         fpadd32         %f0, %f48, %f50         /*  FPA                         */
341         ldda            [%src + 16] %asi, %f4   /*  Load        Group           */
342         srl             %g5, 1, %g5             /*  IEU0                        */
343         add             %dst, 32, %dst          /*  IEU1                        */
344         faligndata      %f0, %f2, %f16          /*  FPA                         */
345         fcmpgt32        %f48, %f50, %o5         /*  FPM         Group           */
346         inc             %o4                     /*  IEU0                        */
347         ldda            [%src + 24] %asi, %f6   /*  Load                        */
348         srl             %o4, 1, %o4             /*  IEU0        Group           */
349         add             %g5, %sum, %sum         /*  IEU1                        */
350         ldda            [%src + 32] %asi, %f8   /*  Load                        */
351         fpadd32         %f2, %f50, %f48         /*  FPA                         */
352         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
353         sub             %len, 32, %len          /*  IEU0                        */
354         std             %f16, [%dst - 32]       /*  Store                       */
355         fcmpgt32        %f50, %f48, %g3         /*  FPM         Group           */
356         inc             %o5                     /*  IEU0                        */
357         add             %o4, %sum, %sum         /*  IEU1                        */
358         fpadd32         %f4, %f48, %f50         /*  FPA                         */
359         faligndata      %f4, %f6, %f20          /*  FPA         Group           */
360         srl             %o5, 1, %o5             /*  IEU0                        */
361         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
362         add             %o5, %sum, %sum         /*  IEU0                        */
363         std             %f18, [%dst - 24]       /*  Store                       */
364         fpadd32         %f6, %f50, %f48         /*  FPA                         */
365         inc             %g3                     /*  IEU0        Group           */
366         std             %f20, [%dst - 16]       /*  Store                       */
367         add             %src, 32, %src          /*  IEU1                        */
368         faligndata      %f6, %f8, %f22          /*  FPA                         */
369         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
370         srl             %g3, 1, %g3             /*  IEU0                        */
371         std             %f22, [%dst - 8]        /*  Store                       */      
372         add             %g3, %sum, %sum         /*  IEU0        Group           */
373 3:      rd              %asi, %g2               /*  LSU         Group + 4 bubbles*/
374 #ifdef __KERNEL__
375 4:      sethi           %hi(vis0s), %g7         /*  IEU0        Group           */
376         or              %g2, ASI_BLK_OR, %g2    /*  IEU1                        */
377 #else
378 4:      rd              %pc, %g7                /*  LSU         Group + 4 bubbles*/
379 #endif
380         inc             %g5                     /*  IEU0        Group           */
381         and             %src, 0x38, %g3         /*  IEU1                        */      
382         membar          #StoreLoad              /*  LSU         Group           */
383         srl             %g5, 1, %g5             /*  IEU0                        */
384         inc             %o4                     /*  IEU1                        */
385         sll             %g3, 8, %g3             /*  IEU0        Group           */
386         sub             %len, 0xc0, %len        /*  IEU1                        */
387         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
388         srl             %o4, 1, %o4             /*  IEU0                        */
389         add             %g7, %g3, %g7           /*  IEU0        Group           */
390         add             %o4, %sum, %sum         /*  IEU1                        */
391 #ifdef __KERNEL__
392         jmpl            %g7 + %lo(vis0s), %g0   /*  CTI+IEU1    Group           */
393 #else
394         jmpl            %g7 + (vis0s - 4b), %g0 /*  CTI+IEU1    Group           */
395 #endif
396          fzero          %f32                    /*  FPA                         */
397
398         .align          2048
/*
 * vis0s: entry point whose two loads start exactly at the incoming %src
 * (%src is advanced by 128 and the loads address %src-128 and %src-64).
 * NOTE(review): the code that selects among vis0s..vis7s lies outside this
 * section; presumably the choice follows the source's doubleword offset
 * within a 64-byte block - confirm.
 *
 * %asi becomes %g2 ^ ASI_BLK_XOR (wr XORs its two source operands) before
 * the loads into %f0.. and %f16..  The value held in %f48 on entry is moved
 * to %f62 and added to the first loaded doubleword with fpadd32.
 *
 * faligndata fills %f48..%f60 from adjacent input doublewords and %f62 is
 * left holding the last one (%f14).
 *
 * Each fcmpgt32 leaves a per-32-bit-lane mask in %x1..%x8; all but the %x3
 * compare use %f32, which the delay slot of the jmpl preceding this section
 * zeroes.  An "inc; srl 1" pair maps a 2-bit mask to its number of set bits.
 * Here %x1..%x3 are converted and added to %sum, %x4 is converted only, and
 * %x5..%x8 stay as raw masks.
 * NOTE(review): presumably DO_THE_TRICK (defined elsewhere) finishes the
 * accumulation of %x4..%x8 - confirm against the macro.
 */
399 vis0s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
400         add             %src, 128, %src         /*  IEU0        Group           */
401         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
402         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
403         fmovd           %f48, %f62              /*  FPA         Group   f0 available*/
404         faligndata      %f0, %f2, %f48          /*  FPA         Group   f2 available*/
405         fcmpgt32        %f32, %f2, %x1          /*  FPM         Group   f4 available*/
406         fpadd32         %f0, %f62, %f0          /*  FPA                         */
407         fcmpgt32        %f32, %f4, %x2          /*  FPM         Group   f6 available*/
408         faligndata      %f2, %f4, %f50          /*  FPA                         */
409         fcmpgt32        %f62, %f0, %x3          /*  FPM         Group   f8 available*/
410         faligndata      %f4, %f6, %f52          /*  FPA                         */
411         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group   f10 available*/
412         inc             %x1                     /*  IEU0                        */
413         faligndata      %f6, %f8, %f54          /*  FPA                         */
414         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group   f12 available*/
415         srl             %x1, 1, %x1             /*  IEU0                        */
416         inc             %x2                     /*  IEU1                        */
417         faligndata      %f8, %f10, %f56         /*  FPA                         */
418         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group   f14 available*/
419         srl             %x2, 1, %x2             /*  IEU0                        */
420         add             %sum, %x1, %sum         /*  IEU1                        */
421         faligndata      %f10, %f12, %f58        /*  FPA                         */
422         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
423         inc             %x3                     /*  IEU0                        */
424         add             %sum, %x2, %sum         /*  IEU1                        */
425         faligndata      %f12, %f14, %f60        /*  FPA                         */
426         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
427         srl             %x3, 1, %x3             /*  IEU0                        */
428         inc             %x4                     /*  IEU1                        */
429         fmovd           %f14, %f62              /*  FPA                         */
430         srl             %x4, 1, %x4             /*  IEU0        Group           */
431         add             %sum, %x3, %sum         /*  IEU1                        */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks (%f0-%f30, then %f16-%f46, then %f32-%f46 with %f0-%f14).  Each one
 * passes an LDBLK into the bank it is not reading and a STBLK in the first
 * store slot.  The first two are given a bcs to vis0e1/vis0e2; the third is
 * given a bcc back to vis0, with vis0e3 placed directly after it.
 */
432 vis0:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
433                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
434                         ,LDBLK(f32),    STBLK,,,,,,,,
435                         ,bcs,pn %icc, vis0e1)
436         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
437                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
438                         ,LDBLK(f0),     STBLK,,,,,,,,
439                         ,bcs,pn %icc, vis0e2)
440         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
441                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
442                         ,LDBLK(f16),    STBLK,,,,,,,,
443                         ,bcc,pt %icc, vis0)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and seven ST() stores of %f48..%f60, then
 * advances %dst by 56, adds 192 - 8*8 to %len and branches to e2, e3 or e1.
 * NOTE(review): the ST offsets (64 first, then 8..48) presumably reflect
 * the point at which the macro advances %dst; the macro body is not
 * visible here - confirm.
 */
444 vis0e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
445                         ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
446                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
447                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
448 vis0e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
449                         ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
450                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
451                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
452 vis0e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
453                         ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
454                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
455                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
456         .align          2048
/*
 * vis1s: entry point whose two loads start 8 bytes below the incoming %src
 * (%src += 128 - 8; the loads address %src-128 and %src-64).
 *
 * %asi becomes %g2 ^ ASI_BLK_XOR before the loads.  The first loaded
 * doubleword (%f0) is copied to %f58, which the faligndata further down
 * overwrites, and %f0 is then replaced by the value held in %f48 on entry.
 * NOTE(review): the copy into %f58 looks like it exists only to wait for
 * the load before %f0 is overwritten - confirm.
 *
 * faligndata fills %f48..%f58 from %f2..%f14 and %f60 is left holding the
 * last doubleword (%f14).
 *
 * fcmpgt32 against %f32 (zeroed in the delay slot of the jmpl preceding
 * vis0s) leaves lane masks in %x2..%x8.  "inc; srl 1" maps a 2-bit mask to
 * its number of set bits: %x2 and %x3 are converted and added to %sum, %x4
 * is converted only, %x5..%x8 stay as raw masks.  %x1 is not written here.
 * NOTE(review): presumably DO_THE_TRICK (defined elsewhere) consumes
 * %x4..%x8 - confirm against the macro.
 */
457 vis1s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
458         add             %src, 128 - 8, %src     /*  IEU0        Group           */
459         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
460         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
461         fmovd           %f0, %f58               /*  FPA         Group           */
462         fmovd           %f48, %f0               /*  FPA         Group           */
463         fcmpgt32        %f32, %f2, %x2          /*  FPM         Group           */
464         faligndata      %f2, %f4, %f48          /*  FPA                         */
465         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
466         faligndata      %f4, %f6, %f50          /*  FPA                         */
467         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
468         faligndata      %f6, %f8, %f52          /*  FPA                         */
469         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
470         inc             %x2                     /*  IEU1                        */
471         faligndata      %f8, %f10, %f54         /*  FPA                         */
472         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
473         srl             %x2, 1, %x2             /*  IEU0                        */
474         faligndata      %f10, %f12, %f56        /*  FPA                         */
475         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
476         inc             %x3                     /*  IEU0                        */
477         add             %sum, %x2, %sum         /*  IEU1                        */
478         faligndata      %f12, %f14, %f58        /*  FPA                         */
479         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
480         srl             %x3, 1, %x3             /*  IEU0                        */
481         inc             %x4                     /*  IEU1                        */
482         fmovd           %f14, %f60              /*  FPA                         */
483         srl             %x4, 1, %x4             /*  IEU0        Group           */
484         add             %sum, %x3, %sum         /*  IEU1                        */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by one (%f62,%f48..%f60) and STBLK
 * in the second store slot.  The first two are given a bcs to
 * vis1e1/vis1e2; the third is given a bcc back to vis1, with vis1e3 placed
 * directly after it.
 */
485 vis1:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
486                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
487                         ,LDBLK(f32),    ,STBLK,,,,,,,
488                         ,bcs,pn %icc, vis1e1)
489         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
490                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
491                         ,LDBLK(f0),     ,STBLK,,,,,,,
492                         ,bcs,pn %icc, vis1e2)
493         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
494                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
495                         ,LDBLK(f16),    ,STBLK,,,,,,,
496                         ,bcc,pt %icc, vis1)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and six ST() stores of %f48..%f58 at offsets
 * 0..40, then advances %dst by 48, adds 192 - 7*8 to %len and branches to
 * e2, e3 or e1.
 */
497 vis1e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
498                         ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
499                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
500                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
501 vis1e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
502                         ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
503                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
504                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
505 vis1e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
506                         ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
507                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
508                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
509         .align          2048
/*
 * vis2s: entry point whose two loads start 16 bytes below the incoming
 * %src (%src += 128 - 16; the loads address %src-128 and %src-64).
 *
 * %asi becomes %g2 ^ ASI_BLK_XOR before the loads.  %f0 is copied to %f56
 * (overwritten by the faligndata further down) and then replaced by the
 * value held in %f48 on entry; %f2 is zeroed by subtracting it from itself.
 * NOTE(review): presumably this keeps the two doublewords loaded from below
 * %src out of the checksum - confirm.
 *
 * %dst is moved back by 64 here; the exit variants use store offsets of 64
 * and up.  faligndata fills %f48..%f56 from %f4..%f14 and %f58 is left
 * holding the last doubleword (%f14).
 *
 * fcmpgt32 against %f32 (zeroed in the delay slot of the jmpl preceding
 * vis0s) leaves lane masks in %x3..%x8.  "inc; srl 1" maps a 2-bit mask to
 * its number of set bits: %x3 is converted and added to %sum, %x4 is
 * converted only, %x5..%x8 stay as raw masks.
 */
510 vis2s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
511         add             %src, 128 - 16, %src    /*  IEU0        Group           */
512         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
513         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
514         fmovd           %f0, %f56               /*  FPA         Group           */
515         fmovd           %f48, %f0               /*  FPA         Group           */      
516         sub             %dst, 64, %dst          /*  IEU0                        */
517         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
518         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
519         faligndata      %f4, %f6, %f48          /*  FPA                         */
520         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
521         faligndata      %f6, %f8, %f50          /*  FPA                         */
522         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
523         faligndata      %f8, %f10, %f52         /*  FPA                         */
524         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
525         faligndata      %f10, %f12, %f54        /*  FPA                         */
526         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
527         inc             %x3                     /*  IEU0                        */
528         faligndata      %f12, %f14, %f56        /*  FPA                         */
529         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
530         srl             %x3, 1, %x3             /*  IEU0                        */
531         inc             %x4                     /*  IEU1                        */
532         fmovd           %f14, %f58              /*  FPA                         */
533         srl             %x4, 1, %x4             /*  IEU0        Group           */
534         add             %sum, %x3, %sum         /*  IEU1                        */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by two (%f60,%f62,%f48..%f58) and
 * STBLK in the third store slot.  The first two are given a bcs to
 * vis2e1/vis2e2; the third is given a bcc back to vis2, with vis2e3 placed
 * directly after it.
 */
535 vis2:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
536                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
537                         ,LDBLK(f32),    ,,STBLK,,,,,,
538                         ,bcs,pn %icc, vis2e1)
539         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
540                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
541                         ,LDBLK(f0),     ,,STBLK,,,,,,
542                         ,bcs,pn %icc, vis2e2)
543         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
544                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
545                         ,LDBLK(f16),    ,,STBLK,,,,,,
546                         ,bcc,pt %icc, vis2)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and five ST() stores of %f48..%f56 at offsets
 * 64..96, then advances %dst by 104, adds 192 - 6*8 to %len and branches to
 * e2, e3 or e1.
 */
547 vis2e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
548                         ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
549                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
550                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
551 vis2e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
552                         ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
553                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
554                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
555 vis2e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
556                         ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
557                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
558                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
559         .align          2048
/*
 * vis3s: entry point whose two loads start 24 bytes below the incoming
 * %src (%src += 128 - 24; the loads address %src-128 and %src-64).
 *
 * %asi becomes %g2 ^ ASI_BLK_XOR before the loads.  %f0 is copied to %f54
 * (overwritten by the faligndata further down) and then replaced by the
 * value held in %f48 on entry; %f2 and %f4 are zeroed by subtracting each
 * from itself.
 * NOTE(review): presumably this keeps the three doublewords loaded from
 * below %src out of the checksum - confirm.
 *
 * %dst is moved back by 64 here; the exit variants use store offsets of 64
 * and up.  faligndata fills %f48..%f54 from %f6..%f14 and %f56 is left
 * holding the last doubleword (%f14).
 *
 * fcmpgt32 against %f32 (zeroed in the delay slot of the jmpl preceding
 * vis0s) leaves lane masks in %x4..%x8.  Only %x4 is converted with
 * "inc; srl 1" (2-bit mask -> number of set bits); nothing is added to
 * %sum in this preamble and %x5..%x8 stay as raw masks.
 */
560 vis3s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
561         add             %src, 128 - 24, %src    /*  IEU0        Group           */
562         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
563         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
564         fmovd           %f0, %f54               /*  FPA         Group           */
565         fmovd           %f48, %f0               /*  FPA         Group           */
566         sub             %dst, 64, %dst          /*  IEU0                        */
567         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
568         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
569         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
570         faligndata      %f6, %f8, %f48          /*  FPA                         */
571         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
572         faligndata      %f8, %f10, %f50         /*  FPA                         */
573         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
574         faligndata      %f10, %f12, %f52        /*  FPA                         */
575         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
576         faligndata      %f12, %f14, %f54        /*  FPA                         */
577         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
578         fmovd           %f14, %f56              /*  FPA                         */
579         inc             %x4                     /*  IEU0                        */
580         srl             %x4, 1, %x4             /*  IEU0        Group           */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by three (%f58..%f62,%f48..%f56) and
 * STBLK in the fourth store slot.  The first two are given a bcs to
 * vis3e1/vis3e2; the third is given a bcc back to vis3, with vis3e3 placed
 * directly after it.
 */
581 vis3:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
582                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
583                         ,LDBLK(f32),    ,,,STBLK,,,,,
584                         ,bcs,pn %icc, vis3e1)
585         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
586                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
587                         ,LDBLK(f0),     ,,,STBLK,,,,,
588                         ,bcs,pn %icc, vis3e2)
589         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
590                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
591                         ,LDBLK(f16),    ,,,STBLK,,,,,
592                         ,bcc,pt %icc, vis3)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and four ST() stores of %f48..%f54 at offsets
 * 64..88, then advances %dst by 96, adds 192 - 5*8 to %len and branches to
 * e2, e3 or e1.
 */
593 vis3e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
594                         ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
595                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
596                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
597 vis3e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
598                         ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
599                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
600                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
601 vis3e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
602                         ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
603                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
604                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
605         .align          2048
/*
 * vis4s: entry point whose two loads start 32 bytes below the incoming
 * %src (%src += 128 - 32; the loads address %src-128 and %src-64).
 *
 * %asi becomes %g2 ^ ASI_BLK_XOR before the loads.  %f0 is copied to %f52
 * (overwritten by the faligndata further down) and then replaced by the
 * value held in %f48 on entry; %f2, %f4 and %f6 are zeroed by subtracting
 * each from itself.
 * NOTE(review): presumably this keeps the four doublewords loaded from
 * below %src out of the checksum - confirm.
 *
 * %dst is moved back by 64 here; the exit variants use store offsets of 64
 * and up.  faligndata fills %f48..%f52 from %f8..%f14 and %f54 is left
 * holding the last doubleword (%f14).
 *
 * %x4 is simply cleared.  fcmpgt32 against %f32 (zeroed in the delay slot
 * of the jmpl preceding vis0s) leaves raw lane masks in %x5..%x8; nothing
 * is added to %sum in this preamble.
 */
606 vis4s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
607         add             %src, 128 - 32, %src    /*  IEU0        Group           */
608         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
609         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
610         fmovd           %f0, %f52               /*  FPA         Group           */
611         fmovd           %f48, %f0               /*  FPA         Group           */
612         sub             %dst, 64, %dst          /*  IEU0                        */
613         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
614         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
615         fpsub32         %f6, %f6, %f6           /*  FPA         Group           */
616         clr             %x4                     /*  IEU0                        */
617         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
618         faligndata      %f8, %f10, %f48         /*  FPA                         */
619         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
620         faligndata      %f10, %f12, %f50        /*  FPA                         */
621         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
622         faligndata      %f12, %f14, %f52        /*  FPA                         */
623         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
624         fmovd           %f14, %f54              /*  FPA                         */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by four (%f56..%f62,%f48..%f54) and
 * STBLK in the fifth store slot.  The first two are given a bcs to
 * vis4e1/vis4e2; the third is given a bcc back to vis4, with vis4e3 placed
 * directly after it.
 */
625 vis4:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
626                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
627                         ,LDBLK(f32),    ,,,,STBLK,,,,
628                         ,bcs,pn %icc, vis4e1)
629         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
630                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
631                         ,LDBLK(f0),     ,,,,STBLK,,,,
632                         ,bcs,pn %icc, vis4e2)
633         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
634                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
635                         ,LDBLK(f16),    ,,,,STBLK,,,,
636                         ,bcc,pt %icc, vis4)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and three ST() stores of %f48..%f52 at offsets
 * 64..80, then advances %dst by 88, adds 192 - 4*8 to %len and branches to
 * e2, e3 or e1.
 */
637 vis4e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
638                         ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
639                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
640                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
641 vis4e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
642                         ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
643                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
644                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
645 vis4e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
646                         ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
647                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
648                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
649         .align          2048
/*
 * vis5s: entry point for a source that sits 40 bytes into its 64-byte
 * slot layout (%src += 128 - 40).  Unlike vis0s..vis4s, nothing is loaded
 * from below the incoming %src: three single doublewords are loaded from
 * the incoming %src, %src+8 and %src+16 into %f10, %f12 and %f14 using the
 * %asi in effect on entry.  Only after that is %asi switched to
 * %g2 ^ ASI_BLK_XOR for the load into %f16.. from %src-64.
 *
 * %f0 takes the value held in %f48 on entry.  %f2..%f8 are zeroed by
 * multiplying or adding %f32 with itself; %f32 is zeroed in the delay slot
 * of the jmpl preceding vis0s, so both operations yield zero (the existing
 * slot comments show fmuld on FPM and faddd on FPA).
 *
 * %x4 and %x5 are cleared.  fcmpgt32 against %f32 leaves raw lane masks in
 * %x6..%x8; nothing is added to %sum in this preamble.  %dst is moved back
 * by 64; the exit variants use store offsets of 64 and up.  faligndata
 * fills %f48 and %f50, and %f52 is left holding the last doubleword (%f14).
 */
650 vis5s:  add             %src, 128 - 40, %src    /*  IEU0        Group           */
651         ldda            [%src-88] %asi, %f10    /*  Load        Group           */
652         ldda            [%src-80] %asi, %f12    /*  Load        Group           */
653         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
654         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
655         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
656         fmovd           %f48, %f0               /*  FPA         Group           */
657         fmuld           %f32, %f32, %f2         /*  FPM                         */
658         clr             %x4                     /*  IEU0                        */
659         faddd           %f32, %f32, %f4         /*  FPA         Group           */
660         fmuld           %f32, %f32, %f6         /*  FPM                         */
661         clr             %x5                     /*  IEU0                        */
662         faddd           %f32, %f32, %f8         /*  FPA         Group           */
663         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
664         sub             %dst, 64, %dst          /*  IEU0                        */
665         faligndata      %f10, %f12, %f48        /*  FPA                         */
666         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
667         faligndata      %f12, %f14, %f50        /*  FPA                         */
668         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
669         fmovd           %f14, %f52              /*  FPA                         */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by five (%f54..%f62,%f48..%f52) and
 * STBLK in the sixth store slot.  The first two are given a bcs to
 * vis5e1/vis5e2; the third is given a bcc back to vis5, with vis5e3 placed
 * directly after it.
 */
670 vis5:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
671                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
672                         ,LDBLK(f32),    ,,,,,STBLK,,,
673                         ,bcs,pn %icc, vis5e1)
674         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
675                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
676                         ,LDBLK(f0),     ,,,,,STBLK,,,
677                         ,bcs,pn %icc, vis5e2)
678         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
679                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
680                         ,LDBLK(f16),    ,,,,,STBLK,,,
681                         ,bcc,pt %icc, vis5)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and two ST() stores of %f48 and %f50 at offsets
 * 64 and 72, then advances %dst by 80, adds 192 - 3*8 to %len and branches
 * to e2, e3 or e1.
 */
682 vis5e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
683                         ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
684                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
685                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
686 vis5e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
687                         ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
688                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
689                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
690 vis5e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
691                         ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
692                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
693                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
694         .align          2048
/*
 * vis6s: entry point for a source that sits 48 bytes into its 64-byte
 * slot layout (%src += 128 - 48).  Nothing is loaded from below the
 * incoming %src: two single doublewords are loaded from the incoming %src
 * and %src+8 into %f12 and %f14 using the %asi in effect on entry.  Only
 * after that is %asi switched to %g2 ^ ASI_BLK_XOR for the load into
 * %f16.. from %src-64.
 *
 * %f0 takes the value held in %f48 on entry.  %f2..%f10 are zeroed by
 * multiplying or adding %f32 with itself; %f32 is zeroed in the delay slot
 * of the jmpl preceding vis0s.
 *
 * %x4, %x5 and %x6 are cleared.  fcmpgt32 against %f32 leaves raw lane
 * masks in %x7 and %x8; nothing is added to %sum in this preamble.  %dst is
 * moved back by 64; the exit variants use a store offset of 64.
 * faligndata fills %f48, and %f50 is left holding the last doubleword
 * (%f14).
 */
695 vis6s:  add             %src, 128 - 48, %src    /*  IEU0        Group           */
696         ldda            [%src-80] %asi, %f12    /*  Load        Group           */
697         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
698         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
699         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
700         fmovd           %f48, %f0               /*  FPA         Group           */
701         fmuld           %f32, %f32, %f2         /*  FPM                         */
702         clr             %x4                     /*  IEU0                        */
703         faddd           %f32, %f32, %f4         /*  FPA         Group           */
704         fmuld           %f32, %f32, %f6         /*  FPM                         */
705         clr             %x5                     /*  IEU0                        */
706         faddd           %f32, %f32, %f8         /*  FPA         Group           */
707         fmuld           %f32, %f32, %f10        /*  FPM                         */
708         clr             %x6                     /*  IEU0                        */
709         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
710         sub             %dst, 64, %dst          /*  IEU0                        */
711         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
712         faligndata      %f12, %f14, %f48        /*  FPA                         */
713         fmovd           %f14, %f50              /*  FPA         Group           */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by six (%f52..%f62,%f48,%f50) and
 * STBLK in the seventh store slot.  The first two are given a bcs to
 * vis6e1/vis6e2; the third is given a bcc back to vis6, with vis6e3 placed
 * directly after it.
 */
714 vis6:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
715                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
716                         ,LDBLK(f32),    ,,,,,,STBLK,,
717                         ,bcs,pn %icc, vis6e1)
718         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
719                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
720                         ,LDBLK(f0),     ,,,,,,STBLK,,
721                         ,bcs,pn %icc, vis6e2)
722         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
723                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
724                         ,LDBLK(f16),    ,,,,,,STBLK,,
725                         ,bcc,pt %icc, vis6)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load, a final STBLK and a single ST() store of %f48 at offset 64,
 * then advances %dst by 72, adds 192 - 2*8 to %len and branches to e2, e3
 * or e1.
 */
726 vis6e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
727                         ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
728                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
729                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
730 vis6e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
731                         ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
732                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
733                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
734 vis6e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
735                         ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
736                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
737                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
738         .align          2048
/*
 * vis7s: entry point for a source that sits 56 bytes into its 64-byte
 * slot layout (%src += 128 - 56).  Nothing is loaded from below the
 * incoming %src: one doubleword is loaded from the incoming %src into %f14
 * using the %asi in effect on entry.  Only after that is %asi switched to
 * %g2 ^ ASI_BLK_XOR for the load into %f16.. from %src-64.
 *
 * %f0 takes the value held in %f48 on entry.  %f2..%f12 are zeroed by
 * multiplying or adding %f32 with itself; %f32 is zeroed in the delay slot
 * of the jmpl preceding vis0s.
 *
 * %x4..%x7 are cleared.  fcmpgt32 against %f32 leaves a raw lane mask in
 * %x8; nothing is added to %sum in this preamble.  %dst is moved back by
 * 64.  No faligndata is issued here: %f48 is left holding the single
 * loaded doubleword (%f14).
 */
739 vis7s:  add             %src, 128 - 56, %src    /*  IEU0        Group           */
740         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
741         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
742         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
743         fmovd           %f48, %f0               /*  FPA         Group           */
744         fmuld           %f32, %f32, %f2         /*  FPM                         */
745         clr             %x4                     /*  IEU0                        */
746         faddd           %f32, %f32, %f4         /*  FPA         Group           */
747         fmuld           %f32, %f32, %f6         /*  FPM                         */
748         clr             %x5                     /*  IEU0                        */
749         faddd           %f32, %f32, %f8         /*  FPA         Group           */
750         fmuld           %f32, %f32, %f10        /*  FPM                         */
751         clr             %x6                     /*  IEU0                        */
752         faddd           %f32, %f32, %f12        /*  FPA         Group           */
753         clr             %x7                     /*  IEU0                        */
754         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
755         sub             %dst, 64, %dst          /*  IEU0                        */
756         fmovd           %f14, %f48              /*  FPA                         */
/*
 * Main loop: three DO_THE_TRICK expansions rotating through the register
 * banks, with the output list rotated by seven (%f50..%f62,%f48) and STBLK
 * in the eighth store slot.  The first two are given a bcs to
 * vis7e1/vis7e2; the third is given a bcc back to vis7, with vis7e3 placed
 * directly after it.
 */
757 vis7:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
758                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
759                         ,LDBLK(f32),    ,,,,,,,STBLK,
760                         ,bcs,pn %icc, vis7e1)
761         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
762                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
763                         ,LDBLK(f0),     ,,,,,,,STBLK,
764                         ,bcs,pn %icc, vis7e2)
765         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
766                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
767                         ,LDBLK(f16),    ,,,,,,,STBLK,
768                         ,bcc,pt %icc, vis7)
/*
 * Exit variants, one per bank rotation.  Each passes SYNC in place of a
 * block load and a final STBLK with no trailing ST() stores, then advances
 * %dst by 64, adds 192 - 1*8 to %len and branches to e2, e3 or e1.
 */
769 vis7e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
770                         ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
771                         ,SYNC,          ,,,,,,,STBLK,
772                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
773 vis7e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
774                         ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
775                         ,SYNC,          ,,,,,,,STBLK,
776                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
777 vis7e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
778                         ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
779                         ,SYNC,          ,,,,,,,STBLK,
780                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
781 e1:     END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
782 e2:     END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
783 e3:     END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
784 ett:    rd              %asi, %x4               /*  LSU         Group+4bubbles  */
785         rd              %gsr, %x3               /*  LSU         Group+4bubbles  */
786 #ifdef __KERNEL__
787         srl             %x4, 3, %x5             /*  IEU0        Group           */
788         xor             %x4, ASI_BLK_XOR1, %x4  /*  IEU1                        */
789         wr              %x4, %x5, %asi          /*  LSU         Group+4bubbles  */
790 #else
791         wr              %x4, ASI_BLK_XOR, %asi  /*  LSU         Group+4bubbles  */
792 #endif
793         andcc           %x3, 7, %x3             /*  IEU1        Group           */
794         add             %dst, 8, %dst           /*  IEU0                        */
795         bne,pn          %icc, 1f                /*  CTI                         */
796          fzero          %f10                    /*  FPA                         */
797         brz,a,pn        %len, 2f                /*  CTI+IEU1    Group           */
798          std            %f6, [%dst - 8]         /*  Store                       */
799 1:      cmp             %len, 8                 /*  IEU1                        */
800         blu,pn          %icc, 3f                /*  CTI                         */
801          sub            %src, 64, %src          /*  IEU0        Group           */
802 1:      ldda            [%src] %asi, %f2        /*  Load        Group           */
803         fpadd32         %f10, %f2, %f12         /*  FPA         Group+load stall*/
804         add             %src, 8, %src           /*  IEU0                        */
805         add             %dst, 8, %dst           /*  IEU1                        */
806         faligndata      %f6, %f2, %f14          /*  FPA         Group           */
807         fcmpgt32        %f10, %f12, %x5         /*  FPM         Group           */
808         std             %f14, [%dst - 16]       /*  Store                       */
809         fmovd           %f2, %f6                /*  FPA                         */
810         fmovd           %f12, %f10              /*  FPA         Group           */
811         sub             %len, 8, %len           /*  IEU1                        */
812         fzero           %f16                    /*  FPA         Group - FPU nop */
813         fzero           %f18                    /*  FPA         Group - FPU nop */
814         inc             %x5                     /*  IEU0                        */
815         srl             %x5, 1, %x5             /*  IEU0        Group (regdep)  */
816         cmp             %len, 8                 /*  IEU1                        */
817         bgeu,pt         %icc, 1b                /*  CTI                         */
818          add            %x5, %sum, %sum         /*  IEU0        Group           */
819 3:      brz,a,pt        %x3, 2f                 /*  CTI+IEU1                    */
820          std            %f6, [%dst - 8]         /*  Store       Group           */
821         st              %f7, [%dst - 8]         /*  Store       Group           */
822         sub             %dst, 4, %dst           /*  IEU0                        */
823         add             %len, 4, %len           /*  IEU1                        */
824 2:
825 #ifdef __KERNEL__
826         sub             %sp, 8, %sp             /*  IEU0        Group           */
827 #endif
828         END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
829         membar          #Sync                   /*  LSU         Group           */
830 #ifdef __KERNEL__
831         VISExit
832         add             %sp, 8, %sp             /*  IEU0        Group           */
833 #endif
834 23:     brnz,pn         %len, 26f               /*  CTI+IEU1    Group           */
835 24:      sllx           %sum, 32, %g1           /*  IEU0                        */
836 25:     addcc           %sum, %g1, %src         /*  IEU1        Group           */
837         srlx            %src, 32, %src          /*  IEU0        Group (regdep)  */
838         bcs,a,pn        %xcc, 1f                /*  CTI                         */
839          add            %src, 1, %src           /*  IEU1                        */
840 #ifndef __KERNEL__
841 1:      retl                                    /*  CTI         Group brk forced*/
842          srl            %src, 0, %src           /*  IEU0                        */
843 #else
844 1:      sethi           %uhi(PAGE_OFFSET), %g4  /*  IEU0        Group           */
845         retl                                    /*  CTI         Group brk forced*/
846          sllx           %g4, 32, %g4            /*  IEU0                        */
847 #endif
848 26:     andcc           %len, 8, %g0            /*  IEU1        Group           */
849         be,pn           %icc, 1f                /*  CTI                         */
850          lduwa          [%src] %asi, %o4        /*  Load                        */
851         lduwa           [%src+4] %asi, %g2      /*  Load        Group           */
852         add             %src, 8, %src           /*  IEU0                        */
853         add             %dst, 8, %dst           /*  IEU1                        */
854         sllx            %o4, 32, %g5            /*  IEU0        Group           */
855         stw             %o4, [%dst - 8]         /*  Store                       */
856         or              %g5, %g2, %g5           /*  IEU0        Group           */
857         stw             %g2, [%dst - 4]         /*  Store                       */
858         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
859         bcs,a,pn        %xcc, 1f                /*  CTI                         */
860          add            %sum, 1, %sum           /*  IEU0                        */
861 1:      andcc           %len, 4, %g0            /*  IEU1        Group           */
862         be,a,pn         %icc, 1f                /*  CTI                         */
863          clr            %g2                     /*  IEU0                        */
864         lduwa           [%src] %asi, %g7        /*  Load                        */
865         add             %src, 4, %src           /*  IEU0        Group           */
866         add             %dst, 4, %dst           /*  IEU1                        */
867         sllx            %g7, 32, %g2            /*  IEU0        Group           */
868         stw             %g7, [%dst - 4]         /*  Store                       */
869 1:      andcc           %len, 2, %g0            /*  IEU1                        */
870         be,a,pn         %icc, 1f                /*  CTI                         */
871          clr            %g3                     /*  IEU0        Group           */
872         lduha           [%src] %asi, %g7        /*  Load                        */
873         add             %src, 2, %src           /*  IEU1                        */
874         add             %dst, 2, %dst           /*  IEU0        Group           */
875         sll             %g7, 16, %g3            /*  IEU0        Group           */
876         sth             %g7, [%dst - 2]         /*  Store                       */
877 1:      andcc           %len, 1, %g0            /*  IEU1                        */
878         be,a,pn         %icc, 1f                /*  CTI                         */
879          clr            %o5                     /*  IEU0        Group           */
880         lduba           [%src] %asi, %g7        /*  Load                        */
881         sll             %g7, 8, %o5             /*  IEU0        Group           */
882         stb             %g7, [%dst]             /*  Store                       */
883 1:      or              %g2, %g3, %g3           /*  IEU1                        */
884         or              %o5, %g3, %g3           /*  IEU0        Group (regdep)  */
885         addcc           %g3, %sum, %sum         /*  IEU1        Group (regdep)  */
886         bcs,a,pn        %xcc, 1f                /*  CTI                         */
887          add            %sum, 1, %sum           /*  IEU0                        */
888 1:      ba,pt           %xcc, 25b               /*  CTI         Group           */
889          sllx           %sum, 32, %g1           /*  IEU0                        */
890
891 #ifdef __KERNEL__
/* Kernel builds only: define a label at the end of the code above and
 * emit a single record into the __ex_table section that refers to it.
 */
892 end:
/* `end` is the address immediately following the last instruction above. */
893
894         .section        __ex_table
895         .align          4
/* One record of four words, in this order:
 *   csum_partial_copy_vis, 0, end, cpc_handler
 * csum_partial_copy_vis and cpc_handler are symbols defined outside this
 * part of the file.
 * NOTE(review): presumably this is a "range" exception-table entry --
 * start address, a zero marker, end address, then the fixup handler to
 * branch to when an access between start and end faults -- confirm
 * against the sparc64 exception-table lookup code.
 */
896         .word           csum_partial_copy_vis, 0, end, cpc_handler
897 #endif