+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
- * LibTomMath is library that provides for multiple-precision
+ * LibTomMath is a library that provides multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
- * The library is designed directly after the MPI library by
+ * The library was designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
*
* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
*/
-#include <tommath.h>
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed. This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2". The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
-int
-fast_s_mp_sqr (mp_int * a, mp_int * b)
+/* the jist of squaring...
+ * you do like mult except the offset of the tmpx [one that
+ * starts closer to zero] can't equal the offset of tmpy.
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens. You double all those
+ * you add in the inner loop
+
+After that loop you do the squares and add them in.
+*/
+
+int fast_s_mp_sqr (mp_int * a, mp_int * b)
{
- int olduse, newused, res, ix, pa;
- mp_word W2[MP_WARRAY], W[MP_WARRAY];
-
- /* calculate size of product and allocate as required */
- pa = a->used;
- newused = pa + pa + 1;
- if (b->alloc < newused) {
- if ((res = mp_grow (b, newused)) != MP_OKAY) {
+ int olduse, res, pa, ix, iz;
+ mp_digit W[MP_WARRAY], *tmpx;
+ mp_word W1;
+
+ /* grow the destination as required */
+ pa = a->used + a->used;
+ if (b->alloc < pa) {
+ if ((res = mp_grow (b, pa)) != MP_OKAY) {
return res;
}
}
- /* zero temp buffer (columns)
- * Note that there are two buffers. Since squaring requires
- * a outter and inner product and the inner product requires
- * computing a product and doubling it (a relatively expensive
- * op to perform n**2 times if you don't have to) the inner and
- * outer products are computed in different buffers. This way
- * the inner product can be doubled using n doublings instead of
- * n**2
- */
- memset (W, 0, newused * sizeof (mp_word));
- memset (W2, 0, newused * sizeof (mp_word));
-
- /* This computes the inner product. To simplify the inner N**2 loop
- * the multiplication by two is done afterwards in the N loop.
- */
- for (ix = 0; ix < pa; ix++) {
- /* compute the outer product
- *
- * Note that every outer product is computed
- * for a particular column only once which means that
- * there is no need todo a double precision addition
- */
- W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-
- {
- register mp_digit tmpx, *tmpy;
- register mp_word *_W;
- register int iy;
-
- /* copy of left side */
- tmpx = a->dp[ix];
-
- /* alias for right side */
- tmpy = a->dp + (ix + 1);
-
- /* the column to store the result in */
- _W = W + (ix + ix + 1);
-
- /* inner products */
- for (iy = ix + 1; iy < pa; iy++) {
- *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+ /* number of output digits to produce */
+ W1 = 0;
+ for (ix = 0; ix < pa; ix++) {
+ int tx, ty, iy;
+ mp_word _W;
+ mp_digit *tmpy;
+
+ /* clear counter */
+ _W = 0;
+
+ /* get offsets into the two bignums */
+ ty = MIN(a->used-1, ix);
+ tx = ix - ty;
+
+ /* setup temp aliases */
+ tmpx = a->dp + tx;
+ tmpy = a->dp + ty;
+
+ /* this is the number of times the loop will iterrate, essentially
+ while (tx++ < a->used && ty-- >= 0) { ... }
+ */
+ iy = MIN(a->used-tx, ty+1);
+
+ /* now for squaring tx can never equal ty
+ * we halve the distance since they approach at a rate of 2x
+ * and we have to round because odd cases need to be executed
+ */
+ iy = MIN(iy, (ty-tx+1)>>1);
+
+ /* execute loop */
+ for (iz = 0; iz < iy; iz++) {
+ _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
}
- }
+
+ /* double the inner product and add carry */
+ _W = _W + _W + W1;
+
+ /* even columns have the square term in them */
+ if ((ix&1) == 0) {
+ _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+ }
+
+ /* store it */
+ W[ix] = (mp_digit)(_W & MP_MASK);
+
+ /* make next carry */
+ W1 = _W >> ((mp_word)DIGIT_BIT);
}
/* setup dest */
olduse = b->used;
- b->used = newused;
+ b->used = a->used+a->used;
- /* now compute digits */
{
- register mp_digit *tmpb;
-
- /* double first value, since the inner products are
- * half of what they should be
- */
- W[0] += W[0] + W2[0];
-
+ mp_digit *tmpb;
tmpb = b->dp;
- for (ix = 1; ix < newused; ix++) {
- /* double/add next digit */
- W[ix] += W[ix] + W2[ix];
-
- W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
- *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+ for (ix = 0; ix < pa; ix++) {
+ *tmpb++ = W[ix] & MP_MASK;
}
- /* set the last value. Note even if the carry is zero
- * this is required since the next step will not zero
- * it if b originally had a value at b->dp[2*a.used]
- */
- *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
- /* clear high digits */
+ /* clear unused digits [that existed in the old copy of c] */
for (; ix < olduse; ix++) {
*tmpb++ = 0;
}
}
-
mp_clamp (b);
return MP_OKAY;
}
+#endif