diff options
Diffstat (limited to 'mpi')
-rw-r--r-- | mpi/ChangeLog | 27 | ||||
-rw-r--r-- | mpi/Makefile.am | 8 | ||||
-rw-r--r-- | mpi/config.links | 82 | ||||
-rw-r--r-- | mpi/longlong.h | 14 | ||||
-rw-r--r-- | mpi/mpi-internal.h | 19 | ||||
-rw-r--r-- | mpi/mpi-pow.c | 21 | ||||
-rw-r--r-- | mpi/mpih-div.c | 4 | ||||
-rw-r--r-- | mpi/mpih-mul.c | 119 | ||||
-rw-r--r-- | mpi/power/distfiles | 7 | ||||
-rw-r--r-- | mpi/power/mpih-add1.S | 86 | ||||
-rw-r--r-- | mpi/power/mpih-lshift.S | 64 | ||||
-rw-r--r-- | mpi/power/mpih-mul1.S | 115 | ||||
-rw-r--r-- | mpi/power/mpih-mul2.S | 130 | ||||
-rw-r--r-- | mpi/power/mpih-mul3.S | 135 | ||||
-rw-r--r-- | mpi/power/mpih-rshift.S | 64 | ||||
-rw-r--r-- | mpi/power/mpih-sub1.S | 87 |
16 files changed, 924 insertions, 58 deletions
diff --git a/mpi/ChangeLog b/mpi/ChangeLog index d9e7faa30..7119d1e2a 100644 --- a/mpi/ChangeLog +++ b/mpi/ChangeLog @@ -1,3 +1,30 @@ +Mon Jul 17 16:35:47 CEST 2000 Werner Koch <wk@> + + * power/: Add all files from GMP for this CPU. Converted comments to + CPP comments because some ASes complain about ' in comments. + + * config.links: Support for BSDI 4.x; by Wayne Chapeskie. Add support + for FreeBSD 5 and made the case stmt looking nicer; by Jun Kuriyama. + Add support for NetBSD. + (sparc8): Made the search path the same as sparc9 + (sparc64-unknown-linux-gnu): use udiv module; by Adam Mitchell. + + * Makefile.am: c/SFLAGS/ASFLAGS/. This has only been used by the + powerpc and actually never passed the -Wa,foo to the cc. + + * mpih-div.c (mpihelp_divrem): The MPN_COPY_DECR copied one element + too many. This is a gmp2.0.2p9.txt patch. + + * longlong.h (umul_ppmm): Fixes for ARM-4. By Sean MacLennan. + + * mpi-internal.h (karatsuba_ctx): New. + * mpih-mul.c (mpihelp_release_karatsuba_ctx): New. + (mpihelp_mul_karatsuba_case): New. + (mpihelp_mul): Splitted to make use of the new functions. + * mpi-pow.c (mpi_powm): Make use of the new splitted function to avoid + multiple allocation of temporary memory during the karatsuba operations. + * mpi_mpow.c: Removed the unused Barrett code. + 2000-03-21 16:17:30 Werner Koch ([email protected]) * config.links: Add support for FreeBSD 5. diff --git a/mpi/Makefile.am b/mpi/Makefile.am index 98ad5fcc3..f7567e5f2 100644 --- a/mpi/Makefile.am +++ b/mpi/Makefile.am @@ -3,12 +3,13 @@ INCLUDES = -I$(top_srcdir)/gcrypt CFLAGS = @CFLAGS@ @MPI_OPT_FLAGS@ -SFLAGS = @MPI_SFLAGS@ +ASFLAGS = @MPI_SFLAGS@ EXTRA_DIST = config.links DISTCLEANFILES = mpih-add1.S mpih-mul1.S mpih-mul2.S mpih-mul3.S \ mpih-lshift.S mpih-rshift.S mpih-sub1.S asm-syntax.h sysdep.h # Note: we only use .S files so we should delete all left over .s +# CLEANFILES = _*.s CLEANFILES = *.s noinst_LTLIBRARIES = libmpi.la @@ -56,4 +57,9 @@ libmpi_la_LIBADD = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@ .S.s: $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' >$*.s +# Hmmm, we should use this, so that OSes which do not distinguish +# filename case still work. We have to see how libtool can handle this +# $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' > _$*.s +# $(COMPILE) -c _$*.s +# mv -f _$*.o $*.o diff --git a/mpi/config.links b/mpi/config.links index 40125e47b..6a2cbfb53 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -1,4 +1,4 @@ -# sourced my ../configure to get the list of files to link +# sourced by ../configure to get the list of files to link # this should set $mpi_ln_src and mpi_ln_dst. # Note: this is called from the above directory. @@ -12,23 +12,40 @@ echo '/* created by config.links - do not edit */' >./mpi/asm-syntax.h if test "$try_asm_modules" = "yes" ; then case "${target}" in - i[34]86*-*-freebsd*-elf | i[34]86*-*-freebsd[3-9]* | i[34]86*-*-freebsdelf*) + i[34]86*-*-freebsd*-elf | \ + i[34]86*-*-freebsd[3-9]* | \ + i[34]86*-*-freebsdelf* | \ + i[34]86*-*-netbsd* ) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" ;; - i[56]86*-*-freebsd*-elf | i[56]86*-*-freebsd[3-9]* | i[56]86*-*-freebsdelf*) + i[56]86*-*-freebsd*-elf | \ + i[56]86*-*-freebsd[3-9]* | \ + i[56]86*-*-freebsdelf* | \ + i[56]86*-*-netbsd* | \ + pentium-*-netbsd* | \ + pentiumpro-*-netbsd*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i586 i386" ;; - i[34]86*-*-linuxaout* | i[34]86*-*-linuxoldld* | i[34]86*-*-*bsd*) + i[34]86*-*-bsdi4*) + echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h + cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h + path="i386" + ;; + i[34]86*-*-linuxaout* | \ + i[34]86*-*-linuxoldld* | \ + i[34]86*-*-*bsd*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" ;; - i[56]86*-*-linuxaout* | i[56]86*-*-linuxoldld* | i[56]86*-*-*bsd*) + i[56]86*-*-linuxaout* | \ + i[56]86*-*-linuxoldld* | \ + i[56]86*-*-*bsd*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h @@ -49,7 +66,9 @@ case "${target}" in cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" ;; - i[56]86*-*-* | pentium-*-* | pentiumpro-*-*) + i[56]86*-*-* | \ + pentium-*-* | \ + pentiumpro-*-*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i586 i386" @@ -74,13 +93,23 @@ case "${target}" in path="pa7100 hppa1.1 hppa" mpi_extra_modules="udiv-qrnnd" ;; - sparc9*-*-* | sparc64*-*-* | ultrasparc*-*-*) + sparc64-*-linux-gnu) + # An extra rule because we have an report for this one only. + # Should be compared against the next GMP version + echo '/* configured for sparc64-*-linux-gnu */' >>./mpi/asm-syntax.h + path="sparc32v8 sparc32" + mpi_extra_modules="udiv" + ;; + sparc9*-*-* | \ + sparc64*-*-* | \ + ultrasparc*-*-* ) echo '/* configured for sparc9 or higher */' >>./mpi/asm-syntax.h path="sparc32v8 sparc32" ;; - sparc8*-*-* | microsparc*-*-*) + sparc8*-*-* | \ + microsparc*-*-*) echo '/* configured for sparc8 */' >>./mpi/asm-syntax.h - path="sparc32v8" + path="sparc32v8 sparc32" ;; supersparc*-*-*) echo '/* configured for supersparc */' >>./mpi/asm-syntax.h @@ -92,7 +121,8 @@ case "${target}" in path="sparc32" mpi_extra_modules="udiv" ;; - mips[34]*-*-* | mips*-*-irix6*) + mips[34]*-*-* | \ + mips*-*-irix6*) echo '/* configured for MIPS3 */' >>./mpi/asm-syntax.h path="mips3" ;; @@ -103,7 +133,8 @@ case "${target}" in # Motorola 68k configurations. Let m68k mean 68020-68040. # mc68000 or mc68060 configurations need to be specified explicitly - m680[234]0*-*-linuxaout* | m68k*-*-linuxaout*) + m680[234]0*-*-linuxaout* | \ + m68k*-*-linuxaout*) echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h path="m68k/mc68020 m68k" @@ -113,7 +144,8 @@ case "${target}" in cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h path="m68k" ;; - m680[234]0*-*-linux* | m68k*-*-linux*) + m680[234]0*-*-linux* | \ + m68k*-*-linux*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h ;; @@ -127,12 +159,14 @@ case "${target}" in cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h path="m68k/mc68020 m68k" ;; - m68000*-*-* | m68060*-*-*) + m68000*-*-* | \ + m68060*-*-*) echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h path="m68k/mc68000" ;; - m680[234]0*-*-* | m68k*-*-*) + m680[234]0*-*-* | \ + m68k*-*-*) echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h path="m68k/mc68020 m68k" @@ -144,25 +178,37 @@ case "${target}" in cat $srcdir/mpi/powerpc32/syntax.h >>./mpi/asm-syntax.h path="powerpc32" ;; - rs6000-*-aix[456789]* | rs6000-*-aix3.2.[456789]) + rs6000-*-aix[456789]* | \ + rs6000-*-aix3.2.[456789]) mpi_sflags="-Wa,-mpwr" path="power" mpi_extra_modules="udiv-w-sdiv" ;; - rs6000-*-* | power-*-* | power2-*-*) + rs6000-*-* | \ + power-*-* | \ + power2-*-*) mpi_sflags="-Wa,-mppc" path="power" mpi_extra_modules="udiv-w-sdiv" ;; + powerpc-ibm-aix4.2.* ) + # I am not sure about this one but a machine identified by + # powerpc-ibm-aix4.2.1.0 cannot use the powerpc32 code. + mpi_sflags="-Wa,-mpwr" + path="power" + mpi_extra_modules="udiv-w-sdiv" + ;; ppc601-*-*) mpi_sflags="-Wa,-mppc" path="power powerpc32" ;; - ppc60[234]*-*-* | powerpc*-*-*) + ppc60[234]*-*-* | \ + powerpc*-*-*) mpi_sflags="-Wa,-mppc" path="powerpc32" ;; - ppc620-*-* | powerpc64*-*-*) + ppc620-*-* | \ + powerpc64*-*-*) mpi_sflags="-Wa,-mppc" path="powerpc64" ;; diff --git a/mpi/longlong.h b/mpi/longlong.h index c92435570..e36beae49 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -199,6 +199,8 @@ extern UDItype __udiv_qrnnd (); "rI" ((USItype)(bh)), \ "r" ((USItype)(al)), \ "rI" ((USItype)(bl))) +#ifdef __ARM_ARCH_3__ +/* SAM This does not work on arm4 */ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("%@ Inlined umul_ppmm mov %|r0, %2, lsr #16 @@ -218,6 +220,18 @@ extern UDItype __udiv_qrnnd (); : "r" ((USItype)(a)), \ "r" ((USItype)(b)) \ : "r0", "r1", "r2") +#elif __ARM_ARCH_4__ +#define umul_ppmm(xh, xl, a, b) \ + __asm__ ("%@ Inlined umul_ppmm + umull %r1, %r0, %r2, %r3" \ + : "=&r" ((USItype)(xh)), \ + "=r" ((USItype)(xl)) \ + : "r" ((USItype)(a)), \ + "r" ((USItype)(b)) \ + : "r0", "r1") +#else +#error Untested architecture +#endif #define UMUL_TIME 20 #define UDIV_TIME 100 #endif /* __arm__ */ diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h index 035d33cb3..cde1c0ce2 100644 --- a/mpi/mpi-internal.h +++ b/mpi/mpi-internal.h @@ -1,6 +1,6 @@ /* mpi-internal.h - Internal to the Multi Precision Integers * Copyright (C) 1998 Free Software Foundation, Inc. - * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * @@ -186,6 +186,17 @@ mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, int mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size ); /*-- mpihelp-mul.c --*/ + +struct karatsuba_ctx { + struct karatsuba_ctx *next; + mpi_ptr_t tspace; + mpi_size_t tspace_size; + mpi_ptr_t tp; + mpi_size_t tp_size; +}; + +void mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx ); + mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, @@ -198,6 +209,12 @@ void mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size ); void mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace); +void mpihelp_mul_karatsuba_case( mpi_ptr_t prodp, + mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, + struct karatsuba_ctx *ctx ); + + /*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/ mpi_limb_t mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c index a57eff87a..e0bed13ac 100644 --- a/mpi/mpi-pow.c +++ b/mpi/mpi-pow.c @@ -1,6 +1,6 @@ /* mpi-pow.c - MPI functions * Copyright (C) 1998 Free Software Foundation, Inc. - * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * @@ -30,9 +30,10 @@ #include <config.h> #include <stdio.h> #include <stdlib.h> -#include <assert.h> +#include <string.h> #include "mpi-internal.h" #include "longlong.h" +#include <assert.h> /**************** @@ -159,7 +160,9 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod) int c; mpi_limb_t e; mpi_limb_t carry_limb; + struct karatsuba_ctx karactx; + memset( &karactx, 0, sizeof karactx ); negative_result = (ep[0] & 1) && base->sign; i = esize - 1; @@ -177,6 +180,7 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod) * by RP (==RES->d), and with 50% probability in the area originally * pointed to by XP. */ + for(;;) { while( c ) { mpi_ptr_t tp; @@ -194,7 +198,6 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod) mpi_free_limb_space( tspace ); tsize = 2 * rsize; tspace = mpi_alloc_limb_space( tsize, 0 ); - } mpih_sqr_n( xp, rp, rsize, tspace ); } @@ -209,7 +212,15 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod) rsize = xsize; if( (mpi_limb_signed_t)e < 0 ) { - mpihelp_mul( xp, rp, rsize, bp, bsize ); + /*mpihelp_mul( xp, rp, rsize, bp, bsize );*/ + if( bsize < KARATSUBA_THRESHOLD ) { + mpihelp_mul( xp, rp, rsize, bp, bsize ); + } + else { + mpihelp_mul_karatsuba_case( + xp, rp, rsize, bp, bsize, &karactx ); + } + xsize = rsize + bsize; if( xsize > msize ) { mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize); @@ -258,6 +269,8 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod) if( mod_shift_cnt ) mpihelp_rshift( rp, rp, rsize, mod_shift_cnt); MPN_NORMALIZE (rp, rsize); + + mpihelp_release_karatsuba_ctx( &karactx ); } if( negative_result && rsize ) { diff --git a/mpi/mpih-div.c b/mpi/mpih-div.c index 0d711cb58..3b90994e8 100644 --- a/mpi/mpih-div.c +++ b/mpi/mpih-div.c @@ -1,6 +1,6 @@ /* mpihelp-div.c - MPI helper functions * Copyright (C) 1998 Free Software Foundation, Inc. - * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * @@ -338,7 +338,7 @@ mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs, } else { n2 = np[dsize - 1]; - MPN_COPY_DECR (np + 1, np, dsize); + MPN_COPY_DECR (np + 1, np, dsize - 1); np[0] = 0; } diff --git a/mpi/mpih-mul.c b/mpi/mpih-mul.c index 67749f4cd..76d47fe65 100644 --- a/mpi/mpih-mul.c +++ b/mpi/mpih-mul.c @@ -1,5 +1,5 @@ /* mpihelp-mul.c - MPI helper functions - * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc. + * Copyright (C) 1994, 1996, 1998, 1999, 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * @@ -29,10 +29,10 @@ #include <config.h> #include <stdio.h> #include <stdlib.h> +#include <string.h> #include "mpi-internal.h" #include "longlong.h" -#include "g10lib.h" /* for g10_is_secure() */ - +#include "g10lib.h" /* g10_is_secure() */ #define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ @@ -373,6 +373,86 @@ mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size) } + +void +mpihelp_mul_karatsuba_case( mpi_ptr_t prodp, + mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, + struct karatsuba_ctx *ctx ) +{ + mpi_limb_t cy; + + if( !ctx->tspace || ctx->tspace_size < vsize ) { + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + ctx->tspace = mpi_alloc_limb_space( 2 * vsize, + g10_is_secure( up ) || g10_is_secure( vp ) ); + ctx->tspace_size = vsize; + } + + MPN_MUL_N_RECURSE( prodp, up, vp, vsize, ctx->tspace ); + + prodp += vsize; + up += vsize; + usize -= vsize; + if( usize >= vsize ) { + if( !ctx->tp || ctx->tp_size < vsize ) { + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + ctx->tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up ) + || g10_is_secure( vp ) ); + ctx->tp_size = vsize; + } + + do { + MPN_MUL_N_RECURSE( ctx->tp, up, vp, vsize, ctx->tspace ); + cy = mpihelp_add_n( prodp, prodp, ctx->tp, vsize ); + mpihelp_add_1( prodp + vsize, ctx->tp + vsize, vsize, cy ); + prodp += vsize; + up += vsize; + usize -= vsize; + } while( usize >= vsize ); + } + + if( usize ) { + if( usize < KARATSUBA_THRESHOLD ) { + mpihelp_mul( ctx->tspace, vp, vsize, up, usize ); + } + else { + if( !ctx->next ) { + ctx->next = g10_xcalloc( 1, sizeof *ctx ); + } + mpihelp_mul_karatsuba_case( ctx->tspace, + vp, vsize, + up, usize, + ctx->next ); + } + + cy = mpihelp_add_n( prodp, prodp, ctx->tspace, vsize); + mpihelp_add_1( prodp + vsize, ctx->tspace + vsize, usize, cy ); + } +} + + +void +mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx ) +{ + struct karatsuba_ctx *ctx2; + + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + for( ctx=ctx->next; ctx; ctx = ctx2 ) { + ctx2 = ctx->next; + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + g10_free( ctx ); + } +} + /* Multiply the natural numbers u (pointed to by UP, with USIZE limbs) * and v (pointed to by VP, with VSIZE limbs), and store the result at * PRODP. USIZE + VSIZE limbs are always stored, but if the input @@ -394,7 +474,7 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, { mpi_ptr_t prod_endp = prodp + usize + vsize - 1; mpi_limb_t cy; - mpi_ptr_t tspace; + struct karatsuba_ctx ctx; if( vsize < KARATSUBA_THRESHOLD ) { mpi_size_t i; @@ -438,34 +518,9 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, return cy; } - tspace = mpi_alloc_limb_space( 2 * vsize, - g10_is_secure( up ) || g10_is_secure( vp ) ); - MPN_MUL_N_RECURSE( prodp, up, vp, vsize, tspace ); - - prodp += vsize; - up += vsize; - usize -= vsize; - if( usize >= vsize ) { - mpi_ptr_t tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up ) - || g10_is_secure( vp ) ); - do { - MPN_MUL_N_RECURSE( tp, up, vp, vsize, tspace ); - cy = mpihelp_add_n( prodp, prodp, tp, vsize ); - mpihelp_add_1( prodp + vsize, tp + vsize, vsize, cy ); - prodp += vsize; - up += vsize; - usize -= vsize; - } while( usize >= vsize ); - mpi_free_limb_space( tp ); - } - - if( usize ) { - mpihelp_mul( tspace, vp, vsize, up, usize ); - cy = mpihelp_add_n( prodp, prodp, tspace, vsize); - mpihelp_add_1( prodp + vsize, tspace + vsize, usize, cy ); - } - - mpi_free_limb_space( tspace ); + memset( &ctx, 0, sizeof ctx ); + mpihelp_mul_karatsuba_case( prodp, up, usize, vp, vsize, &ctx ); + mpihelp_release_karatsuba_ctx( &ctx ); return *prod_endp; } diff --git a/mpi/power/distfiles b/mpi/power/distfiles index e69de29bb..e664c8db6 100644 --- a/mpi/power/distfiles +++ b/mpi/power/distfiles @@ -0,0 +1,7 @@ +mpih-add1.S +mpih-lshift.S +mpih-mul1.S +mpih-mul2.S +mpih-mul3.S +mpih-rshift.S +mpih-sub1.S diff --git a/mpi/power/mpih-add1.S b/mpi/power/mpih-add1.S new file mode 100644 index 000000000..ad27f3d81 --- /dev/null +++ b/mpi/power/mpih-add1.S @@ -0,0 +1,86 @@ +/* IBM POWER add_n -- Add two limb vectors of equal, non-zero length. + * + * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + */ + + .toc + .extern mpihelp_add_n[DS] + .extern .mpihelp_add_n +.csect [PR] + .align 2 + .globl mpihelp_add_n + .globl .mpihelp_add_n + .csect mpihelp_add_n[DS] +mpihelp_add_n: + .long .mpihelp_add_n, TOC[tc0], 0 + .csect [PR] +.mpihelp_add_n: + andil. 10,6,1 # odd or even number of limbs? + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + a 7,0,8 # add least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bne 1,L1 # branch if not + st 7,4(3) + aze 3,10 # use the fact that r10 is zero... + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. +L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + ae 7,0,8 # add limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + ae 11,9,10 # add previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + ae 7,0,8 # add previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: ae 11,9,10 # add limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + lil 3,0 # load cy into ... + aze 3,3 # ... return value register + br + diff --git a/mpi/power/mpih-lshift.S b/mpi/power/mpih-lshift.S new file mode 100644 index 000000000..5c53a0ae6 --- /dev/null +++ b/mpi/power/mpih-lshift.S @@ -0,0 +1,64 @@ +/* IBM POWER lshift + * + * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + */ + + .toc + .extern mpihelp_lshift[DS] + .extern .mpihelp_lshift +.csect [PR] + .align 2 + .globl mpihelp_lshift + .globl .mpihelp_lshift + .csect mpihelp_lshift[DS] +mpihelp_lshift: + .long .mpihelp_lshift, TOC[tc0], 0 + .csect [PR] +.mpihelp_lshift: + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + lu 0,-4(4) # read most significant limb + sre 3,0,8 # compute carry out limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,-4(4) # read 2:nd most significant limb + sreq 7,0,8 # compute most significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,-4(4) # load next lower limb + stu 7,-4(9) # store previous result during read latency + sreq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,-4(9) # store 2:nd least significant limb +Lend2: sle 7,0,6 # compute least significant limb + st 7,-4(9) # store it + br + diff --git a/mpi/power/mpih-mul1.S b/mpi/power/mpih-mul1.S new file mode 100644 index 000000000..3b71b5aa9 --- /dev/null +++ b/mpi/power/mpih-mul1.S @@ -0,0 +1,115 @@ +/* IBM POWER mul_1 -- Multiply a limb vector with a limb and store + * the result in a second limb vector. + * + * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + */ + + .toc + .csect .mpihelp_mul_1[PR] + .align 2 + .globl mpihelp_mul_1 + .globl .mpihelp_mul_1 + .csect mpihelp_mul_1[DS] +mpihelp_mul_1: + .long .mpihelp_mul_1[PR], TOC[tc0], 0 + .csect .mpihelp_mul_1[PR] +.mpihelp_mul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 # reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br + diff --git a/mpi/power/mpih-mul2.S b/mpi/power/mpih-mul2.S new file mode 100644 index 000000000..19ddee86d --- /dev/null +++ b/mpi/power/mpih-mul2.S @@ -0,0 +1,130 @@ +/* IBM POWER addmul_1 -- Multiply a limb vector with a limb and add + * the result to a second limb vector. + * + * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + + + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + */ + + .toc + .csect .mpihelp_addmul_1[PR] + .align 2 + .globl mpihelp_addmul_1 + .globl .mpihelp_addmul_1 + .csect mpihelp_addmul_1[DS] +mpihelp_addmul_1: + .long .mpihelp_addmul_1[PR], TOC[tc0], 0 + .csect .mpihelp_addmul_1[PR] +.mpihelp_addmul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 # add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br + diff --git a/mpi/power/mpih-mul3.S b/mpi/power/mpih-mul3.S new file mode 100644 index 000000000..e875e88ea --- /dev/null +++ b/mpi/power/mpih-mul3.S @@ -0,0 +1,135 @@ +/* IBM POWER submul_1 -- Multiply a limb vector with a limb and subtract + * the result from a second limb vector. + * + * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + + +/* + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + */ + + .toc + .csect .mpihelp_submul_1[PR] + .align 2 + .globl mpihelp_submul_1 + .globl .mpihelp_submul_1 + .csect mpihelp_submul_1[DS] +mpihelp_submul_1: + .long .mpihelp_submul_1[PR], TOC[tc0], 0 + .csect .mpihelp_submul_1[PR] +.mpihelp_submul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 # invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br + diff --git a/mpi/power/mpih-rshift.S b/mpi/power/mpih-rshift.S new file mode 100644 index 000000000..e29645072 --- /dev/null +++ b/mpi/power/mpih-rshift.S @@ -0,0 +1,64 @@ +/* IBM POWER rshift + * + * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 +*/ + + .toc + .extern mpihelp_rshift[DS] + .extern .mpihelp_rshift +.csect [PR] + .align 2 + .globl mpihelp_rshift + .globl .mpihelp_rshift + .csect mpihelp_rshift[DS] +mpihelp_rshift: + .long .mpihelp_rshift, TOC[tc0], 0 + .csect [PR] +.mpihelp_rshift: + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + l 0,0(4) # read least significant limb + ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s + sle 3,0,8 # compute carry limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,4(4) # read 2:nd least significant limb + sleq 7,0,8 # compute least significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,4(4) # load next higher limb + stu 7,4(9) # store previous result during read latency + sleq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,4(9) # store 2:nd most significant limb +Lend2: sre 7,0,6 # compute most significant limb + st 7,4(9) # store it + br + + diff --git a/mpi/power/mpih-sub1.S b/mpi/power/mpih-sub1.S new file mode 100644 index 000000000..a3605533e --- /dev/null +++ b/mpi/power/mpih-sub1.S @@ -0,0 +1,87 @@ +/* IBM POWER sub_n -- Subtract two limb vectors of equal, non-zero length. + * + * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/* +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + */ + + .toc + .extern mpihelp_sub_n[DS] + .extern .mpihelp_sub_n +.csect [PR] + .align 2 + .globl mpihelp_sub_n + .globl .mpihelp_sub_n + .csect mpihelp_sub_n[DS] +mpihelp_sub_n: + .long .mpihelp_sub_n, TOC[tc0], 0 + .csect [PR] +.mpihelp_sub_n: + andil. 10,6,1 # odd or even number of limbs? + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + sf 7,0,8 # subtract least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bne 1,L1 # branch if not + st 7,4(3) + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. +L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + sfe 7,0,8 # subtract limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + sfe 11,10,9 # subtract previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + sfe 7,0,8 # subtract previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: sfe 11,10,9 # subtract limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br + |