aboutsummaryrefslogtreecommitdiffstats
path: root/mpi
diff options
context:
space:
mode:
Diffstat (limited to 'mpi')
-rw-r--r--mpi/ChangeLog27
-rw-r--r--mpi/Makefile.am8
-rw-r--r--mpi/config.links82
-rw-r--r--mpi/longlong.h14
-rw-r--r--mpi/mpi-internal.h19
-rw-r--r--mpi/mpi-pow.c21
-rw-r--r--mpi/mpih-div.c4
-rw-r--r--mpi/mpih-mul.c119
-rw-r--r--mpi/power/distfiles7
-rw-r--r--mpi/power/mpih-add1.S86
-rw-r--r--mpi/power/mpih-lshift.S64
-rw-r--r--mpi/power/mpih-mul1.S115
-rw-r--r--mpi/power/mpih-mul2.S130
-rw-r--r--mpi/power/mpih-mul3.S135
-rw-r--r--mpi/power/mpih-rshift.S64
-rw-r--r--mpi/power/mpih-sub1.S87
16 files changed, 924 insertions, 58 deletions
diff --git a/mpi/ChangeLog b/mpi/ChangeLog
index d9e7faa30..7119d1e2a 100644
--- a/mpi/ChangeLog
+++ b/mpi/ChangeLog
@@ -1,3 +1,30 @@
+Mon Jul 17 16:35:47 CEST 2000 Werner Koch <wk@>
+
+ * power/: Add all files from GMP for this CPU. Converted comments to
+ CPP comments because some ASes complain about ' in comments.
+
+ * config.links: Support for BSDI 4.x; by Wayne Chapeskie. Add support
+ for FreeBSD 5 and made the case stmt looking nicer; by Jun Kuriyama.
+ Add support for NetBSD.
+ (sparc8): Made the search path the same as sparc9
+ (sparc64-unknown-linux-gnu): use udiv module; by Adam Mitchell.
+
+ * Makefile.am: c/SFLAGS/ASFLAGS/. This has only been used by the
+ powerpc and actually never passed the -Wa,foo to the cc.
+
+ * mpih-div.c (mpihelp_divrem): The MPN_COPY_DECR copied one element
+ too many. This is a gmp2.0.2p9.txt patch.
+
+ * longlong.h (umul_ppmm): Fixes for ARM-4. By Sean MacLennan.
+
+ * mpi-internal.h (karatsuba_ctx): New.
+ * mpih-mul.c (mpihelp_release_karatsuba_ctx): New.
+ (mpihelp_mul_karatsuba_case): New.
+ (mpihelp_mul): Splitted to make use of the new functions.
+ * mpi-pow.c (mpi_powm): Make use of the new splitted function to avoid
+ multiple allocation of temporary memory during the karatsuba operations.
+ * mpi_mpow.c: Removed the unused Barrett code.
+
2000-03-21 16:17:30 Werner Koch ([email protected])
* config.links: Add support for FreeBSD 5.
diff --git a/mpi/Makefile.am b/mpi/Makefile.am
index 98ad5fcc3..f7567e5f2 100644
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -3,12 +3,13 @@
INCLUDES = -I$(top_srcdir)/gcrypt
CFLAGS = @CFLAGS@ @MPI_OPT_FLAGS@
-SFLAGS = @MPI_SFLAGS@
+ASFLAGS = @MPI_SFLAGS@
EXTRA_DIST = config.links
DISTCLEANFILES = mpih-add1.S mpih-mul1.S mpih-mul2.S mpih-mul3.S \
mpih-lshift.S mpih-rshift.S mpih-sub1.S asm-syntax.h sysdep.h
# Note: we only use .S files so we should delete all left over .s
+# CLEANFILES = _*.s
CLEANFILES = *.s
noinst_LTLIBRARIES = libmpi.la
@@ -56,4 +57,9 @@ libmpi_la_LIBADD = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
.S.s:
$(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' >$*.s
+# Hmmm, we should use this, so that OSes which do not distinguish
+# filename case still work. We have to see how libtool can handle this
+# $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' > _$*.s
+# $(COMPILE) -c _$*.s
+# mv -f _$*.o $*.o
diff --git a/mpi/config.links b/mpi/config.links
index 40125e47b..6a2cbfb53 100644
--- a/mpi/config.links
+++ b/mpi/config.links
@@ -1,4 +1,4 @@
-# sourced my ../configure to get the list of files to link
+# sourced by ../configure to get the list of files to link
# this should set $mpi_ln_src and mpi_ln_dst.
# Note: this is called from the above directory.
@@ -12,23 +12,40 @@ echo '/* created by config.links - do not edit */' >./mpi/asm-syntax.h
if test "$try_asm_modules" = "yes" ; then
case "${target}" in
- i[34]86*-*-freebsd*-elf | i[34]86*-*-freebsd[3-9]* | i[34]86*-*-freebsdelf*)
+ i[34]86*-*-freebsd*-elf | \
+ i[34]86*-*-freebsd[3-9]* | \
+ i[34]86*-*-freebsdelf* | \
+ i[34]86*-*-netbsd* )
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-freebsd*-elf | i[56]86*-*-freebsd[3-9]* | i[56]86*-*-freebsdelf*)
+ i[56]86*-*-freebsd*-elf | \
+ i[56]86*-*-freebsd[3-9]* | \
+ i[56]86*-*-freebsdelf* | \
+ i[56]86*-*-netbsd* | \
+ pentium-*-netbsd* | \
+ pentiumpro-*-netbsd*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i586 i386"
;;
- i[34]86*-*-linuxaout* | i[34]86*-*-linuxoldld* | i[34]86*-*-*bsd*)
+ i[34]86*-*-bsdi4*)
+ echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
+ cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
+ path="i386"
+ ;;
+ i[34]86*-*-linuxaout* | \
+ i[34]86*-*-linuxoldld* | \
+ i[34]86*-*-*bsd*)
echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-linuxaout* | i[56]86*-*-linuxoldld* | i[56]86*-*-*bsd*)
+ i[56]86*-*-linuxaout* | \
+ i[56]86*-*-linuxoldld* | \
+ i[56]86*-*-*bsd*)
echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
@@ -49,7 +66,9 @@ case "${target}" in
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-* | pentium-*-* | pentiumpro-*-*)
+ i[56]86*-*-* | \
+ pentium-*-* | \
+ pentiumpro-*-*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i586 i386"
@@ -74,13 +93,23 @@ case "${target}" in
path="pa7100 hppa1.1 hppa"
mpi_extra_modules="udiv-qrnnd"
;;
- sparc9*-*-* | sparc64*-*-* | ultrasparc*-*-*)
+ sparc64-*-linux-gnu)
+ # An extra rule because we have an report for this one only.
+ # Should be compared against the next GMP version
+ echo '/* configured for sparc64-*-linux-gnu */' >>./mpi/asm-syntax.h
+ path="sparc32v8 sparc32"
+ mpi_extra_modules="udiv"
+ ;;
+ sparc9*-*-* | \
+ sparc64*-*-* | \
+ ultrasparc*-*-* )
echo '/* configured for sparc9 or higher */' >>./mpi/asm-syntax.h
path="sparc32v8 sparc32"
;;
- sparc8*-*-* | microsparc*-*-*)
+ sparc8*-*-* | \
+ microsparc*-*-*)
echo '/* configured for sparc8 */' >>./mpi/asm-syntax.h
- path="sparc32v8"
+ path="sparc32v8 sparc32"
;;
supersparc*-*-*)
echo '/* configured for supersparc */' >>./mpi/asm-syntax.h
@@ -92,7 +121,8 @@ case "${target}" in
path="sparc32"
mpi_extra_modules="udiv"
;;
- mips[34]*-*-* | mips*-*-irix6*)
+ mips[34]*-*-* | \
+ mips*-*-irix6*)
echo '/* configured for MIPS3 */' >>./mpi/asm-syntax.h
path="mips3"
;;
@@ -103,7 +133,8 @@ case "${target}" in
# Motorola 68k configurations. Let m68k mean 68020-68040.
# mc68000 or mc68060 configurations need to be specified explicitly
- m680[234]0*-*-linuxaout* | m68k*-*-linuxaout*)
+ m680[234]0*-*-linuxaout* | \
+ m68k*-*-linuxaout*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
@@ -113,7 +144,8 @@ case "${target}" in
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k"
;;
- m680[234]0*-*-linux* | m68k*-*-linux*)
+ m680[234]0*-*-linux* | \
+ m68k*-*-linux*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
;;
@@ -127,12 +159,14 @@ case "${target}" in
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
;;
- m68000*-*-* | m68060*-*-*)
+ m68000*-*-* | \
+ m68060*-*-*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68000"
;;
- m680[234]0*-*-* | m68k*-*-*)
+ m680[234]0*-*-* | \
+ m68k*-*-*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
@@ -144,25 +178,37 @@ case "${target}" in
cat $srcdir/mpi/powerpc32/syntax.h >>./mpi/asm-syntax.h
path="powerpc32"
;;
- rs6000-*-aix[456789]* | rs6000-*-aix3.2.[456789])
+ rs6000-*-aix[456789]* | \
+ rs6000-*-aix3.2.[456789])
mpi_sflags="-Wa,-mpwr"
path="power"
mpi_extra_modules="udiv-w-sdiv"
;;
- rs6000-*-* | power-*-* | power2-*-*)
+ rs6000-*-* | \
+ power-*-* | \
+ power2-*-*)
mpi_sflags="-Wa,-mppc"
path="power"
mpi_extra_modules="udiv-w-sdiv"
;;
+ powerpc-ibm-aix4.2.* )
+ # I am not sure about this one but a machine identified by
+ # powerpc-ibm-aix4.2.1.0 cannot use the powerpc32 code.
+ mpi_sflags="-Wa,-mpwr"
+ path="power"
+ mpi_extra_modules="udiv-w-sdiv"
+ ;;
ppc601-*-*)
mpi_sflags="-Wa,-mppc"
path="power powerpc32"
;;
- ppc60[234]*-*-* | powerpc*-*-*)
+ ppc60[234]*-*-* | \
+ powerpc*-*-*)
mpi_sflags="-Wa,-mppc"
path="powerpc32"
;;
- ppc620-*-* | powerpc64*-*-*)
+ ppc620-*-* | \
+ powerpc64*-*-*)
mpi_sflags="-Wa,-mppc"
path="powerpc64"
;;
diff --git a/mpi/longlong.h b/mpi/longlong.h
index c92435570..e36beae49 100644
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@@ -199,6 +199,8 @@ extern UDItype __udiv_qrnnd ();
"rI" ((USItype)(bh)), \
"r" ((USItype)(al)), \
"rI" ((USItype)(bl)))
+#ifdef __ARM_ARCH_3__
+/* SAM This does not work on arm4 */
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("%@ Inlined umul_ppmm
mov %|r0, %2, lsr #16
@@ -218,6 +220,18 @@ extern UDItype __udiv_qrnnd ();
: "r" ((USItype)(a)), \
"r" ((USItype)(b)) \
: "r0", "r1", "r2")
+#elif __ARM_ARCH_4__
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("%@ Inlined umul_ppmm
+ umull %r1, %r0, %r2, %r3" \
+ : "=&r" ((USItype)(xh)), \
+ "=r" ((USItype)(xl)) \
+ : "r" ((USItype)(a)), \
+ "r" ((USItype)(b)) \
+ : "r0", "r1")
+#else
+#error Untested architecture
+#endif
#define UMUL_TIME 20
#define UDIV_TIME 100
#endif /* __arm__ */
diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h
index 035d33cb3..cde1c0ce2 100644
--- a/mpi/mpi-internal.h
+++ b/mpi/mpi-internal.h
@@ -1,6 +1,6 @@
/* mpi-internal.h - Internal to the Multi Precision Integers
* Copyright (C) 1998 Free Software Foundation, Inc.
- * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -186,6 +186,17 @@ mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
int mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size );
/*-- mpihelp-mul.c --*/
+
+struct karatsuba_ctx {
+ struct karatsuba_ctx *next;
+ mpi_ptr_t tspace;
+ mpi_size_t tspace_size;
+ mpi_ptr_t tp;
+ mpi_size_t tp_size;
+};
+
+void mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx );
+
mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
mpi_size_t s1_size, mpi_limb_t s2_limb);
mpi_limb_t mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
@@ -198,6 +209,12 @@ void mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size );
void mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size,
mpi_ptr_t tspace);
+void mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx );
+
+
/*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/
mpi_limb_t mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
mpi_size_t s1_size, mpi_limb_t s2_limb);
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index a57eff87a..e0bed13ac 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -1,6 +1,6 @@
/* mpi-pow.c - MPI functions
* Copyright (C) 1998 Free Software Foundation, Inc.
- * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -30,9 +30,10 @@
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
-#include <assert.h>
+#include <string.h>
#include "mpi-internal.h"
#include "longlong.h"
+#include <assert.h>
/****************
@@ -159,7 +160,9 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
int c;
mpi_limb_t e;
mpi_limb_t carry_limb;
+ struct karatsuba_ctx karactx;
+ memset( &karactx, 0, sizeof karactx );
negative_result = (ep[0] & 1) && base->sign;
i = esize - 1;
@@ -177,6 +180,7 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
* by RP (==RES->d), and with 50% probability in the area originally
* pointed to by XP.
*/
+
for(;;) {
while( c ) {
mpi_ptr_t tp;
@@ -194,7 +198,6 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
mpi_free_limb_space( tspace );
tsize = 2 * rsize;
tspace = mpi_alloc_limb_space( tsize, 0 );
-
}
mpih_sqr_n( xp, rp, rsize, tspace );
}
@@ -209,7 +212,15 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
rsize = xsize;
if( (mpi_limb_signed_t)e < 0 ) {
- mpihelp_mul( xp, rp, rsize, bp, bsize );
+ /*mpihelp_mul( xp, rp, rsize, bp, bsize );*/
+ if( bsize < KARATSUBA_THRESHOLD ) {
+ mpihelp_mul( xp, rp, rsize, bp, bsize );
+ }
+ else {
+ mpihelp_mul_karatsuba_case(
+ xp, rp, rsize, bp, bsize, &karactx );
+ }
+
xsize = rsize + bsize;
if( xsize > msize ) {
mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize);
@@ -258,6 +269,8 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
if( mod_shift_cnt )
mpihelp_rshift( rp, rp, rsize, mod_shift_cnt);
MPN_NORMALIZE (rp, rsize);
+
+ mpihelp_release_karatsuba_ctx( &karactx );
}
if( negative_result && rsize ) {
diff --git a/mpi/mpih-div.c b/mpi/mpih-div.c
index 0d711cb58..3b90994e8 100644
--- a/mpi/mpih-div.c
+++ b/mpi/mpih-div.c
@@ -1,6 +1,6 @@
/* mpihelp-div.c - MPI helper functions
* Copyright (C) 1998 Free Software Foundation, Inc.
- * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -338,7 +338,7 @@ mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs,
}
else {
n2 = np[dsize - 1];
- MPN_COPY_DECR (np + 1, np, dsize);
+ MPN_COPY_DECR (np + 1, np, dsize - 1);
np[0] = 0;
}
diff --git a/mpi/mpih-mul.c b/mpi/mpih-mul.c
index 67749f4cd..76d47fe65 100644
--- a/mpi/mpih-mul.c
+++ b/mpi/mpih-mul.c
@@ -1,5 +1,5 @@
/* mpihelp-mul.c - MPI helper functions
- * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -29,10 +29,10 @@
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "mpi-internal.h"
#include "longlong.h"
-#include "g10lib.h" /* for g10_is_secure() */
-
+#include "g10lib.h" /* g10_is_secure() */
#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \
@@ -373,6 +373,86 @@ mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
}
+
+void
+mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx )
+{
+ mpi_limb_t cy;
+
+ if( !ctx->tspace || ctx->tspace_size < vsize ) {
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ ctx->tspace = mpi_alloc_limb_space( 2 * vsize,
+ g10_is_secure( up ) || g10_is_secure( vp ) );
+ ctx->tspace_size = vsize;
+ }
+
+ MPN_MUL_N_RECURSE( prodp, up, vp, vsize, ctx->tspace );
+
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ if( usize >= vsize ) {
+ if( !ctx->tp || ctx->tp_size < vsize ) {
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ ctx->tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up )
+ || g10_is_secure( vp ) );
+ ctx->tp_size = vsize;
+ }
+
+ do {
+ MPN_MUL_N_RECURSE( ctx->tp, up, vp, vsize, ctx->tspace );
+ cy = mpihelp_add_n( prodp, prodp, ctx->tp, vsize );
+ mpihelp_add_1( prodp + vsize, ctx->tp + vsize, vsize, cy );
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ } while( usize >= vsize );
+ }
+
+ if( usize ) {
+ if( usize < KARATSUBA_THRESHOLD ) {
+ mpihelp_mul( ctx->tspace, vp, vsize, up, usize );
+ }
+ else {
+ if( !ctx->next ) {
+ ctx->next = g10_xcalloc( 1, sizeof *ctx );
+ }
+ mpihelp_mul_karatsuba_case( ctx->tspace,
+ vp, vsize,
+ up, usize,
+ ctx->next );
+ }
+
+ cy = mpihelp_add_n( prodp, prodp, ctx->tspace, vsize);
+ mpihelp_add_1( prodp + vsize, ctx->tspace + vsize, usize, cy );
+ }
+}
+
+
+void
+mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx )
+{
+ struct karatsuba_ctx *ctx2;
+
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ for( ctx=ctx->next; ctx; ctx = ctx2 ) {
+ ctx2 = ctx->next;
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ g10_free( ctx );
+ }
+}
+
/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
* and v (pointed to by VP, with VSIZE limbs), and store the result at
* PRODP. USIZE + VSIZE limbs are always stored, but if the input
@@ -394,7 +474,7 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
{
mpi_ptr_t prod_endp = prodp + usize + vsize - 1;
mpi_limb_t cy;
- mpi_ptr_t tspace;
+ struct karatsuba_ctx ctx;
if( vsize < KARATSUBA_THRESHOLD ) {
mpi_size_t i;
@@ -438,34 +518,9 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
return cy;
}
- tspace = mpi_alloc_limb_space( 2 * vsize,
- g10_is_secure( up ) || g10_is_secure( vp ) );
- MPN_MUL_N_RECURSE( prodp, up, vp, vsize, tspace );
-
- prodp += vsize;
- up += vsize;
- usize -= vsize;
- if( usize >= vsize ) {
- mpi_ptr_t tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up )
- || g10_is_secure( vp ) );
- do {
- MPN_MUL_N_RECURSE( tp, up, vp, vsize, tspace );
- cy = mpihelp_add_n( prodp, prodp, tp, vsize );
- mpihelp_add_1( prodp + vsize, tp + vsize, vsize, cy );
- prodp += vsize;
- up += vsize;
- usize -= vsize;
- } while( usize >= vsize );
- mpi_free_limb_space( tp );
- }
-
- if( usize ) {
- mpihelp_mul( tspace, vp, vsize, up, usize );
- cy = mpihelp_add_n( prodp, prodp, tspace, vsize);
- mpihelp_add_1( prodp + vsize, tspace + vsize, usize, cy );
- }
-
- mpi_free_limb_space( tspace );
+ memset( &ctx, 0, sizeof ctx );
+ mpihelp_mul_karatsuba_case( prodp, up, usize, vp, vsize, &ctx );
+ mpihelp_release_karatsuba_ctx( &ctx );
return *prod_endp;
}
diff --git a/mpi/power/distfiles b/mpi/power/distfiles
index e69de29bb..e664c8db6 100644
--- a/mpi/power/distfiles
+++ b/mpi/power/distfiles
@@ -0,0 +1,7 @@
+mpih-add1.S
+mpih-lshift.S
+mpih-mul1.S
+mpih-mul2.S
+mpih-mul3.S
+mpih-rshift.S
+mpih-sub1.S
diff --git a/mpi/power/mpih-add1.S b/mpi/power/mpih-add1.S
new file mode 100644
index 000000000..ad27f3d81
--- /dev/null
+++ b/mpi/power/mpih-add1.S
@@ -0,0 +1,86 @@
+/* IBM POWER add_n -- Add two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+ */
+
+ .toc
+ .extern mpihelp_add_n[DS]
+ .extern .mpihelp_add_n
+.csect [PR]
+ .align 2
+ .globl mpihelp_add_n
+ .globl .mpihelp_add_n
+ .csect mpihelp_add_n[DS]
+mpihelp_add_n:
+ .long .mpihelp_add_n, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_add_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ a 7,0,8 # add least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bne 1,L1 # branch if not
+ st 7,4(3)
+ aze 3,10 # use the fact that r10 is zero...
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ ae 7,0,8 # add limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ ae 11,9,10 # add previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ ae 7,0,8 # add previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: ae 11,9,10 # add limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ lil 3,0 # load cy into ...
+ aze 3,3 # ... return value register
+ br
+
diff --git a/mpi/power/mpih-lshift.S b/mpi/power/mpih-lshift.S
new file mode 100644
index 000000000..5c53a0ae6
--- /dev/null
+++ b/mpi/power/mpih-lshift.S
@@ -0,0 +1,64 @@
+/* IBM POWER lshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+ */
+
+ .toc
+ .extern mpihelp_lshift[DS]
+ .extern .mpihelp_lshift
+.csect [PR]
+ .align 2
+ .globl mpihelp_lshift
+ .globl .mpihelp_lshift
+ .csect mpihelp_lshift[DS]
+mpihelp_lshift:
+ .long .mpihelp_lshift, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_lshift:
+ sli 0,5,2
+ cax 9,3,0
+ cax 4,4,0
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ lu 0,-4(4) # read most significant limb
+ sre 3,0,8 # compute carry out limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,-4(4) # read 2:nd most significant limb
+ sreq 7,0,8 # compute most significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,-4(4) # load next lower limb
+ stu 7,-4(9) # store previous result during read latency
+ sreq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,-4(9) # store 2:nd least significant limb
+Lend2: sle 7,0,6 # compute least significant limb
+ st 7,-4(9) # store it
+ br
+
diff --git a/mpi/power/mpih-mul1.S b/mpi/power/mpih-mul1.S
new file mode 100644
index 000000000..3b71b5aa9
--- /dev/null
+++ b/mpi/power/mpih-mul1.S
@@ -0,0 +1,115 @@
+/* IBM POWER mul_1 -- Multiply a limb vector with a limb and store
+ * the result in a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_mul_1[PR]
+ .align 2
+ .globl mpihelp_mul_1
+ .globl .mpihelp_mul_1
+ .csect mpihelp_mul_1[DS]
+mpihelp_mul_1:
+ .long .mpihelp_mul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_mul_1[PR]
+.mpihelp_mul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ ai 0,0,0 # reset carry
+ cax 9,9,7
+ blt Lneg
+Lpos: bdz Lend
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ cax 10,10,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,9
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ cax 9,9,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,10
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
diff --git a/mpi/power/mpih-mul2.S b/mpi/power/mpih-mul2.S
new file mode 100644
index 000000000..19ddee86d
--- /dev/null
+++ b/mpi/power/mpih-mul2.S
@@ -0,0 +1,130 @@
+/* IBM POWER addmul_1 -- Multiply a limb vector with a limb and add
+ * the result to a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_addmul_1[PR]
+ .align 2
+ .globl mpihelp_addmul_1
+ .globl .mpihelp_addmul_1
+ .csect mpihelp_addmul_1[DS]
+mpihelp_addmul_1:
+ .long .mpihelp_addmul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_addmul_1[PR]
+.mpihelp_addmul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ cax 9,9,7
+ l 7,4(3)
+ a 8,8,7 # add res_limb
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ l 7,4(3)
+ aze 9,9
+ a 8,8,7
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 8,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 8,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
diff --git a/mpi/power/mpih-mul3.S b/mpi/power/mpih-mul3.S
new file mode 100644
index 000000000..e875e88ea
--- /dev/null
+++ b/mpi/power/mpih-mul3.S
@@ -0,0 +1,135 @@
+/* IBM POWER submul_1 -- Multiply a limb vector with a limb and subtract
+ * the result from a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_submul_1[PR]
+ .align 2
+ .globl mpihelp_submul_1
+ .globl .mpihelp_submul_1
+ .csect mpihelp_submul_1[DS]
+mpihelp_submul_1:
+ .long .mpihelp_submul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_submul_1[PR]
+.mpihelp_submul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 11
+ cax 9,9,7
+ l 7,4(3)
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 11,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 11,0,10
+ l 7,4(3)
+ aze 9,9
+ sf 8,11,7
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 11,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 11,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
diff --git a/mpi/power/mpih-rshift.S b/mpi/power/mpih-rshift.S
new file mode 100644
index 000000000..e29645072
--- /dev/null
+++ b/mpi/power/mpih-rshift.S
@@ -0,0 +1,64 @@
+/* IBM POWER rshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+*/
+
+ .toc
+ .extern mpihelp_rshift[DS]
+ .extern .mpihelp_rshift
+.csect [PR]
+ .align 2
+ .globl mpihelp_rshift
+ .globl .mpihelp_rshift
+ .csect mpihelp_rshift[DS]
+mpihelp_rshift:
+ .long .mpihelp_rshift, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_rshift:
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ l 0,0(4) # read least significant limb
+ ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
+ sle 3,0,8 # compute carry limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,4(4) # read 2:nd least significant limb
+ sleq 7,0,8 # compute least significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,4(4) # load next higher limb
+ stu 7,4(9) # store previous result during read latency
+ sleq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,4(9) # store 2:nd most significant limb
+Lend2: sre 7,0,6 # compute most significant limb
+ st 7,4(9) # store it
+ br
+
+
diff --git a/mpi/power/mpih-sub1.S b/mpi/power/mpih-sub1.S
new file mode 100644
index 000000000..a3605533e
--- /dev/null
+++ b/mpi/power/mpih-sub1.S
@@ -0,0 +1,87 @@
+/* IBM POWER sub_n -- Subtract two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+ */
+
+ .toc
+ .extern mpihelp_sub_n[DS]
+ .extern .mpihelp_sub_n
+.csect [PR]
+ .align 2
+ .globl mpihelp_sub_n
+ .globl .mpihelp_sub_n
+ .csect mpihelp_sub_n[DS]
+mpihelp_sub_n:
+ .long .mpihelp_sub_n, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_sub_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ sf 7,0,8 # subtract least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bne 1,L1 # branch if not
+ st 7,4(3)
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 # subtract limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ sfe 11,10,9 # subtract previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ sfe 7,0,8 # subtract previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: sfe 11,10,9 # subtract limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br
+