Diffstat (limited to 'mpi')
-rw-r--r--  mpi/ChangeLog              61
-rw-r--r--  mpi/Makefile.am            34
-rw-r--r--  mpi/config.links           82
-rw-r--r--  mpi/longlong.h             14
-rw-r--r--  mpi/mpi-internal.h         17
-rw-r--r--  mpi/mpi-mpow.c            129
-rw-r--r--  mpi/mpi-pow.c              19
-rw-r--r--  mpi/mpih-div.c              2
-rw-r--r--  mpi/mpih-mul.c            116
-rw-r--r--  mpi/power/distfiles         7
-rw-r--r--  mpi/power/mpih-add1.S      86
-rw-r--r--  mpi/power/mpih-lshift.S    64
-rw-r--r--  mpi/power/mpih-mul1.S     115
-rw-r--r--  mpi/power/mpih-mul2.S     130
-rw-r--r--  mpi/power/mpih-mul3.S     135
-rw-r--r--  mpi/power/mpih-rshift.S    64
-rw-r--r--  mpi/power/mpih-sub1.S      87
17 files changed, 967 insertions, 195 deletions
diff --git a/mpi/ChangeLog b/mpi/ChangeLog
index d6cf6e3ee..56dd3bb91 100644
--- a/mpi/ChangeLog
+++ b/mpi/ChangeLog
@@ -1,5 +1,64 @@
-Mon Aug 30 20:38:33 CEST 1999 Werner Koch <[email protected]>
+Wed Mar 22 13:50:24 CET 2000 Werner Koch <[email protected]>
+
+ * config.links: Add support for FreeBSD 5 and make the case stmt
+ look nicer. From Jun Kuriyama.
+
+Fri Mar 17 17:50:25 CET 2000 Werner Koch <[email protected]>
+
+ * config.links (sparc64-unknown-linux-gnu): use the udiv module.
+ From Adam Mitchell.
+
+2000-03-14 12:03:56 Werner Koch ([email protected])
+
+ * Makefile.am: Do not use .s and .S files but temporary names, so that
+ OSes with case-insensitive filenames work. From Frank Donahoe.
+
+Tue Mar 7 18:45:31 CET 2000 Werner Koch <[email protected]>
+
+ * mpih-mul.c (mpihelp_mul_karatsuba_case): It seems that the
+ untested part works fine. Removed the debugging message.
+
+ * longlong.h (umul_ppmm): Fixes for ARM-4. By Sean MacLennan.
+
+ * config.links: Add support for NetBSD.
+
+Thu Jan 13 19:31:58 CET 2000 Werner Koch <[email protected]>
+
+ * mpi-internal.h (karatsuba_ctx): New.
+ * mpih-mul.c (mpihelp_release_karatsuba_ctx): New.
+ (mpihelp_mul_karatsuba_case): New.
+ (mpihelp_mul): Split it up to make use of the new functions.
+ * mpi-pow.c (mpi_powm): Make use of the new split function
+ to avoid repeated allocation of temporary memory during the
+ karatsuba operations.
+ * mpi-mpow.c: Removed the unused Barrett code.
+
+Sun Dec 19 15:22:26 CET 1999 Werner Koch <[email protected]>
+
+ * power/ : Converted more comments to C comments because some
+ assemblers complain about ' in comments.
+
+Thu Dec 16 10:07:58 CET 1999 Werner Koch <[email protected]>
+
+ * Makefile.am: c/SFLAGS/ASFLAGS/. This has only been used by the
+ powerpc and actually never passed the -Wa,foo to the cc.
+
+Thu Dec 9 10:31:05 CET 1999 Werner Koch <[email protected]>
+
+ * power/: Add all files from GMP for this CPU.
+
+ * config.links: Support for BSDI 4.x. By Wayne Chapeskie.
+ (sparc8): Made the search path the same as sparc9
+
+ * mpih-div.c (mpihelp_divrem): The MPN_COPY_DECR copied one
+ element too many. This is the gmp2.0.2p9.txt patch.
+
+Sat Oct 9 20:34:41 CEST 1999 Werner Koch <[email protected]>
+
+ * Makefile.am: Removed libtool.
+
+Mon Aug 30 20:38:33 CEST 1999 Werner Koch <[email protected]>
* config.links: Add case label for DJGPP
diff --git a/mpi/Makefile.am b/mpi/Makefile.am
index ef9816aa5..cdc39ee76 100644
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -3,18 +3,18 @@
INCLUDES = -I$(top_srcdir)/include
CFLAGS = @CFLAGS@ @MPI_OPT_FLAGS@
-SFLAGS = @MPI_SFLAGS@
+ASFLAGS = @MPI_SFLAGS@
EXTRA_DIST = config.links
DISTCLEANFILES = mpih-add1.S mpih-mul1.S mpih-mul2.S mpih-mul3.S \
mpih-lshift.S mpih-rshift.S mpih-sub1.S asm-syntax.h sysdep.h
# Note: we only use .S files so we should delete all left over .s
-CLEANFILES = *.s
+CLEANFILES = _*.s
-noinst_LTLIBRARIES = libmpi.la
+noinst_LIBRARIES = libmpi.a
-libmpi_la_LDFLAGS =
-libmpi_la_SOURCES = longlong.h \
+# libmpi_a_LDFLAGS =
+libmpi_a_SOURCES = longlong.h \
mpi-add.c \
mpi-bit.c \
mpi-cmp.c \
@@ -37,24 +37,24 @@ libmpi_la_SOURCES = longlong.h \
# Note: these objects are actually links; the source files are
# distributed by special code in dist-hook
-common_asm_objects = mpih-mul1.lo \
- mpih-mul2.lo \
- mpih-mul3.lo \
- mpih-add1.lo \
- mpih-sub1.lo \
- mpih-lshift.lo \
- mpih-rshift.lo
+common_asm_objects = mpih-mul1.o \
+ mpih-mul2.o \
+ mpih-mul3.o \
+ mpih-add1.o \
+ mpih-sub1.o \
+ mpih-lshift.o \
+ mpih-rshift.o
-libmpi_la_DEPENDENCIES = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
-libmpi_la_LIBADD = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
+libmpi_a_DEPENDENCIES = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
+libmpi_a_LIBADD = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
# cancel the default rules used by libtool which do not really
# work and add one to cpp .S files
.S.o:
+ $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' > _$*.s
+ $(COMPILE) -c _$*.s
+ mv -f _$*.o $*.o
.S.lo:
-.S.s:
- $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' >$*.s
-
diff --git a/mpi/config.links b/mpi/config.links
index da44a9122..6a2cbfb53 100644
--- a/mpi/config.links
+++ b/mpi/config.links
@@ -1,4 +1,4 @@
-# sourced my ../configure to get the list of files to link
+# sourced by ../configure to get the list of files to link
# this should set $mpi_ln_src and mpi_ln_dst.
# Note: this is called from the above directory.
@@ -12,23 +12,40 @@ echo '/* created by config.links - do not edit */' >./mpi/asm-syntax.h
if test "$try_asm_modules" = "yes" ; then
case "${target}" in
- i[34]86*-*-freebsd*-elf | i[34]86*-*-freebsd[34]* | i[34]86*-*-freebsdelf*)
+ i[34]86*-*-freebsd*-elf | \
+ i[34]86*-*-freebsd[3-9]* | \
+ i[34]86*-*-freebsdelf* | \
+ i[34]86*-*-netbsd* )
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-freebsd*-elf | i[56]86*-*-freebsd[34]* | i[56]86*-*-freebsdelf*)
+ i[56]86*-*-freebsd*-elf | \
+ i[56]86*-*-freebsd[3-9]* | \
+ i[56]86*-*-freebsdelf* | \
+ i[56]86*-*-netbsd* | \
+ pentium-*-netbsd* | \
+ pentiumpro-*-netbsd*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i586 i386"
;;
- i[34]86*-*-linuxaout* | i[34]86*-*-linuxoldld* | i[34]86*-*-*bsd*)
+ i[34]86*-*-bsdi4*)
+ echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
+ cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
+ path="i386"
+ ;;
+ i[34]86*-*-linuxaout* | \
+ i[34]86*-*-linuxoldld* | \
+ i[34]86*-*-*bsd*)
echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-linuxaout* | i[56]86*-*-linuxoldld* | i[56]86*-*-*bsd*)
+ i[56]86*-*-linuxaout* | \
+ i[56]86*-*-linuxoldld* | \
+ i[56]86*-*-*bsd*)
echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
@@ -49,7 +66,9 @@ case "${target}" in
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i386"
;;
- i[56]86*-*-* | pentium-*-* | pentiumpro-*-*)
+ i[56]86*-*-* | \
+ pentium-*-* | \
+ pentiumpro-*-*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h
path="i586 i386"
@@ -74,13 +93,23 @@ case "${target}" in
path="pa7100 hppa1.1 hppa"
mpi_extra_modules="udiv-qrnnd"
;;
- sparc9*-*-* | sparc64*-*-* | ultrasparc*-*-*)
+ sparc64-*-linux-gnu)
+ # An extra rule because we have a report for this one only.
+ # Should be compared against the next GMP version
+ echo '/* configured for sparc64-*-linux-gnu */' >>./mpi/asm-syntax.h
+ path="sparc32v8 sparc32"
+ mpi_extra_modules="udiv"
+ ;;
+ sparc9*-*-* | \
+ sparc64*-*-* | \
+ ultrasparc*-*-* )
echo '/* configured for sparc9 or higher */' >>./mpi/asm-syntax.h
path="sparc32v8 sparc32"
;;
- sparc8*-*-* | microsparc*-*-*)
+ sparc8*-*-* | \
+ microsparc*-*-*)
echo '/* configured for sparc8 */' >>./mpi/asm-syntax.h
- path="sparc32v8"
+ path="sparc32v8 sparc32"
;;
supersparc*-*-*)
echo '/* configured for supersparc */' >>./mpi/asm-syntax.h
@@ -92,7 +121,8 @@ case "${target}" in
path="sparc32"
mpi_extra_modules="udiv"
;;
- mips[34]*-*-* | mips*-*-irix6*)
+ mips[34]*-*-* | \
+ mips*-*-irix6*)
echo '/* configured for MIPS3 */' >>./mpi/asm-syntax.h
path="mips3"
;;
@@ -103,7 +133,8 @@ case "${target}" in
# Motorola 68k configurations. Let m68k mean 68020-68040.
# mc68000 or mc68060 configurations need to be specified explicitly
- m680[234]0*-*-linuxaout* | m68k*-*-linuxaout*)
+ m680[234]0*-*-linuxaout* | \
+ m68k*-*-linuxaout*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
@@ -113,7 +144,8 @@ case "${target}" in
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k"
;;
- m680[234]0*-*-linux* | m68k*-*-linux*)
+ m680[234]0*-*-linux* | \
+ m68k*-*-linux*)
echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
;;
@@ -127,12 +159,14 @@ case "${target}" in
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
;;
- m68000*-*-* | m68060*-*-*)
+ m68000*-*-* | \
+ m68060*-*-*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68000"
;;
- m680[234]0*-*-* | m68k*-*-*)
+ m680[234]0*-*-* | \
+ m68k*-*-*)
echo '#define MIT_SYNTAX' >>./mpi/asm-syntax.h
cat $srcdir/mpi/m68k/syntax.h >>./mpi/asm-syntax.h
path="m68k/mc68020 m68k"
@@ -144,25 +178,37 @@ case "${target}" in
cat $srcdir/mpi/powerpc32/syntax.h >>./mpi/asm-syntax.h
path="powerpc32"
;;
- rs6000-*-aix[456789]* | rs6000-*-aix3.2.[456789])
+ rs6000-*-aix[456789]* | \
+ rs6000-*-aix3.2.[456789])
mpi_sflags="-Wa,-mpwr"
path="power"
mpi_extra_modules="udiv-w-sdiv"
;;
- rs6000-*-* | power-*-* | power2-*-*)
+ rs6000-*-* | \
+ power-*-* | \
+ power2-*-*)
mpi_sflags="-Wa,-mppc"
path="power"
mpi_extra_modules="udiv-w-sdiv"
;;
+ powerpc-ibm-aix4.2.* )
+ # I am not sure about this one but a machine identified by
+ # powerpc-ibm-aix4.2.1.0 cannot use the powerpc32 code.
+ mpi_sflags="-Wa,-mpwr"
+ path="power"
+ mpi_extra_modules="udiv-w-sdiv"
+ ;;
ppc601-*-*)
mpi_sflags="-Wa,-mppc"
path="power powerpc32"
;;
- ppc60[234]*-*-* | powerpc*-*-*)
+ ppc60[234]*-*-* | \
+ powerpc*-*-*)
mpi_sflags="-Wa,-mppc"
path="powerpc32"
;;
- ppc620-*-* | powerpc64*-*-*)
+ ppc620-*-* | \
+ powerpc64*-*-*)
mpi_sflags="-Wa,-mppc"
path="powerpc64"
;;
diff --git a/mpi/longlong.h b/mpi/longlong.h
index c92435570..e36beae49 100644
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@@ -199,6 +199,8 @@ extern UDItype __udiv_qrnnd ();
"rI" ((USItype)(bh)), \
"r" ((USItype)(al)), \
"rI" ((USItype)(bl)))
+#ifdef __ARM_ARCH_3__
+/* SAM This does not work on arm4 */
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("%@ Inlined umul_ppmm
mov %|r0, %2, lsr #16
@@ -218,6 +220,18 @@ extern UDItype __udiv_qrnnd ();
: "r" ((USItype)(a)), \
"r" ((USItype)(b)) \
: "r0", "r1", "r2")
+#elif __ARM_ARCH_4__
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("%@ Inlined umul_ppmm
+ umull %r1, %r0, %r2, %r3" \
+ : "=&r" ((USItype)(xh)), \
+ "=r" ((USItype)(xl)) \
+ : "r" ((USItype)(a)), \
+ "r" ((USItype)(b)) \
+ : "r0", "r1")
+#else
+#error Untested architecture
+#endif
#define UMUL_TIME 20
#define UDIV_TIME 100
#endif /* __arm__ */
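
For reference, umul_ppmm(xh, xl, a, b) must deliver the full 64-bit product of two
32-bit limbs, split into a high and a low word; the new ARM-4 branch obtains this
from a single umull instruction. A minimal portable C sketch of that contract
(illustration only, assuming 32-bit limbs and a C99 compiler):

#include <stdint.h>

/* Portable reference for the umul_ppmm() contract: multiply two 32-bit
 * limbs and return the 64-bit product as separate high and low words.
 * The ARM-4 inline assembly above computes the same thing with "umull". */
static void umul_ppmm_ref(uint32_t *xh, uint32_t *xl, uint32_t a, uint32_t b)
{
    uint64_t prod = (uint64_t)a * (uint64_t)b;

    *xh = (uint32_t)(prod >> 32);
    *xl = (uint32_t)prod;
}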
diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h
index 035d33cb3..2b521c952 100644
--- a/mpi/mpi-internal.h
+++ b/mpi/mpi-internal.h
@@ -186,6 +186,17 @@ mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
int mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size );
/*-- mpihelp-mul.c --*/
+
+struct karatsuba_ctx {
+ struct karatsuba_ctx *next;
+ mpi_ptr_t tspace;
+ mpi_size_t tspace_size;
+ mpi_ptr_t tp;
+ mpi_size_t tp_size;
+};
+
+void mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx );
+
mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
mpi_size_t s1_size, mpi_limb_t s2_limb);
mpi_limb_t mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
@@ -198,6 +209,12 @@ void mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size );
void mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size,
mpi_ptr_t tspace);
+void mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx );
+
+
/*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/
mpi_limb_t mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
mpi_size_t s1_size, mpi_limb_t s2_limb);
diff --git a/mpi/mpi-mpow.c b/mpi/mpi-mpow.c
index a8c561dd1..001802191 100644
--- a/mpi/mpi-mpow.c
+++ b/mpi/mpi-mpow.c
@@ -1,5 +1,5 @@
/* mpi-mpow.c - MPI functions
- * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ * Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -25,22 +25,6 @@
#include "longlong.h"
#include <assert.h>
-/* Barrett is slower than the classical way. It can be tweaked by
- * using partial multiplications
- */
-/*#define USE_BARRETT*/
-
-
-
-#ifdef USE_BARRETT
-static void barrett_mulm( MPI w, MPI u, MPI v, MPI m, MPI y, int k, MPI r1, MPI r2 );
-static MPI init_barrett( MPI m, int *k, MPI *r1, MPI *r2 );
-static int calc_barrett( MPI r, MPI x, MPI m, MPI y, int k, MPI r1, MPI r2 );
-#else
-#define barrett_mulm( w, u, v, m, y, k, r1, r2 ) mpi_mulm( (w), (u), (v), (m) )
-#endif
-
-
static int
build_index( MPI *exparray, int k, int i, int t )
{
@@ -53,7 +37,6 @@ build_index( MPI *exparray, int k, int i, int t )
if( mpi_test_bit( exparray[j], bitno ) )
index |= 1;
}
- /*log_debug("t=%d i=%d index=%d\n", t, i, index );*/
return index;
}
@@ -68,35 +51,25 @@ mpi_mulpowm( MPI res, MPI *basearray, MPI *exparray, MPI m)
int i, j, idx;
MPI *G; /* table with precomputed values of size 2^k */
MPI tmp;
- #ifdef USE_BARRETT
- MPI barrett_y, barrett_r1, barrett_r2;
- int barrett_k;
- #endif
for(k=0; basearray[k]; k++ )
;
assert(k);
for(t=0, i=0; (tmp=exparray[i]); i++ ) {
- /*log_mpidump("exp: ", tmp );*/
j = mpi_get_nbits(tmp);
if( j > t )
t = j;
}
- /*log_mpidump("mod: ", m );*/
assert(i==k);
assert(t);
assert( k < 10 );
G = m_alloc_clear( (1<<k) * sizeof *G );
- #ifdef USE_BARRETT
- barrett_y = init_barrett( m, &barrett_k, &barrett_r1, &barrett_r2 );
- #endif
/* and calculate */
tmp = mpi_alloc( mpi_get_nlimbs(m)+1 );
mpi_set_ui( res, 1 );
for(i = 1; i <= t; i++ ) {
- barrett_mulm(tmp, res, res, m, barrett_y, barrett_k,
- barrett_r1, barrett_r2 );
+ mpi_mulm(tmp, res, res, m );
idx = build_index( exparray, k, i, t );
assert( idx >= 0 && idx < (1<<k) );
if( !G[idx] ) {
@@ -108,115 +81,21 @@ mpi_mulpowm( MPI res, MPI *basearray, MPI *exparray, MPI m)
if( !G[idx] )
G[idx] = mpi_copy( basearray[j] );
else
- barrett_mulm( G[idx], G[idx], basearray[j],
- m, barrett_y, barrett_k, barrett_r1, barrett_r2 );
+ mpi_mulm( G[idx], G[idx], basearray[j], m );
}
}
if( !G[idx] )
G[idx] = mpi_alloc(0);
}
}
- barrett_mulm(res, tmp, G[idx], m, barrett_y, barrett_k, barrett_r1, barrett_r2 );
+ mpi_mulm(res, tmp, G[idx], m );
}
/* cleanup */
mpi_free(tmp);
- #ifdef USE_BARRETT
- mpi_free(barrett_y);
- mpi_free(barrett_r1);
- mpi_free(barrett_r2);
- #endif
for(i=0; i < (1<<k); i++ )
mpi_free(G[i]);
m_free(G);
}
-
-#ifdef USE_BARRETT
-static void
-barrett_mulm( MPI w, MPI u, MPI v, MPI m, MPI y, int k, MPI r1, MPI r2 )
-{
- mpi_mul(w, u, v);
- if( calc_barrett( w, w, m, y, k, r1, r2 ) )
- mpi_fdiv_r( w, w, m );
-}
-
-/****************
- * Barrett precalculation: y = floor(b^(2k) / m)
- */
-static MPI
-init_barrett( MPI m, int *k, MPI *r1, MPI *r2 )
-{
- MPI tmp;
-
- mpi_normalize( m );
- *k = mpi_get_nlimbs( m );
- tmp = mpi_alloc( *k + 1 );
- mpi_set_ui( tmp, 1 );
- mpi_lshift_limbs( tmp, 2 * *k );
- mpi_fdiv_q( tmp, tmp, m );
- *r1 = mpi_alloc( 2* *k + 1 );
- *r2 = mpi_alloc( 2* *k + 1 );
- return tmp;
-}
-
-/****************
- * Barrett reduction: We assume that these conditions are met:
- * Given x =(x_2k-1 ...x_0)_b
- * m =(m_k-1 ....m_0)_b with m_k-1 != 0
- * Output r = x mod m
- * Before using this function init_barret must be used to calucalte y and k.
- * Returns: false = no error
- * true = can't perform barret reduction
- */
-static int
-calc_barrett( MPI r, MPI x, MPI m, MPI y, int k, MPI r1, MPI r2 )
-{
- int xx = k > 3 ? k-3:0;
-
- mpi_normalize( x );
- if( mpi_get_nlimbs(x) > 2*k )
- return 1; /* can't do it */
-
- /* 1. q1 = floor( x / b^k-1)
- * q2 = q1 * y
- * q3 = floor( q2 / b^k+1 )
- * Actually, we don't need qx, we can work direct on r2
- */
- mpi_set( r2, x );
- mpi_rshift_limbs( r2, k-1 );
- mpi_mul( r2, r2, y );
- mpi_rshift_limbs( r2, k+1 );
-
- /* 2. r1 = x mod b^k+1
- * r2 = q3 * m mod b^k+1
- * r = r1 - r2
- * 3. if r < 0 then r = r + b^k+1
- */
- mpi_set( r1, x );
- if( r1->nlimbs > k+1 ) /* quick modulo operation */
- r1->nlimbs = k+1;
- mpi_mul( r2, r2, m );
- if( r2->nlimbs > k+1 ) /* quick modulo operation */
- r2->nlimbs = k+1;
- mpi_sub( r, r1, r2 );
-
- if( mpi_is_neg( r ) ) {
- MPI tmp;
-
- tmp = mpi_alloc( k + 2 );
- mpi_set_ui( tmp, 1 );
- mpi_lshift_limbs( tmp, k+1 );
- mpi_add( r, r, tmp );
- mpi_free(tmp);
- }
-
- /* 4. while r >= m do r = r - m */
- while( mpi_cmp( r, m ) >= 0 )
- mpi_sub( r, r, m );
-
- return 0;
-}
-#endif /* USE_BARRETT */
-
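
For the record, the Barrett code deleted above (which its own comment noted was
slower than the classical method) followed the standard scheme documented in the
removed comments: with b the limb base and k the number of limbs of m,
init_barrett precomputed y, and calc_barrett reduced an x of at most 2k limbs.
Transcribed into LaTeX from those comments:

    y   = \left\lfloor b^{2k} / m \right\rfloor , \qquad
    q_3 = \left\lfloor \frac{\lfloor x / b^{k-1} \rfloor \cdot y}{b^{k+1}} \right\rfloor , \qquad
    r   = \left( x \bmod b^{k+1} \right) - \left( q_3 m \bmod b^{k+1} \right)

with b^{k+1} added back if r came out negative, and m subtracted repeatedly until
r < m.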
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index e8d55f9b9..fbd2cb8ef 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -1,6 +1,6 @@
/* mpi-pow.c - MPI functions
* Copyright (C) 1998 Free Software Foundation, Inc.
- * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -30,6 +30,7 @@
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "mpi-internal.h"
#include "longlong.h"
#include <assert.h>
@@ -159,7 +160,9 @@ mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
int c;
mpi_limb_t e;
mpi_limb_t carry_limb;
+ struct karatsuba_ctx karactx;
+ memset( &karactx, 0, sizeof karactx );
negative_result = (ep[0] & 1) && base->sign;
i = esize - 1;
@@ -177,6 +180,7 @@ mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
* by RP (==RES->d), and with 50% probability in the area originally
* pointed to by XP.
*/
+
for(;;) {
while( c ) {
mpi_ptr_t tp;
@@ -194,7 +198,6 @@ mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
mpi_free_limb_space( tspace );
tsize = 2 * rsize;
tspace = mpi_alloc_limb_space( tsize, 0 );
-
}
mpih_sqr_n( xp, rp, rsize, tspace );
}
@@ -209,7 +212,15 @@ mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
rsize = xsize;
if( (mpi_limb_signed_t)e < 0 ) {
- mpihelp_mul( xp, rp, rsize, bp, bsize );
+ /*mpihelp_mul( xp, rp, rsize, bp, bsize );*/
+ if( bsize < KARATSUBA_THRESHOLD ) {
+ mpihelp_mul( xp, rp, rsize, bp, bsize );
+ }
+ else {
+ mpihelp_mul_karatsuba_case(
+ xp, rp, rsize, bp, bsize, &karactx );
+ }
+
xsize = rsize + bsize;
if( xsize > msize ) {
mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize);
@@ -258,6 +269,8 @@ mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
if( mod_shift_cnt )
mpihelp_rshift( rp, rp, rsize, mod_shift_cnt);
MPN_NORMALIZE (rp, rsize);
+
+ mpihelp_release_karatsuba_ctx( &karactx );
}
if( negative_result && rsize ) {
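
The hunk above threads one karatsuba_ctx through the whole exponentiation loop, so
the Karatsuba scratch space is allocated at most once per mpi_powm call instead of
once per multiplication. The loop itself is ordinary left-to-right
square-and-multiply; a self-contained C sketch of that algorithm on single 32-bit
words (illustrative only, unrelated to the MPI limb arrays used above):

#include <stdint.h>

/* Textbook left-to-right square-and-multiply: compute base^exp mod mod.
 * 32-bit operands are used so every intermediate fits in a uint64_t. */
static uint32_t powm_u32(uint32_t base, uint32_t exp, uint32_t mod)
{
    uint64_t r = 1 % mod;
    int i;

    for (i = 31; i >= 0; i--) {
        r = (r * r) % mod;            /* always square                */
        if ((exp >> i) & 1)
            r = (r * base) % mod;     /* multiply when the bit is set */
    }
    return (uint32_t)r;
}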
diff --git a/mpi/mpih-div.c b/mpi/mpih-div.c
index 0d711cb58..bb837208b 100644
--- a/mpi/mpih-div.c
+++ b/mpi/mpih-div.c
@@ -338,7 +338,7 @@ mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs,
}
else {
n2 = np[dsize - 1];
- MPN_COPY_DECR (np + 1, np, dsize);
+ MPN_COPY_DECR (np + 1, np, dsize - 1);
np[0] = 0;
}
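
The fix matches the ChangeLog entry above: np[dsize-1] has already been saved into
n2, so only the remaining dsize-1 limbs need to be moved up by one position;
copying dsize limbs wrote np[dsize], one limb more than intended. A sketch of the
copy-decreasing idea, with a hypothetical helper standing in for the real
MPN_COPY_DECR macro:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for MPN_COPY_DECR: copy n limbs from src to dst
 * where the regions may overlap with dst > src, so walk from the top
 * down to avoid clobbering limbs that are still to be read. */
static void copy_decr(uint32_t *dst, const uint32_t *src, size_t n)
{
    size_t i;

    for (i = n; i-- > 0; )
        dst[i] = src[i];
}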
diff --git a/mpi/mpih-mul.c b/mpi/mpih-mul.c
index 7707c0e30..e1bfef55b 100644
--- a/mpi/mpih-mul.c
+++ b/mpi/mpih-mul.c
@@ -1,5 +1,5 @@
/* mpihelp-mul.c - MPI helper functions
- * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -29,6 +29,7 @@
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "mpi-internal.h"
#include "longlong.h"
@@ -372,6 +373,86 @@ mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
}
+
+void
+mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx )
+{
+ mpi_limb_t cy;
+
+ if( !ctx->tspace || ctx->tspace_size < vsize ) {
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ ctx->tspace = mpi_alloc_limb_space( 2 * vsize,
+ m_is_secure( up ) || m_is_secure( vp ) );
+ ctx->tspace_size = vsize;
+ }
+
+ MPN_MUL_N_RECURSE( prodp, up, vp, vsize, ctx->tspace );
+
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ if( usize >= vsize ) {
+ if( !ctx->tp || ctx->tp_size < vsize ) {
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ ctx->tp = mpi_alloc_limb_space( 2 * vsize, m_is_secure( up )
+ || m_is_secure( vp ) );
+ ctx->tp_size = vsize;
+ }
+
+ do {
+ MPN_MUL_N_RECURSE( ctx->tp, up, vp, vsize, ctx->tspace );
+ cy = mpihelp_add_n( prodp, prodp, ctx->tp, vsize );
+ mpihelp_add_1( prodp + vsize, ctx->tp + vsize, vsize, cy );
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ } while( usize >= vsize );
+ }
+
+ if( usize ) {
+ if( usize < KARATSUBA_THRESHOLD ) {
+ mpihelp_mul( ctx->tspace, vp, vsize, up, usize );
+ }
+ else {
+ if( !ctx->next ) {
+ ctx->next = m_alloc_clear( sizeof *ctx );
+ }
+ mpihelp_mul_karatsuba_case( ctx->tspace,
+ vp, vsize,
+ up, usize,
+ ctx->next );
+ }
+
+ cy = mpihelp_add_n( prodp, prodp, ctx->tspace, vsize);
+ mpihelp_add_1( prodp + vsize, ctx->tspace + vsize, usize, cy );
+ }
+}
+
+
+void
+mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx )
+{
+ struct karatsuba_ctx *ctx2;
+
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ for( ctx=ctx->next; ctx; ctx = ctx2 ) {
+ ctx2 = ctx->next;
+ if( ctx->tp )
+ mpi_free_limb_space( ctx->tp );
+ if( ctx->tspace )
+ mpi_free_limb_space( ctx->tspace );
+ m_free( ctx );
+ }
+}
+
/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
* and v (pointed to by VP, with VSIZE limbs), and store the result at
* PRODP. USIZE + VSIZE limbs are always stored, but if the input
@@ -393,7 +474,7 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
{
mpi_ptr_t prod_endp = prodp + usize + vsize - 1;
mpi_limb_t cy;
- mpi_ptr_t tspace;
+ struct karatsuba_ctx ctx;
if( vsize < KARATSUBA_THRESHOLD ) {
mpi_size_t i;
@@ -437,34 +518,9 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
return cy;
}
- tspace = mpi_alloc_limb_space( 2 * vsize,
- m_is_secure( up ) || m_is_secure( vp ) );
- MPN_MUL_N_RECURSE( prodp, up, vp, vsize, tspace );
-
- prodp += vsize;
- up += vsize;
- usize -= vsize;
- if( usize >= vsize ) {
- mpi_ptr_t tp = mpi_alloc_limb_space( 2 * vsize, m_is_secure( up )
- || m_is_secure( vp ) );
- do {
- MPN_MUL_N_RECURSE( tp, up, vp, vsize, tspace );
- cy = mpihelp_add_n( prodp, prodp, tp, vsize );
- mpihelp_add_1( prodp + vsize, tp + vsize, vsize, cy );
- prodp += vsize;
- up += vsize;
- usize -= vsize;
- } while( usize >= vsize );
- mpi_free_limb_space( tp );
- }
-
- if( usize ) {
- mpihelp_mul( tspace, vp, vsize, up, usize );
- cy = mpihelp_add_n( prodp, prodp, tspace, vsize);
- mpihelp_add_1( prodp + vsize, tspace + vsize, usize, cy );
- }
-
- mpi_free_limb_space( tspace );
+ memset( &ctx, 0, sizeof ctx );
+ mpihelp_mul_karatsuba_case( prodp, up, usize, vp, vsize, &ctx );
+ mpihelp_release_karatsuba_ctx( &ctx );
return *prod_endp;
}
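
The new interface moves the scratch buffers that mpihelp_mul used to allocate on
every call into a caller-owned karatsuba_ctx, which chains further contexts for
the recursive case. A usage sketch (hypothetical wrapper, not code from the patch;
it assumes usize >= vsize >= KARATSUBA_THRESHOLD and that prodp has room for
usize + vsize limbs, as the real callers guarantee):

#include <string.h>
#include "mpi-internal.h"   /* struct karatsuba_ctx, mpihelp_* declarations */

/* Multiply several equally sized operands by the same vp, reusing one
 * karatsuba_ctx so the temporary limb space is allocated once and
 * released once, which is the pattern mpi_powm now follows. */
static void
mul_many( mpi_ptr_t prodp, mpi_ptr_t *ups, int n,
          mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize )
{
    struct karatsuba_ctx ctx;
    int i;

    memset( &ctx, 0, sizeof ctx );            /* all-zero ctx: nothing allocated yet */
    for( i = 0; i < n; i++ )
        mpihelp_mul_karatsuba_case( prodp, ups[i], usize, vp, vsize, &ctx );
    mpihelp_release_karatsuba_ctx( &ctx );    /* frees tspace, tp and chained ctxs */
}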
diff --git a/mpi/power/distfiles b/mpi/power/distfiles
index e69de29bb..e664c8db6 100644
--- a/mpi/power/distfiles
+++ b/mpi/power/distfiles
@@ -0,0 +1,7 @@
+mpih-add1.S
+mpih-lshift.S
+mpih-mul1.S
+mpih-mul2.S
+mpih-mul3.S
+mpih-rshift.S
+mpih-sub1.S
diff --git a/mpi/power/mpih-add1.S b/mpi/power/mpih-add1.S
new file mode 100644
index 000000000..ad27f3d81
--- /dev/null
+++ b/mpi/power/mpih-add1.S
@@ -0,0 +1,86 @@
+/* IBM POWER add_n -- Add two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+ */
+
+ .toc
+ .extern mpihelp_add_n[DS]
+ .extern .mpihelp_add_n
+.csect [PR]
+ .align 2
+ .globl mpihelp_add_n
+ .globl .mpihelp_add_n
+ .csect mpihelp_add_n[DS]
+mpihelp_add_n:
+ .long .mpihelp_add_n, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_add_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ a 7,0,8 # add least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bne 1,L1 # branch if not
+ st 7,4(3)
+ aze 3,10 # use the fact that r10 is zero...
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ ae 7,0,8 # add limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ ae 11,9,10 # add previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ ae 7,0,8 # add previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: ae 11,9,10 # add limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ lil 3,0 # load cy into ...
+ aze 3,3 # ... return value register
+ br
+
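
As the file header says, this routine adds two limb vectors of equal, non-zero
length; the value returned in r3 is the final carry. A portable C illustration of
that contract with 32-bit limbs (not the actual implementation); mpih-sub1.S
further down is the mirror image, returning the final borrow:

#include <stddef.h>
#include <stdint.h>

/* Reference semantics for mpihelp_add_n: res[i] = s1[i] + s2[i] + carry
 * for i = 0 .. size-1, returning the carry out of the top limb (0 or 1). */
static uint32_t add_n_ref(uint32_t *res, const uint32_t *s1,
                          const uint32_t *s2, size_t size)
{
    uint32_t cy = 0;
    size_t i;

    for (i = 0; i < size; i++) {
        uint64_t t = (uint64_t)s1[i] + s2[i] + cy;
        res[i] = (uint32_t)t;
        cy = (uint32_t)(t >> 32);
    }
    return cy;
}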
diff --git a/mpi/power/mpih-lshift.S b/mpi/power/mpih-lshift.S
new file mode 100644
index 000000000..5c53a0ae6
--- /dev/null
+++ b/mpi/power/mpih-lshift.S
@@ -0,0 +1,64 @@
+/* IBM POWER lshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+ */
+
+ .toc
+ .extern mpihelp_lshift[DS]
+ .extern .mpihelp_lshift
+.csect [PR]
+ .align 2
+ .globl mpihelp_lshift
+ .globl .mpihelp_lshift
+ .csect mpihelp_lshift[DS]
+mpihelp_lshift:
+ .long .mpihelp_lshift, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_lshift:
+ sli 0,5,2
+ cax 9,3,0
+ cax 4,4,0
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ lu 0,-4(4) # read most significant limb
+ sre 3,0,8 # compute carry out limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,-4(4) # read 2:nd most significant limb
+ sreq 7,0,8 # compute most significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,-4(4) # load next lower limb
+ stu 7,-4(9) # store previous result during read latency
+ sreq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,-4(9) # store 2:nd least significant limb
+Lend2: sle 7,0,6 # compute least significant limb
+ st 7,-4(9) # store it
+ br
+
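
Here cnt is a bit count with 1 <= cnt < 32; the routine stores the shifted number
at res_ptr and returns the bits shifted out of the most significant limb. A
portable C sketch of those semantics (illustration only; like the assembly it
walks from the top down, so res may overlap s with res >= s). mpih-rshift.S later
in this diff is the right-shift counterpart:

#include <stddef.h>
#include <stdint.h>

/* Reference semantics for mpihelp_lshift with 32-bit limbs: shift the
 * size-limb number at s left by cnt bits (1 <= cnt < 32), store it at
 * res and return the cnt bits pushed out at the top. */
static uint32_t lshift_ref(uint32_t *res, const uint32_t *s,
                           size_t size, unsigned cnt)
{
    uint32_t ret = s[size - 1] >> (32 - cnt);
    size_t i;

    for (i = size - 1; i > 0; i--)
        res[i] = (s[i] << cnt) | (s[i - 1] >> (32 - cnt));
    res[0] = s[0] << cnt;
    return ret;
}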
diff --git a/mpi/power/mpih-mul1.S b/mpi/power/mpih-mul1.S
new file mode 100644
index 000000000..3b71b5aa9
--- /dev/null
+++ b/mpi/power/mpih-mul1.S
@@ -0,0 +1,115 @@
+/* IBM POWER mul_1 -- Multiply a limb vector with a limb and store
+ * the result in a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_mul_1[PR]
+ .align 2
+ .globl mpihelp_mul_1
+ .globl .mpihelp_mul_1
+ .csect mpihelp_mul_1[DS]
+mpihelp_mul_1:
+ .long .mpihelp_mul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_mul_1[PR]
+.mpihelp_mul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ ai 0,0,0 # reset carry
+ cax 9,9,7
+ blt Lneg
+Lpos: bdz Lend
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ cax 10,10,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,9
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ cax 9,9,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,10
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
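
The long comment at the top of this file (repeated in mpih-mul2.S and mpih-mul3.S
below) explains how an unsigned 32x32->64 product is built from POWER's signed
multiply. A standalone C illustration of that compensation, assuming 32-bit words
(for clarity only, not part of the patch):

#include <stdint.h>

/* Recover the unsigned 64-bit product of a and b from a signed
 * 32x32->64 multiply: start with the signed product and, working
 * mod 2^32 on the high word, add the multiplier if the multiplicand
 * has its sign bit set, and vice versa. */
static void umul32_via_signed(uint32_t *hi, uint32_t *lo,
                              uint32_t a, uint32_t b)
{
    int64_t sprod = (int64_t)(int32_t)a * (int32_t)b;  /* what "mul" + "mfmq" give */
    uint32_t h = (uint32_t)((uint64_t)sprod >> 32);
    uint32_t l = (uint32_t)sprod;

    if ((int32_t)a < 0)   /* a negative as signed: add b to the high word */
        h += b;
    if ((int32_t)b < 0)   /* b negative as signed: add a to the high word */
        h += a;

    *hi = h;
    *lo = l;
}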
diff --git a/mpi/power/mpih-mul2.S b/mpi/power/mpih-mul2.S
new file mode 100644
index 000000000..19ddee86d
--- /dev/null
+++ b/mpi/power/mpih-mul2.S
@@ -0,0 +1,130 @@
+/* IBM POWER addmul_1 -- Multiply a limb vector with a limb and add
+ * the result to a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_addmul_1[PR]
+ .align 2
+ .globl mpihelp_addmul_1
+ .globl .mpihelp_addmul_1
+ .csect mpihelp_addmul_1[DS]
+mpihelp_addmul_1:
+ .long .mpihelp_addmul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_addmul_1[PR]
+.mpihelp_addmul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 8
+ cax 9,9,7
+ l 7,4(3)
+ a 8,8,7 # add res_limb
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10
+ l 7,4(3)
+ aze 9,9
+ a 8,8,7
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 8,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 8,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
diff --git a/mpi/power/mpih-mul3.S b/mpi/power/mpih-mul3.S
new file mode 100644
index 000000000..e875e88ea
--- /dev/null
+++ b/mpi/power/mpih-mul3.S
@@ -0,0 +1,135 @@
+/* IBM POWER submul_1 -- Multiply a limb vector with a limb and subtract
+ * the result from a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result. We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+ .toc
+ .csect .mpihelp_submul_1[PR]
+ .align 2
+ .globl mpihelp_submul_1
+ .globl .mpihelp_submul_1
+ .csect mpihelp_submul_1[DS]
+mpihelp_submul_1:
+ .long .mpihelp_submul_1[PR], TOC[tc0], 0
+ .csect .mpihelp_submul_1[PR]
+.mpihelp_submul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 11
+ cax 9,9,7
+ l 7,4(3)
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 11,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 11,0,10
+ l 7,4(3)
+ aze 9,9
+ sf 8,11,7
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 11,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 11,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
+
diff --git a/mpi/power/mpih-rshift.S b/mpi/power/mpih-rshift.S
new file mode 100644
index 000000000..e29645072
--- /dev/null
+++ b/mpi/power/mpih-rshift.S
@@ -0,0 +1,64 @@
+/* IBM POWER rshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+*/
+
+ .toc
+ .extern mpihelp_rshift[DS]
+ .extern .mpihelp_rshift
+.csect [PR]
+ .align 2
+ .globl mpihelp_rshift
+ .globl .mpihelp_rshift
+ .csect mpihelp_rshift[DS]
+mpihelp_rshift:
+ .long .mpihelp_rshift, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_rshift:
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ l 0,0(4) # read least significant limb
+ ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
+ sle 3,0,8 # compute carry limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,4(4) # read 2:nd least significant limb
+ sleq 7,0,8 # compute least significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,4(4) # load next higher limb
+ stu 7,4(9) # store previous result during read latency
+ sleq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,4(9) # store 2:nd most significant limb
+Lend2: sre 7,0,6 # compute most significant limb
+ st 7,4(9) # store it
+ br
+
+
diff --git a/mpi/power/mpih-sub1.S b/mpi/power/mpih-sub1.S
new file mode 100644
index 000000000..a3605533e
--- /dev/null
+++ b/mpi/power/mpih-sub1.S
@@ -0,0 +1,87 @@
+/* IBM POWER sub_n -- Subtract two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+ */
+
+ .toc
+ .extern mpihelp_sub_n[DS]
+ .extern .mpihelp_sub_n
+.csect [PR]
+ .align 2
+ .globl mpihelp_sub_n
+ .globl .mpihelp_sub_n
+ .csect mpihelp_sub_n[DS]
+mpihelp_sub_n:
+ .long .mpihelp_sub_n, TOC[tc0], 0
+ .csect [PR]
+.mpihelp_sub_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ sf 7,0,8 # subtract least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Subtract the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bne 1,L1 # branch if not
+ st 7,4(3)
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br # return
+
+# We subtracted the least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 # subtract limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ sfe 11,10,9 # subtract previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ sfe 7,0,8 # subtract previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: sfe 11,10,9 # subtract limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br
+