From eed2faab53f859c98bf85f1e324614da640bf3ff Mon Sep 17 00:00:00 2001 From: Werner Koch Date: Tue, 26 May 1998 13:38:00 +0000 Subject: add-key works --- mpi/alpha/distfiles | 7 +- mpi/alpha/mpih-add1.S | 8 -- mpi/alpha/mpih-lshift.S | 122 +++++++++++++++++++++++++++ mpi/alpha/mpih-mul1.S | 89 ++++++++++++++++++++ mpi/alpha/mpih-mul2.S | 96 ++++++++++++++++++++++ mpi/alpha/mpih-mul3.S | 94 +++++++++++++++++++++ mpi/alpha/mpih-rshift.S | 120 +++++++++++++++++++++++++++ mpi/alpha/mpih-shift.S | 213 ------------------------------------------------ mpi/alpha/mpih-sub1.S | 123 ++++++++++++++++++++++++++++ 9 files changed, 650 insertions(+), 222 deletions(-) create mode 100644 mpi/alpha/mpih-lshift.S create mode 100644 mpi/alpha/mpih-mul1.S create mode 100644 mpi/alpha/mpih-mul2.S create mode 100644 mpi/alpha/mpih-mul3.S create mode 100644 mpi/alpha/mpih-rshift.S delete mode 100644 mpi/alpha/mpih-shift.S create mode 100644 mpi/alpha/mpih-sub1.S (limited to 'mpi/alpha') diff --git a/mpi/alpha/distfiles b/mpi/alpha/distfiles index e92d183d2..f2ab9fc3c 100644 --- a/mpi/alpha/distfiles +++ b/mpi/alpha/distfiles @@ -1,6 +1,11 @@ README mpih-add1.S -mpih-shift.S +mpih-sub1.S +mpih-mul1.S +mpih-mul2.S +mpih-mul3.S +mpih-lshift.S +mpih-rshift.S udiv-qrnnd.S diff --git a/mpi/alpha/mpih-add1.S b/mpi/alpha/mpih-add1.S index 54cec43fa..dc3bcfbb8 100644 --- a/mpi/alpha/mpih-add1.S +++ b/mpi/alpha/mpih-add1.S @@ -19,14 +19,6 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - * The GNU MP Library itself is published under the LGPL; - * however I decided to publish this code under the plain GPL. */ diff --git a/mpi/alpha/mpih-lshift.S b/mpi/alpha/mpih-lshift.S new file mode 100644 index 000000000..9688588fa --- /dev/null +++ b/mpi/alpha/mpih-lshift.S @@ -0,0 +1,122 @@ +/* alpha lshift + * Copyright (C) 1994, 1995 Free Software Foundation, Inc. + * Copyright (C) 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + + + +/******************* + * mpi_limb_t + * mpihelp_lshift( mpi_ptr_t wp, (r16) + * mpi_ptr_t up, (r17) + * mpi_size_t usize, (r18) + * unsigned cnt) (r19) + * + * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + * it would take 4 cycles/limb. It should be possible to get down to 3 + * cycles/limb since both ldq and stq can be paired with the other used + * instructions. But there are many restrictions in the 21064 pipeline that + * makes it hard, if not impossible, to get down to 3 cycles/limb: + * + * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + * 2. Only aligned instruction pairs can be paired. + * 3. The store buffer or silo might not be able to deal with the bandwidth. + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_lshift + .ent mpihelp_lshift +mpihelp_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $17,8,$17 + subq $31,$19,$7 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,.L0 + subq $18,$20,$18 + + .align 3 +.Loop0: + ldq $3,-8($17) + subq $16,8,$16 + subq $17,8,$17 + subq $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,0($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldq $3,-8($17) + subq $16,32,$16 + subq $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldq $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,24($16) + srl $4,$7,$2 + + ldq $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stq $8,16($16) + srl $3,$7,$6 + + ldq $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,8($16) + srl $4,$7,$2 + + subq $17,32,$17 + bis $1,$2,$8 + stq $8,0($16) + + bgt $18,.Loop + +.Lend: sll $4,$19,$8 + stq $8,-8($16) + ret $31,($26),1 + .end mpihelp_lshift + + diff --git a/mpi/alpha/mpih-mul1.S b/mpi/alpha/mpih-mul1.S new file mode 100644 index 000000000..5b24d98d1 --- /dev/null +++ b/mpi/alpha/mpih-mul1.S @@ -0,0 +1,89 @@ +/* Alpha 21064 mpih-mul1.S -- Multiply a limb vector with a limb and store + * the result in a second limb vector. + * + * Copyright (C) 1992, 1994, 1995, 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + + +/******************* + * mpi_limb_t + * mpihelp_mul_1( mpi_ptr_t res_ptr, (r16) + * mpi_ptr_t s1_ptr, (r17) + * mpi_size_t s1_size, (r18) + * mpi_limb_t s2_limb) (r19) + * + * This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5. + * + * To improve performance for long multiplications, we would use + * 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + * these instructions without slowing down the general code: 1. We can + * only have two prefetches in operation at any time in the Alpha + * architecture. 2. There will seldom be any special alignment + * between RES_PTR and S1_PTR. Maybe we can simply divide the current + * loop into an inner and outer loop, having the inner loop handle + * exactly one prefetch block? + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_mul_1 + .ent mpihelp_mul_1 2 +mpihelp_mul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + bic $31,$31,$4 # clear cy_limb + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,8($17) # $2 = s1_limb + subq $18,1,$18 # size-- + stq $3,0($16) + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,16($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + stq $3,8($16) + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $16,8,$16 # res_ptr++ + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + stq $3,8($16) + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: stq $3,0($16) + ret $31,($26),1 + + .end mpihelp_mul_1 + + diff --git a/mpi/alpha/mpih-mul2.S b/mpi/alpha/mpih-mul2.S new file mode 100644 index 000000000..0c8d361c3 --- /dev/null +++ b/mpi/alpha/mpih-mul2.S @@ -0,0 +1,96 @@ +/* Alpha 21064 addmul_1 -- Multiply a limb vector with a limb and add + * the result to a second limb vector. + * + * Copyright (C) 1992, 1994, 1995, 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + + +/******************* + * mpi_limb_t + * mpihelp_addmul_1( mpi_ptr_t res_ptr, (r16) + * mpi_ptr_t s1_ptr, (r17) + * mpi_size_t s1_size, (r18) + * mpi_limb_t s2_limb) (r19) + * + * This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. + */ + + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_addmul_1 + .ent mpihelp_addmul_1 2 +mpihelp_addmul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + addq $5,$3,$3 + cmpult $3,$5,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end mpihelp_addmul_1 + diff --git a/mpi/alpha/mpih-mul3.S b/mpi/alpha/mpih-mul3.S new file mode 100644 index 000000000..bdf16b57b --- /dev/null +++ b/mpi/alpha/mpih-mul3.S @@ -0,0 +1,94 @@ +/* Alpha 21064 submul_1 -- Multiply a limb vector with a limb and + * subtract the result from a second limb vector. + * Copyright (C) 1992, 1994, 1995, 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + + +/******************* + * mpi_limb_t + * mpihelp_submul_1( mpi_ptr_t res_ptr, (r16 ) + * mpi_ptr_t s1_ptr, (r17 ) + * mpi_size_t s1_size, (r18 ) + * mpi_limb_t s2_limb) (r19 ) + * + * This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_submul_1 + .ent mpihelp_submul_1 2 +mpihelp_submul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + subq $5,$3,$3 + cmpult $5,$3,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end mpihelp_submul_1 + diff --git a/mpi/alpha/mpih-rshift.S b/mpi/alpha/mpih-rshift.S new file mode 100644 index 000000000..e93315ae2 --- /dev/null +++ b/mpi/alpha/mpih-rshift.S @@ -0,0 +1,120 @@ +/* alpha rshift + * Copyright (C) 1994, 1995 Free Software Foundation, Inc. + * Copyright (C) 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + + + + +/******************* + * mpi_limb_t + * mpihelp_rshift( mpi_ptr_t wp, (r16) + * mpi_ptr_t up, (r17) + * mpi_size_t usize, (r18) + * unsigned cnt) (r19) + * + * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + * it would take 4 cycles/limb. It should be possible to get down to 3 + * cycles/limb since both ldq and stq can be paired with the other used + * instructions. But there are many restrictions in the 21064 pipeline that + * makes it hard, if not impossible, to get down to 3 cycles/limb: + * + * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + * 2. Only aligned instruction pairs can be paired. + * 3. The store buffer or silo might not be able to deal with the bandwidth. + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_rshift + .ent mpihelp_rshift +mpihelp_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + addq $17,8,$17 + subq $31,$19,$7 + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,.R0 + subq $18,$20,$18 + + .align 3 +.Roop0: + ldq $3,0($17) + addq $16,8,$16 + addq $17,8,$17 + subq $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,-8($16) + bne $20,.Roop0 + +.R0: beq $18,.Rend + + .align 3 +.Roop: ldq $3,0($17) + addq $16,32,$16 + subq $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldq $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-32($16) + sll $4,$7,$2 + + ldq $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stq $8,-24($16) + sll $3,$7,$6 + + ldq $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-16($16) + sll $4,$7,$2 + + addq $17,32,$17 + bis $1,$2,$8 + stq $8,-8($16) + + bgt $18,.Roop + +.Rend: srl $4,$19,$8 + stq $8,0($16) + ret $31,($26),1 + .end mpihelp_rshift + diff --git a/mpi/alpha/mpih-shift.S b/mpi/alpha/mpih-shift.S deleted file mode 100644 index 8bbd10cd3..000000000 --- a/mpi/alpha/mpih-shift.S +++ /dev/null @@ -1,213 +0,0 @@ -/* alpha rshift, lshift - * Copyright (C) 1994, 1995 Free Software Foundation, Inc. - * Copyright (C) 1998 Free Software Foundation, Inc. - * - * This file is part of GNUPG. - * - * GNUPG is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * GNUPG is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - * The GNU MP Library itself is published under the LGPL; - * however I decided to publish this code under the plain GPL. - */ - - - -/******************* - * mpi_limb_t - * mpihelp_lshift( mpi_ptr_t wp, (r16) - * mpi_ptr_t up, (r17) - * mpi_size_t usize, (r18) - * unsigned cnt) (r19) - * - * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, - * it would take 4 cycles/limb. It should be possible to get down to 3 - * cycles/limb since both ldq and stq can be paired with the other used - * instructions. But there are many restrictions in the 21064 pipeline that - * makes it hard, if not impossible, to get down to 3 cycles/limb: - * - * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. - * 2. Only aligned instruction pairs can be paired. - * 3. The store buffer or silo might not be able to deal with the bandwidth. - */ - - .set noreorder - .set noat -.text - .align 3 - .globl mpihelp_lshift - .ent mpihelp_lshift -mpihelp_lshift: - .frame $30,0,$26,0 - - s8addq $18,$17,$17 # make r17 point at end of s1 - ldq $4,-8($17) # load first limb - subq $17,8,$17 - subq $31,$19,$7 - s8addq $18,$16,$16 # make r16 point at end of RES - subq $18,1,$18 - and $18,4-1,$20 # number of limbs in first loop - srl $4,$7,$0 # compute function result - - beq $20,.L0 - subq $18,$20,$18 - - .align 3 -.Loop0: - ldq $3,-8($17) - subq $16,8,$16 - subq $17,8,$17 - subq $20,1,$20 - sll $4,$19,$5 - srl $3,$7,$6 - bis $3,$3,$4 - bis $5,$6,$8 - stq $8,0($16) - bne $20,.Loop0 - -.L0: beq $18,.Lend - - .align 3 -.Loop: ldq $3,-8($17) - subq $16,32,$16 - subq $18,4,$18 - sll $4,$19,$5 - srl $3,$7,$6 - - ldq $4,-16($17) - sll $3,$19,$1 - bis $5,$6,$8 - stq $8,24($16) - srl $4,$7,$2 - - ldq $3,-24($17) - sll $4,$19,$5 - bis $1,$2,$8 - stq $8,16($16) - srl $3,$7,$6 - - ldq $4,-32($17) - sll $3,$19,$1 - bis $5,$6,$8 - stq $8,8($16) - srl $4,$7,$2 - - subq $17,32,$17 - bis $1,$2,$8 - stq $8,0($16) - - bgt $18,.Loop - -.Lend: sll $4,$19,$8 - stq $8,-8($16) - ret $31,($26),1 - .end mpihelp_lshift - - - - - -/******************* - * mpi_limb_t - * mpihelp_rshift( mpi_ptr_t wp, (r16) - * mpi_ptr_t up, (r17) - * mpi_size_t usize, (r18) - * unsigned cnt) (r19) - * - * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, - * it would take 4 cycles/limb. It should be possible to get down to 3 - * cycles/limb since both ldq and stq can be paired with the other used - * instructions. But there are many restrictions in the 21064 pipeline that - * makes it hard, if not impossible, to get down to 3 cycles/limb: - * - * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. - * 2. Only aligned instruction pairs can be paired. - * 3. The store buffer or silo might not be able to deal with the bandwidth. - */ - - .set noreorder - .set noat -.text - .align 3 - .globl mpihelp_rshift - .ent mpihelp_rshift -mpihelp_rshift: - .frame $30,0,$26,0 - - ldq $4,0($17) # load first limb - addq $17,8,$17 - subq $31,$19,$7 - subq $18,1,$18 - and $18,4-1,$20 # number of limbs in first loop - sll $4,$7,$0 # compute function result - - beq $20,.R0 - subq $18,$20,$18 - - .align 3 -.Roop0: - ldq $3,0($17) - addq $16,8,$16 - addq $17,8,$17 - subq $20,1,$20 - srl $4,$19,$5 - sll $3,$7,$6 - bis $3,$3,$4 - bis $5,$6,$8 - stq $8,-8($16) - bne $20,.Roop0 - -.R0: beq $18,.Rend - - .align 3 -.Roop: ldq $3,0($17) - addq $16,32,$16 - subq $18,4,$18 - srl $4,$19,$5 - sll $3,$7,$6 - - ldq $4,8($17) - srl $3,$19,$1 - bis $5,$6,$8 - stq $8,-32($16) - sll $4,$7,$2 - - ldq $3,16($17) - srl $4,$19,$5 - bis $1,$2,$8 - stq $8,-24($16) - sll $3,$7,$6 - - ldq $4,24($17) - srl $3,$19,$1 - bis $5,$6,$8 - stq $8,-16($16) - sll $4,$7,$2 - - addq $17,32,$17 - bis $1,$2,$8 - stq $8,-8($16) - - bgt $18,.Roop - -.Rend: srl $4,$19,$8 - stq $8,0($16) - ret $31,($26),1 - .end mpihelp_rshift - diff --git a/mpi/alpha/mpih-sub1.S b/mpi/alpha/mpih-sub1.S new file mode 100644 index 000000000..bf614309b --- /dev/null +++ b/mpi/alpha/mpih-sub1.S @@ -0,0 +1,123 @@ +/* Alpha sub_n -- Subtract two limb vectors of the same length > 0 and + * store difference in a third limb vector. + * Copyright (C) 1995, 1998 Free Software Foundation, Inc. + * + * This file is part of GNUPG. + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + + +/******************* + * mpi_limb_t + * mpihelp_sub_n( mpi_ptr_t res_ptr, (r16) + * mpi_ptr_t s1_ptr, (r17) + * mpi_ptr_t s2_ptr, (r18) + * mpi_size_t size) (r19) + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_sub_n + .ent mpihelp_sub_n +mpihelp_sub_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end mpihelp_sub_n + + -- cgit v1.2.3