First Commit
This commit is contained in:
197
externals/libressl/crypto/bn/arch/amd64/bignum_sqr.S
vendored
Normal file
197
externals/libressl/crypto/bn/arch/amd64/bignum_sqr.S
vendored
Normal file
@@ -0,0 +1,197 @@
|
||||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Square z := x^2
|
||||
// Input x[n]; output z[k]
|
||||
//
|
||||
// extern void bignum_sqr
|
||||
// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
|
||||
//
|
||||
// Does the "z := x^2" operation where x is n digits and result z is k.
|
||||
// Truncates the result in general unless k >= 2 * n
|
||||
//
|
||||
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
|
||||
// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "s2n_bignum_internal.h"
|
||||
|
||||
.intel_syntax noprefix
|
||||
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
|
||||
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
|
||||
.text
|
||||
|
||||
// First three are where arguments come in, but n is moved.
|
||||
|
||||
#define p rdi
|
||||
#define z rsi
|
||||
#define x rcx
|
||||
#define n r8
|
||||
|
||||
// These are always local scratch since multiplier result is in these
|
||||
|
||||
#define a rax
|
||||
#define d rdx
|
||||
|
||||
// Other variables
|
||||
|
||||
#define i rbx
|
||||
#define ll rbp
|
||||
#define hh r9
|
||||
#define k r10
|
||||
#define y r11
|
||||
#define htop r12
|
||||
#define l r13
|
||||
#define h r14
|
||||
#define c r15
|
||||
|
||||
// Short versions
|
||||
|
||||
#define llshort ebp
|
||||
|
||||
S2N_BN_SYMBOL(bignum_sqr):
|
||||
endbr64
|
||||
|
||||
#if WINDOWS_ABI
|
||||
push rdi
|
||||
push rsi
|
||||
mov rdi, rcx
|
||||
mov rsi, rdx
|
||||
mov rdx, r8
|
||||
mov rcx, r9
|
||||
#endif
|
||||
|
||||
// We use too many registers, and also we need rax:rdx for multiplications
|
||||
|
||||
push rbx
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov n, rdx
|
||||
|
||||
// If p = 0 the result is trivial and nothing needs doing
|
||||
|
||||
test p, p
|
||||
jz end
|
||||
|
||||
// initialize (hh,ll) = 0
|
||||
|
||||
xor llshort, llshort
|
||||
xor hh, hh
|
||||
|
||||
// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
|
||||
|
||||
xor k, k
|
||||
|
||||
outerloop:
|
||||
|
||||
// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
|
||||
// We want to accumulate all x[i] * x[k - i] for bot <= i < top
|
||||
// For the optimization of squaring we avoid duplication and do
|
||||
// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
|
||||
// Initialize i = bot; in fact just compute bot as i directly.
|
||||
|
||||
xor c, c
|
||||
lea i, [k+1]
|
||||
mov htop, i
|
||||
shr htop, 1
|
||||
sub i, n
|
||||
cmovc i, c
|
||||
cmp htop, n
|
||||
cmovnc htop, n
|
||||
|
||||
// Initialize the three-part local sum (c,h,l); c was already done above
|
||||
|
||||
xor l, l
|
||||
xor h, h
|
||||
|
||||
// If htop <= bot then main doubled part of the sum is empty
|
||||
|
||||
cmp i, htop
|
||||
jnc nosumming
|
||||
|
||||
// Use a moving pointer for [y] = x[k-i] for the cofactor
|
||||
|
||||
mov a, k
|
||||
sub a, i
|
||||
lea y, [x+8*a]
|
||||
|
||||
// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
|
||||
|
||||
innerloop:
|
||||
mov a, [x+8*i]
|
||||
mul QWORD PTR [y]
|
||||
add l, a
|
||||
adc h, d
|
||||
adc c, 0
|
||||
sub y, 8
|
||||
inc i
|
||||
cmp i, htop
|
||||
jc innerloop
|
||||
|
||||
// Now double it
|
||||
|
||||
add l, l
|
||||
adc h, h
|
||||
adc c, c
|
||||
|
||||
// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
|
||||
|
||||
nosumming:
|
||||
test k, 1
|
||||
jnz innerend
|
||||
cmp i, n
|
||||
jnc innerend
|
||||
|
||||
mov a, [x+8*i]
|
||||
mul a
|
||||
add l, a
|
||||
adc h, d
|
||||
adc c, 0
|
||||
|
||||
// Now add the local sum into the global sum, store and shift
|
||||
|
||||
innerend:
|
||||
add l, ll
|
||||
mov [z+8*k], l
|
||||
adc h, hh
|
||||
mov ll, h
|
||||
adc c, 0
|
||||
mov hh, c
|
||||
|
||||
inc k
|
||||
cmp k, p
|
||||
jc outerloop
|
||||
|
||||
// Restore registers and return
|
||||
|
||||
end:
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
#if WINDOWS_ABI
|
||||
pop rsi
|
||||
pop rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
Reference in New Issue
Block a user