push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
emms
+ sub rsp, 376 ; # local variable stack space (n*16)
- sub rsp, 392 ; # local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb010_nouter], eax
jmp .nb010_threadloop
.nb010_end:
- emms
-
mov eax, [rsp + nb010_nouter]
mov ebx, [rsp + nb010_ninner]
mov rcx, [rbp + nb010_outeriter]
mov [rcx], eax
mov [rdx], ebx
- add rsp, 392
-
- pop r15
- pop r14
- pop r13
- pop r12
+ add rsp, 376
+ emms
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
.globl nb_kernel010nf_x86_64_sse
.globl _nb_kernel010nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 264 ; # local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 248 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 264
+ add rsp, 248
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 376 ; # local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 368 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 376
+ add rsp, 368
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel030nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 248 ; # local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 240 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 312 ; # local variable stack space (n*16+8)
+ sub rsp, 304 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 216 ; # local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 208 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 216
+ add rsp, 208
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
-
- sub rsp, 696
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 688
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 696
+ add rsp, 688
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel101nf_x86_64_sse
.globl _nb_kernel101nf_x86_64_sse
nb_kernel101nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 360
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 352
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1512
+ sub rsp, 1504
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb102_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1512
+ add rsp, 1504
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel102nf_x86_64_sse
.globl _nb_kernel102nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 776
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 768
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 776
+ add rsp, 768
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 696 ; # local variable stack space (n*16+8)
+ sub rsp, 688 ; # local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 696
+ add rsp, 688
emms
-
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel103nf_x86_64_sse
.globl _nb_kernel103nf_x86_64_sse
nb_kernel103nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 360
+ sub rsp, 352
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1512
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 1504
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1512
+ add rsp, 1504
emms
-
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel104nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 824
+ sub rsp, 816
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb104nf_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 824
+ add rsp, 816
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb110_nouter, 380
.equiv nb110_ninner, 384
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 408
+ sub rsp, 400
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov rdx, [rbp + nb110_inneriter]
mov [rcx], eax
mov [rdx], ebx
- add rsp, 408
+ add rsp, 400
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 280
+ sub rsp, 272
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb110nf_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 280
+ add rsp, 272
emms
-
- pop rbx
+
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb111_nouter, 772
.equiv nb111_ninner, 776
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 792
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 784
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
- ret
-
-
+ ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 424
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 416
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 424
+ add rsp, 416
emms
- pop rbx
+
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1592
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 1584
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1592
+ add rsp, 1584
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel112nf_x86_64_sse
.globl _nb_kernel112nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- sub rsp, 840
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 832
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 840
+ add rsp, 832
emms
- pop rbx
+
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 984
+ sub rsp, 976
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 984
+ add rsp, 976
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 520
+ sub rsp, 512
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1880
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 1872
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
jmp .nb114_threadloop
.nb114_end:
-
- emms
-
mov eax, [rsp + nb114_nouter]
mov ebx, [rsp + nb114_ninner]
mov rcx, [rbp + nb114_outeriter]
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1880
-
- pop r15
- pop r14
- pop r13
- pop r12
+ add rsp, 1872
+ emms
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
- ret
-
+ ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 968
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 960
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
jmp .nb114nf_threadloop
.nb114nf_end:
-
- emms
-
mov eax, [rsp + nb114nf_nouter]
mov ebx, [rsp + nb114nf_ninner]
mov rcx, [rbp + nb114nf_outeriter]
mov [rcx], eax
mov [rdx], ebx
- add rsp, 968
-
- pop r15
- pop r14
- pop r13
- pop r12
+ add rsp, 960
+ emms
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb130_nouter, 428
.equiv nb130_ninner, 432
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ sub rsp, 432 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel130nf_x86_64_sse
.globl _nb_kernel130nf_x86_64_sse
nb_kernel130nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 328 ;# local variable stack space (n*16+8)
+ sub rsp, 320 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 328
+ add rsp, 320
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 920 ;# local variable stack space (n*16+8)
+ sub rsp, 912 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 920
+ add rsp, 912
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel131nf_x86_64_sse
.globl _nb_kernel131nf_x86_64_sse
nb_kernel131nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 520 ;# local variable stack space (n*16+8)
+ sub rsp, 512 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ sub rsp, 1616 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel132nf_x86_64_sse
.globl _nb_kernel132nf_x86_64_sse
nb_kernel132nf_x86_64_sse:
.equiv nb132nf_nn1, 848
.equiv nb132nf_nouter, 852
.equiv nb132nf_ninner, 856
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 872 ;# local variable stack space (n*16+8)
+ sub rsp, 864 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb133_nouter, 1060
.equiv nb133_ninner, 1064
- push rbp
- mov rbp, rsp
- push rbx
-
-
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1080 ;# local variable stack space (n*16+8)
+ sub rsp, 1072 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1080
+ add rsp, 1072
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel133nf_x86_64_sse
.globl _nb_kernel133nf_x86_64_sse
.equiv nb133nf_nouter, 584
.equiv nb133nf_ninner, 588
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 600 ;# local variable stack space (n*16+8)
+ sub rsp, 592 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 600
+ add rsp, 592
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+ sub rsp, 1904 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel134nf_x86_64_sse
.globl _nb_kernel134nf_x86_64_sse
.equiv nb134nf_nouter, 980
.equiv nb134nf_ninner, 984
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+ sub rsp, 992 ;# local variable stack space (n*16+8)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 360 ;# local variable stack space (n*16+8)
+ sub rsp, 352 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel200nf_x86_64_sse
.globl _nb_kernel200nf_x86_64_sse
nb_kernel200nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 248 ;# local variable stack space (n*16+8)
+ sub rsp, 240 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 800 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel201nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 432 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
-
- sub rsp, 1544 ;# local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 1536 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1544
+ add rsp, 1536
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel202nf_x86_64_sse
.globl _nb_kernel202nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ sub rsp, 800 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ sub rsp, 800 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ sub rsp, 432 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1544 ;# local variable stack space (n*16+8)
+ sub rsp, 1536 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1544
+ add rsp, 1536
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel204nf_x86_64_sse
.globl _nb_kernel204nf_x86_64_sse
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ sub rsp, 800 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb210_nouter, 428
.equiv nb210_ninner, 432
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ sub rsp, 432 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel210nf_x86_64_sse
.globl _nb_kernel210nf_x86_64_sse
nb_kernel210nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ sub rsp, 304 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 872 ;# local variable stack space (n*16+8)
+ sub rsp, 864 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel211nf_x86_64_sse
.globl _nb_kernel211nf_x86_64_sse
nb_kernel211nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 488 ;# local variable stack space (n*16+8)
+ sub rsp, 480 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 488
+ add rsp, 480
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ sub rsp, 1616 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel212nf_x86_64_sse
.globl _nb_kernel212nf_x86_64_sse
nb_kernel212nf_x86_64_sse:
.equiv nb212nf_ninner, 840
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 856 ;# local variable stack space (n*16+8)
+ sub rsp, 848 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 856
+ add rsp, 848
emms
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1064 ;# local variable stack space (n*16+8)
+ sub rsp, 1056 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1064
+ add rsp, 1056
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel213nf_x86_64_sse
.globl _nb_kernel213nf_x86_64_sse
.equiv nb213nf_nn1, 580
.equiv nb213nf_nouter, 584
.equiv nb213nf_ninner, 588
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 600 ;# local variable stack space (n*16+8)
+	sub rsp, 592 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 600
+ add rsp, 592
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+	sub rsp, 1904 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb214nf_nouter, 980
.equiv nb214nf_ninner, 984
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+	sub rsp, 992 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb230_nouter, 428
.equiv nb230_ninner, 432
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 456 ;# local variable stack space (n*16+8)
+ sub rsp, 448 ;# local variable stack space (n*16)
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 456
+ add rsp, 448
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
	sub rsp, 328 ;# local variable stack space - NOTE(review): 328 = n*16+8, but the new 168-byte xmm save area already shifted rsp by 8; this likely leaves rsp at 8 mod 16 for movaps locals - probably should be 320 (n*16) like the other kernels; confirm
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb230nf_nouter], eax
	add rsp, 328 ;# NOTE(review): must stay in sync with the matching sub above (320 if that is corrected)
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb231_nouter, 900
.equiv nb231_ninner, 904
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 920 ; # local variable stack space (n*16+8)
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+ emms
+	sub rsp, 912 ; # local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 920
+ add rsp, 912
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb231nf_nouter, 532
.equiv nb231nf_ninner, 536
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 552 ; # local variable stack space (n*16+8)
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+ emms
+	sub rsp, 544 ; # local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 552
+ add rsp, 544
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+	sub rsp, 1616 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel232nf_x86_64_sse
.globl _nb_kernel232nf_x86_64_sse
nb_kernel232nf_x86_64_sse:
.equiv nb232nf_ninner, 856
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 872 ;# local variable stack space (n*16+8)
+	sub rsp, 864 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb233_nouter, 1068
.equiv nb233_ninner, 1072
- push rbp
- mov rbp, rsp
- push rbx
-
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+ emms
- sub rsp, 1096 ;# local variable stack space (n*16+8)
+	sub rsp, 1088 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1096
+ add rsp, 1088
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel233nf_x86_64_sse
.globl _nb_kernel233nf_x86_64_sse
.equiv nb233nf_nouter, 596
.equiv nb233nf_ninner, 600
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 616 ;# local variable stack space (n*16+8)
+	sub rsp, 608 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 616
+ add rsp, 608
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+	sub rsp, 1904 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel234nf_x86_64_sse
.globl _nb_kernel234nf_x86_64_sse
.equiv nb234nf_nouter, 980
.equiv nb234nf_ninner, 984
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+	sub rsp, 992 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb300_nouter, 360
.equiv nb300_ninner, 364
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 376 ;# local variable stack space (n*16+8)
+	sub rsp, 368 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 376
+ add rsp, 368
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel300nf_x86_64_sse
.globl _nb_kernel300nf_x86_64_sse
nb_kernel300nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 248 ;# local variable stack space (n*16+8)
+	sub rsp, 240 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
.equiv nb301_nouter, 804
.equiv nb301_ninner, 808
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 824 ;# local variable stack space (n*16+8)
+	sub rsp, 816 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 824
+ add rsp, 816
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 472 ;# local variable stack space (n*16+8)
+	sub rsp, 464 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1528 ;# local variable stack space (n*4+8)
+ sub rsp, 1520 ;# local variable stack space (n*4)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1528
+ add rsp, 1520
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb302nf_ninner, 776
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 792 ;# local variable stack space (n*4+8)
+ sub rsp, 784 ;# local variable stack space (n*4)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 824 ;# local variable stack space (n*16+8)
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
+ sub rsp, 816 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 824
+ add rsp, 816
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb303nf_ninner, 456
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ sub rsp, 464 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb304_ninner, 1512
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1528 ;# local variable stack space (n*16+8)
+ sub rsp, 1520 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1528
+ add rsp, 1520
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb304nf_ninner, 776
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ sub rsp, 784 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
-
.equiv nb310_nouter, 444
.equiv nb310_ninner, 448
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ sub rsp, 464 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel310nf_x86_64_sse
.globl _nb_kernel310nf_x86_64_sse
nb_kernel310nf_x86_64_sse:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ sub rsp, 304 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb311_ninner, 892
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 904 ;# local variable stack space (n*16+8)
+ sub rsp, 896 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 904
+ add rsp, 896
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb311nf_ninner, 508
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 520 ;# local variable stack space (n*16+8)
+ sub rsp, 512 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb312_ninner, 1608
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ sub rsp, 1616 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel312nf_x86_64_sse
.globl _nb_kernel312nf_x86_64_sse
.equiv nb312nf_ninner, 824
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 840 ;# local variable stack space (n*16+8)
+ sub rsp, 832 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 840
+ add rsp, 832
emms
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb313_ninner, 1036
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1048 ;# local variable stack space (n*16+8)
+ sub rsp, 1040 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1048
+ add rsp, 1040
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb313nf_ninner, 572
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 584 ;# local variable stack space (n*16+8)
+ sub rsp, 576 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 584
+ add rsp, 576
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
.equiv nb314_ninner, 1880
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1896 ;# local variable stack space (n*16+8)
+ sub rsp, 1888 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1896
+ add rsp, 1888
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb314nf_ninner, 968
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 984 ;# local variable stack space (n*16+8)
+ sub rsp, 976 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 984
+ add rsp, 976
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ sub rsp, 432 ;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 312 ;# local variable stack space (n*16+8)
+	sub rsp, 304		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
-
.equiv nb331_ninner, 860
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 872 ;# local variable stack space (n*16+8)
+	sub rsp, 864		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb331nf_ninner, 508
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 520 ;# local variable stack space (n*16+8)
+	sub rsp, 512		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb332_ninner, 1576
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- push r12
- push r13
- push r14
- push r15
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 1592 ;# local variable stack space (n*16+8)
+
+	sub rsp, 1584		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1592
+ add rsp, 1584
emms
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb332nf_ninner, 824
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- sub rsp, 840 ;# local variable stack space (n*16+8)
+
+	sub rsp, 832		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 840
+ add rsp, 832
emms
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb333_ninner, 1084
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1112 ;# local variable stack space (n*16+8)
+	sub rsp, 1104		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1112
+ add rsp, 1104
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb333nf_ninner, 588
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 600 ;# local variable stack space (n*16+8)
+	sub rsp, 592		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 600
+ add rsp, 592
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb334_ninner, 1848
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1864 ;# local variable stack space (n*16+8)
+	sub rsp, 1856		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1864
+ add rsp, 1856
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb334nf_nn1, 1024
.equiv nb334nf_nouter, 1028
.equiv nb334nf_ninner, 1032
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 1048 ;# local variable stack space (n*16+8)
+	sub rsp, 1040		;# local variable stack space (n*16)
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1048
+ add rsp, 1040
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb010_ntype, 360
.equiv nb010_nouter, 364
.equiv nb010_ninner, 368
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 392 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 384		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 392
+ add rsp, 384
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb010nf_ntype, 224
.equiv nb010nf_nouter, 228
.equiv nb010nf_ninner, 232
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 248 ;# local variable stack space (n*16+8)
+	sub rsp, 240		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
-
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 376 ;# local variable stack space (n*16+8)
+	sub rsp, 368		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 376
+ add rsp, 368
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel030nf_x86_64_sse2
.globl _nb_kernel030nf_x86_64_sse2
nb_kernel030nf_x86_64_sse2:
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
-
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 248 ;# local variable stack space (n*16+8)
+	sub rsp, 240		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb100_nn1, 288
.equiv nb100_nouter, 292
.equiv nb100_ninner, 296
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 304		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb100nf_nn1, 196
.equiv nb100nf_nouter, 200
.equiv nb100nf_ninner, 204
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
-
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 216 ;# local variable stack space (n*16+8)
+	sub rsp, 208		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 216
+ add rsp, 208
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb101_nn1, 672
.equiv nb101_nouter, 676
.equiv nb101_ninner, 680
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 696 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 688		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
jmp .nb101_threadloop
.nb101_end:
- emms
-
mov eax, [rsp + nb101_nouter]
mov ebx, [rsp + nb101_ninner]
mov rcx, [rbp + nb101_outeriter]
mov rdx, [rbp + nb101_inneriter]
mov [rcx], eax
mov [rdx], ebx
- add rsp, 696
-
- pop r15
- pop r14
- pop r13
- pop r12
+ add rsp, 688
+ emms
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb101nf_nn1, 336
.equiv nb101nf_nouter, 340
.equiv nb101nf_ninner, 344
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
+
emms
+	sub rsp, 352		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb101nf_nouter], eax
mov [rsp + nb101nf_ninner], eax
-
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 360 ;# local variable stack space (n*16+8)
-
-
-
mov edi, [rdi]
mov [rsp + nb101nf_nri], edi
mov [rsp + nb101nf_iinr], rsi
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb102_nn1, 1472
.equiv nb102_nouter, 1476
.equiv nb102_ninner, 1480
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1496 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1488		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1496
+ add rsp, 1488
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb102nf_nn1, 752
.equiv nb102nf_nouter, 756
.equiv nb102nf_ninner, 760
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 784		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb103_nn1, 672
.equiv nb103_nouter, 676
.equiv nb103_ninner, 680
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 696 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 688		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 696
+ add rsp, 688
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb103nf_nn1, 336
.equiv nb103nf_nouter, 340
.equiv nb103nf_ninner, 344
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 360 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 352		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb104_nn1, 1472
.equiv nb104_nouter, 1476
.equiv nb104_ninner, 1480
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1496 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1488		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1496
+ add rsp, 1488
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb104nf_nn1, 764
.equiv nb104nf_nouter, 768
.equiv nb104nf_ninner, 772
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 784		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb110_ntype, 376
.equiv nb110_nouter, 380
.equiv nb110_ninner, 384
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 408 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 400		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 408
+ add rsp, 400
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel110nf_x86_64_sse2
.globl _nb_kernel110nf_x86_64_sse2
nb_kernel110nf_x86_64_sse2:
.equiv nb110nf_ntype, 248
.equiv nb110nf_nouter, 252
.equiv nb110nf_ninner, 256
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 280 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 272		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 280
+ add rsp, 272
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
- ret
-
+ ret
.equiv nb111_nn1, 768
.equiv nb111_nouter, 772
.equiv nb111_ninner, 776
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 784		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel111nf_x86_64_sse2
.globl _nb_kernel111nf_x86_64_sse2
nb_kernel111nf_x86_64_sse2:
.equiv nb111nf_nn1, 388
.equiv nb111nf_nouter, 392
.equiv nb111nf_ninner, 396
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 408 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 400		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 408
+ add rsp, 400
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb112_nn1, 1564
.equiv nb112_nouter, 1568
.equiv nb112_ninner, 1572
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1592 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1584		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1592
+ add rsp, 1584
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb112nf_nn1, 800
.equiv nb112nf_nouter, 804
.equiv nb112nf_ninner, 808
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 824 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 816		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 824
+ add rsp, 816
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb113_nn1, 964
.equiv nb113_nouter, 968
.equiv nb113_ninner, 972
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 984 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 976		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 984
+ add rsp, 976
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel113nf_x86_64_sse2
.globl _nb_kernel113nf_x86_64_sse2
nb_kernel113nf_x86_64_sse2:
.equiv nb113nf_nn1, 452
.equiv nb113nf_nouter, 456
.equiv nb113nf_ninner, 460
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 464		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb114_nn1, 1856
.equiv nb114_nouter, 1860
.equiv nb114_ninner, 1864
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1880 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1872		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1880
+ add rsp, 1872
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel114nf_x86_64_sse2
.globl _nb_kernel114nf_x86_64_sse2
.equiv nb114nf_nn1, 944
.equiv nb114nf_nouter, 948
.equiv nb114nf_ninner, 952
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 968 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 960		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 968
+ add rsp, 960
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb130_ntype, 424
.equiv nb130_nouter, 428
.equiv nb130_ninner, 432
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 456 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 448		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 456
+ add rsp, 448
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 328 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 320		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 328
+ add rsp, 320
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb131_nouter, 900
.equiv nb131_ninner, 904
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 920 ; # local variable stack space (n*16+8)
+ emms
+	sub rsp, 912		; # local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 920
+ add rsp, 912
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel131nf_x86_64_sse2
.globl _nb_kernel131nf_x86_64_sse2
nb_kernel131nf_x86_64_sse2:
.equiv nb131nf_nn1, 496
.equiv nb131nf_nouter, 500
.equiv nb131nf_ninner, 504
-
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
- push r12
- push r13
- push r14
- push r15
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 520 ; # local variable stack space (n*16+8)
+ emms
+	sub rsp, 512		; # local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb132_nn1, 1600
.equiv nb132_nouter, 1604
.equiv nb132_ninner, 1608
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1616		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel132nf_x86_64_sse2
.globl _nb_kernel132nf_x86_64_sse2
.equiv nb132nf_nn1, 848
.equiv nb132nf_nouter, 852
.equiv nb132nf_ninner, 856
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
-
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 872 ;# local variable stack space (n*16+8)
+	sub rsp, 864		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb133_nn1, 1064
.equiv nb133_nouter, 1068
.equiv nb133_ninner, 1072
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1080 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1072		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1080
+ add rsp, 1072
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel133nf_x86_64_sse2
.globl _nb_kernel133nf_x86_64_sse2
nb_kernel133nf_x86_64_sse2:
.equiv nb133nf_nn1, 580
.equiv nb133nf_nouter, 584
.equiv nb133nf_ninner, 588
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 600 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 592		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb133nf_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 600
+ add rsp, 592
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb134_nn1, 1888
.equiv nb134_nouter, 1892
.equiv nb134_ninner, 1896
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1904		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel134nf_x86_64_sse2
.globl _nb_kernel134nf_x86_64_sse2
.equiv nb134nf_nn1, 976
.equiv nb134nf_nouter, 980
.equiv nb134nf_ninner, 984
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 992		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb200_nn1, 336
.equiv nb200_nouter, 340
.equiv nb200_ninner, 344
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 360 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 352		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 360
+ add rsp, 352
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
+
.globl nb_kernel200nf_x86_64_sse2
.globl _nb_kernel200nf_x86_64_sse2
nb_kernel200nf_x86_64_sse2:
.equiv nb200nf_nn1, 224
.equiv nb200nf_nouter, 228
.equiv nb200nf_ninner, 232
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 248 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 240		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb201_nn1, 784
.equiv nb201_nouter, 788
.equiv nb201_ninner, 792
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 800		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb201nf_nn1, 416
.equiv nb201nf_nouter, 420
.equiv nb201nf_ninner, 424
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 440 ;# local variable stack space (n*16+8)
+	sub rsp, 432		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb202_nn1, 1520
.equiv nb202_nouter, 1524
.equiv nb202_ninner, 1528
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1544 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1536		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1544
+ add rsp, 1536
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb202nf_nn1, 784
.equiv nb202nf_nouter, 788
.equiv nb202nf_ninner, 792
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 800		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb203_nn1, 784
.equiv nb203_nouter, 788
.equiv nb203_ninner, 792
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 800		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel203nf_x86_64_sse2
.globl _nb_kernel203nf_x86_64_sse2
nb_kernel203nf_x86_64_sse2:
.equiv nb203nf_nn1, 416
.equiv nb203nf_nouter, 420
.equiv nb203nf_ninner, 424
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 432		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb204_nn1, 1520
.equiv nb204_nouter, 1524
.equiv nb204_ninner, 1528
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1544 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1536		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1544
+ add rsp, 1536
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb204nf_nn1, 784
.equiv nb204nf_nouter, 788
.equiv nb204nf_ninner, 792
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 808 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 800		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 808
+ add rsp, 800
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb210_ntype, 424
.equiv nb210_nouter, 428
.equiv nb210_ninner, 432
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 456 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 448		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 456
+ add rsp, 448
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb210nf_ntype, 280
.equiv nb210nf_nouter, 284
.equiv nb210nf_ninner, 288
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 304		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb211_nn1, 852
.equiv nb211_nouter, 856
.equiv nb211_ninner, 860
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 872 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 864		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb211nf_nn1, 468
.equiv nb211nf_nouter, 472
.equiv nb211nf_ninner, 476
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 488 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 480		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 488
+ add rsp, 480
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb212_nn1, 1600
.equiv nb212_nouter, 1604
.equiv nb212_ninner, 1608
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1616		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel212nf_x86_64_sse2
.globl _nb_kernel212nf_x86_64_sse2
.equiv nb212nf_nn1, 832
.equiv nb212nf_nouter, 836
.equiv nb212nf_ninner, 840
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 856 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 848		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 856
+ add rsp, 848
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb213_nn1, 1044
.equiv nb213_nouter, 1048
.equiv nb213_ninner, 1052
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1064 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1056		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1064
+ add rsp, 1056
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb213nf_nn1, 580
.equiv nb213nf_nouter, 584
.equiv nb213nf_ninner, 588
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 600 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 592		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb213nf_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 600
+ add rsp, 592
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb214_nn1, 1888
.equiv nb214_nouter, 1892
.equiv nb214_ninner, 1896
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1904		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel214nf_x86_64_sse2
.globl _nb_kernel214nf_x86_64_sse2
.equiv nb214nf_nn1, 976
.equiv nb214nf_nouter, 980
.equiv nb214nf_ninner, 984
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 992		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb230_ntype, 424
.equiv nb230_nouter, 428
.equiv nb230_ninner, 432
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 456 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 448		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 456
+ add rsp, 448
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 328 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 320		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 328
+ add rsp, 320
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb231_nouter, 900
.equiv nb231_ninner, 904
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 920 ; # local variable stack space (n*16+8)
+ emms
+	sub rsp, 912		; # local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 920
+ add rsp, 912
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb231nf_nn1, 528
.equiv nb231nf_nouter, 532
.equiv nb231nf_ninner, 536
-
- push rbp
- mov rbp, rsp
- push rbx
-
- emms
- push r12
- push r13
- push r14
- push r15
+ push rbp
+ mov rbp, rsp
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 552 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 544		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 552
+ add rsp, 544
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb232_nn1, 1600
.equiv nb232_nouter, 1604
.equiv nb232_ninner, 1608
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1616		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel232nf_x86_64_sse2
.globl _nb_kernel232nf_x86_64_sse2
.equiv nb232nf_nn1, 848
.equiv nb232nf_nouter, 852
.equiv nb232nf_ninner, 856
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
-
emms
-
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 872 ;# local variable stack space (n*16+8)
+	sub rsp, 864		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
-
-
.equiv nb233_nn1, 1056
.equiv nb233_nouter, 1060
.equiv nb233_ninner, 1064
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1080 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1072		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1080
+ add rsp, 1072
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel233nf_x86_64_sse2
.globl _nb_kernel233nf_x86_64_sse2
nb_kernel233nf_x86_64_sse2:
.equiv nb233nf_nn1, 592
.equiv nb233nf_nouter, 596
.equiv nb233nf_ninner, 600
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 616 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 608		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rsp + nb233nf_nouter], eax
mov [rcx], eax
mov [rdx], ebx
- add rsp, 616
+ add rsp, 608
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb234_nn1, 1888
.equiv nb234_nouter, 1892
.equiv nb234_ninner, 1896
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1912 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1904		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1912
+ add rsp, 1904
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
+
.globl nb_kernel234nf_x86_64_sse2
.globl _nb_kernel234nf_x86_64_sse2
.equiv nb234nf_nn1, 976
.equiv nb234nf_nouter, 980
.equiv nb234nf_ninner, 984
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1000 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 992		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1000
+ add rsp, 992
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb300_nn1, 352
.equiv nb300_nouter, 356
.equiv nb300_ninner, 360
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- push r12
- push r13
- push r14
- push r15
-
- sub rsp, 376 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 368		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 376
+ add rsp, 368
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel300nf_x86_64_sse2
.equiv nb300nf_nn1, 224
.equiv nb300nf_nouter, 228
.equiv nb300nf_ninner, 232
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 248 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 240		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 248
+ add rsp, 240
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb301_nn1, 844
.equiv nb301_nouter, 848
.equiv nb301_ninner, 852
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 872 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 864		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel301nf_x86_64_sse2
.globl _nb_kernel301nf_x86_64_sse2
nb_kernel301nf_x86_64_sse2:
.equiv nb301nf_nn1, 448
.equiv nb301nf_nouter, 452
.equiv nb301nf_ninner, 456
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 464		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb302_nn1, 1504
.equiv nb302_nouter, 1508
.equiv nb302_ninner, 1512
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1528 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 1520		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1528
+ add rsp, 1520
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel302nf_x86_64_sse2
.globl _nb_kernel302nf_x86_64_sse2
nb_kernel302nf_x86_64_sse2:
.equiv nb302nf_nn1, 768
.equiv nb302nf_nouter, 772
.equiv nb302nf_ninner, 776
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 784		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb303_nn1, 800
.equiv nb303_nouter, 804
.equiv nb303_ninner, 808
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 824 ;# local variable stack space (n*16+8)
+ emms
+	sub rsp, 816		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 824
+ add rsp, 816
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb303nf_nn1, 448
.equiv nb303nf_nouter, 452
.equiv nb303nf_ninner, 456
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 464		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb304_nn1, 1504
.equiv nb304_nouter, 1508
.equiv nb304_ninner, 1512
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1528 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 1520		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1528
+ add rsp, 1520
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb304nf_nn1, 768
.equiv nb304nf_nouter, 772
.equiv nb304nf_ninner, 776
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 792 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 784		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 792
+ add rsp, 784
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb310_ntype, 440
.equiv nb310_nouter, 444
.equiv nb310_ninner, 448
+
push rbp
mov rbp, rsp
- push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 472 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 464		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 472
+ add rsp, 464
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel310nf_x86_64_sse2
.globl _nb_kernel310nf_x86_64_sse2
nb_kernel310nf_x86_64_sse2:
.equiv nb310nf_ntype, 280
.equiv nb310nf_nouter, 284
.equiv nb310nf_ninner, 288
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 304		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb311_nn1, 884
.equiv nb311_nouter, 888
.equiv nb311_ninner, 892
+
push rbp
mov rbp, rsp
- push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 904 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 896		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 904
+ add rsp, 896
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.globl nb_kernel311nf_x86_64_sse2
.globl _nb_kernel311nf_x86_64_sse2
nb_kernel311nf_x86_64_sse2:
.equiv nb311nf_nn1, 500
.equiv nb311nf_nouter, 504
.equiv nb311nf_ninner, 508
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 520 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 512		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb312_nn1, 1600
.equiv nb312_nouter, 1604
.equiv nb312_ninner, 1608
+
push rbp
mov rbp, rsp
- push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1624 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 1616		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1624
+ add rsp, 1616
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel312nf_x86_64_sse2
.globl _nb_kernel312nf_x86_64_sse2
.equiv nb312nf_nn1, 816
.equiv nb312nf_nouter, 820
.equiv nb312nf_ninner, 824
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 840 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 832		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 840
+ add rsp, 832
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb313_nn1, 1060
.equiv nb313_nouter, 1064
.equiv nb313_ninner, 1068
+
push rbp
mov rbp, rsp
- push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1080 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 1072		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1080
+ add rsp, 1072
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb313nf_nn1, 596
.equiv nb313nf_nouter, 600
.equiv nb313nf_ninner, 604
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 616 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 608		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 616
+ add rsp, 608
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
-
.equiv nb314_nn1, 1856
.equiv nb314_nouter, 1860
.equiv nb314_ninner, 1864
+
push rbp
mov rbp, rsp
- push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+
+ ;# Push integer registers on stack
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1880 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 1872		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1880
+ add rsp, 1872
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel314nf_x86_64_sse2
.globl _nb_kernel314nf_x86_64_sse2
.equiv nb314nf_nn1, 944
.equiv nb314nf_nouter, 948
.equiv nb314nf_ninner, 952
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 968 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 960		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 968
+ add rsp, 960
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb330_ntype, 408
.equiv nb330_nouter, 412
.equiv nb330_ninner, 416
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 440 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 432		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 440
+ add rsp, 432
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel330nf_x86_64_sse2
.globl _nb_kernel330nf_x86_64_sse2
nb_kernel330nf_x86_64_sse2:
.equiv nb330nf_ntype, 280
.equiv nb330nf_nouter, 284
.equiv nb330nf_ninner, 288
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 312 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 304		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 312
+ add rsp, 304
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb331_nn1, 852
.equiv nb331_nouter, 856
.equiv nb331_ninner, 860
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 872 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 864		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 872
+ add rsp, 864
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ 	;# Restore xmm registers from stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb331nf_nn1, 500
.equiv nb331nf_nouter, 504
.equiv nb331nf_ninner, 508
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 520 ;# local variable stack space (n*16+8)
+ emms
+ 	sub rsp, 512		;# local variable stack space (n*16)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 520
+ add rsp, 512
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb332_nn1, 1568
.equiv nb332_nouter, 1572
.equiv nb332_ninner, 1576
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1592 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 1584 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1592
+ add rsp, 1584
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
- ret
-
-
+ ret
.equiv nb332nf_nn1, 816
.equiv nb332nf_nouter, 820
.equiv nb332nf_ninner, 824
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 840 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 832 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 840
+ add rsp, 832
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb333_nn1, 1028
.equiv nb333_nouter, 1032
.equiv nb333_ninner, 1036
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1048 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 1040 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1048
+ add rsp, 1040
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.equiv nb333nf_nn1, 516
.equiv nb333nf_nouter, 520
.equiv nb333nf_ninner, 524
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 536 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 528 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 536
+ add rsp, 528
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
.equiv nb334_nn1, 1840
.equiv nb334_nouter, 1844
.equiv nb334_ninner, 1848
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 1864 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 1856 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 1864
+ add rsp, 1856
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
.globl nb_kernel334nf_x86_64_sse2
.globl _nb_kernel334nf_x86_64_sse2
.equiv nb334nf_nn1, 944
.equiv nb334nf_nouter, 948
.equiv nb334nf_ninner, 952
+
push rbp
mov rbp, rsp
+
+ ;# Push integer registers on stack
push rbx
- emms
-
- push r12
- push r13
- push r14
- push r15
+ push rsi
+ push rdi
+ push r12
+ push r13
+ push r14
+ push r15
+
+ ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
+ sub rsp, 168
+
+ ;# Save xmm registers to stack
+ movaps [rsp ], xmm6
+ movaps [rsp + 16 ], xmm7
+ movaps [rsp + 32 ], xmm8
+ movaps [rsp + 48 ], xmm9
+ movaps [rsp + 64 ], xmm10
+ movaps [rsp + 80 ], xmm11
+ movaps [rsp + 96 ], xmm12
+ movaps [rsp + 112], xmm13
+ movaps [rsp + 128], xmm14
+ movaps [rsp + 144], xmm15
+
+; .if 0 # block below only read by NASM - special calling convention on win64
+%ifidn __OUTPUT_FORMAT__, win64
+ ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
+ add rbp, 48
+ ;# Adjust stack pointer for different alignment
+ ;# Move around arguments to fit AMD64 convention below
+ ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
+ ;# win64 passes args in: rcx,rdx,r8,r9 + stack
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rbp]
+ mov r9, [rbp + 8]
+%endif
+; .endif # end NASM- and win64-specific block
- sub rsp, 968 ;# local variable stack space (n*16+8)
+ emms
+ sub rsp, 960 ;# local variable stack space (n*16+8)
;# zero 32-bit iteration counters
mov eax, 0
mov [rcx], eax
mov [rdx], ebx
- add rsp, 968
+ add rsp, 960
emms
-
- pop r15
- pop r14
- pop r13
- pop r12
-
- pop rbx
+ ;# Save xmm registers to stack
+ movaps xmm6, [rsp ]
+ movaps xmm7, [rsp + 16 ]
+ movaps xmm8, [rsp + 32 ]
+ movaps xmm9, [rsp + 48 ]
+ movaps xmm10, [rsp + 64 ]
+ movaps xmm11, [rsp + 80 ]
+ movaps xmm12, [rsp + 96 ]
+ movaps xmm13, [rsp + 112]
+ movaps xmm14, [rsp + 128]
+ movaps xmm15, [rsp + 144]
+
+ ;# Reset pointers after restoring xmm6-15
+ add rsp, 168
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
pop rbp
ret
-
nb_kernel_setup_x86_64_sse2(fplog,nb_kernel_list);
#endif
-#if defined(GMX_SSE2)
-# ifdef GMX_DOUBLE
- nb_kernel_setup_sse2_double(fplog,nb_kernel_list);
-# else
- nb_kernel_setup_sse2_single(fplog,nb_kernel_list);
-# endif
-#endif
-
#if (defined GMX_IA64_ASM && defined GMX_DOUBLE)
nb_kernel_setup_ia64_double(fplog,nb_kernel_list);
#endif
/* Not free energy */
kernelptr = nb_kernel_list[nrnb_ind];
-
+
if (kernelptr == NULL)
{
/* Call a generic nonbonded kernel */