mat/internal/matfuncs/add_amd64.s

// Code generated by command: go run add_asm.go -out ../../matfuncs/add_amd64.s -stubs ../../matfuncs/add_amd64_stubs.go -pkg matfuncs. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// func AddAVX32(x1 []float32, x2 []float32, y []float32)
// Requires: AVX
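// Computes y[i] = x1[i] + x2[i] for i in [0, len(x1)).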
TEXT ·AddAVX32(SB), NOSPLIT, $0-72
MOVQ x1_base+0(FP), AX
MOVQ x2_base+24(FP), CX
MOVQ y_base+48(FP), DX
MOVQ x1_len+8(FP), BX
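// BX counts the elements still to process.
// Main loop: add 128 float32 values per iteration (16 YMM registers x 8 lanes, 512 bytes).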
unrolledLoop:
CMPQ BX, $0x00000080
JL singleRegisterLoop
VMOVUPS (AX), Y0
VMOVUPS 32(AX), Y1
VMOVUPS 64(AX), Y2
VMOVUPS 96(AX), Y3
VMOVUPS 128(AX), Y4
VMOVUPS 160(AX), Y5
VMOVUPS 192(AX), Y6
VMOVUPS 224(AX), Y7
VMOVUPS 256(AX), Y8
VMOVUPS 288(AX), Y9
VMOVUPS 320(AX), Y10
VMOVUPS 352(AX), Y11
VMOVUPS 384(AX), Y12
VMOVUPS 416(AX), Y13
VMOVUPS 448(AX), Y14
VMOVUPS 480(AX), Y15
VADDPS (CX), Y0, Y0
VADDPS 32(CX), Y1, Y1
VADDPS 64(CX), Y2, Y2
VADDPS 96(CX), Y3, Y3
VADDPS 128(CX), Y4, Y4
VADDPS 160(CX), Y5, Y5
VADDPS 192(CX), Y6, Y6
VADDPS 224(CX), Y7, Y7
VADDPS 256(CX), Y8, Y8
VADDPS 288(CX), Y9, Y9
VADDPS 320(CX), Y10, Y10
VADDPS 352(CX), Y11, Y11
VADDPS 384(CX), Y12, Y12
VADDPS 416(CX), Y13, Y13
VADDPS 448(CX), Y14, Y14
VADDPS 480(CX), Y15, Y15
VMOVUPS Y0, (DX)
VMOVUPS Y1, 32(DX)
VMOVUPS Y2, 64(DX)
VMOVUPS Y3, 96(DX)
VMOVUPS Y4, 128(DX)
VMOVUPS Y5, 160(DX)
VMOVUPS Y6, 192(DX)
VMOVUPS Y7, 224(DX)
VMOVUPS Y8, 256(DX)
VMOVUPS Y9, 288(DX)
VMOVUPS Y10, 320(DX)
VMOVUPS Y11, 352(DX)
VMOVUPS Y12, 384(DX)
VMOVUPS Y13, 416(DX)
VMOVUPS Y14, 448(DX)
VMOVUPS Y15, 480(DX)
ADDQ $0x00000200, AX
ADDQ $0x00000200, CX
ADDQ $0x00000200, DX
SUBQ $0x00000080, BX
JMP unrolledLoop
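// Add 8 float32 values (one YMM register) per iteration.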
singleRegisterLoop:
CMPQ BX, $0x00000008
JL tailLoop
VMOVUPS (AX), Y0
VADDPS (CX), Y0, Y0
VMOVUPS Y0, (DX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
SUBQ $0x00000008, BX
JMP singleRegisterLoop
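// Scalar loop for the remaining 0-7 elements.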
tailLoop:
CMPQ BX, $0x00000000
JE end
VMOVSS (AX), X0
VADDSS (CX), X0, X0
VMOVSS X0, (DX)
ADDQ $0x00000004, AX
ADDQ $0x00000004, CX
ADDQ $0x00000004, DX
DECQ BX
JMP tailLoop
end:
RET

// func AddAVX64(x1 []float64, x2 []float64, y []float64)
// Requires: AVX
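// Computes y[i] = x1[i] + x2[i] for i in [0, len(x1)).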
TEXT ·AddAVX64(SB), NOSPLIT, $0-72
MOVQ x1_base+0(FP), AX
MOVQ x2_base+24(FP), CX
MOVQ y_base+48(FP), DX
MOVQ x1_len+8(FP), BX
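// BX counts the elements still to process.
// Main loop: add 64 float64 values per iteration (16 YMM registers x 4 lanes, 512 bytes).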
unrolledLoop:
CMPQ BX, $0x00000040
JL singleRegisterLoop
VMOVUPD (AX), Y0
VMOVUPD 32(AX), Y1
VMOVUPD 64(AX), Y2
VMOVUPD 96(AX), Y3
VMOVUPD 128(AX), Y4
VMOVUPD 160(AX), Y5
VMOVUPD 192(AX), Y6
VMOVUPD 224(AX), Y7
VMOVUPD 256(AX), Y8
VMOVUPD 288(AX), Y9
VMOVUPD 320(AX), Y10
VMOVUPD 352(AX), Y11
VMOVUPD 384(AX), Y12
VMOVUPD 416(AX), Y13
VMOVUPD 448(AX), Y14
VMOVUPD 480(AX), Y15
VADDPD (CX), Y0, Y0
VADDPD 32(CX), Y1, Y1
VADDPD 64(CX), Y2, Y2
VADDPD 96(CX), Y3, Y3
VADDPD 128(CX), Y4, Y4
VADDPD 160(CX), Y5, Y5
VADDPD 192(CX), Y6, Y6
VADDPD 224(CX), Y7, Y7
VADDPD 256(CX), Y8, Y8
VADDPD 288(CX), Y9, Y9
VADDPD 320(CX), Y10, Y10
VADDPD 352(CX), Y11, Y11
VADDPD 384(CX), Y12, Y12
VADDPD 416(CX), Y13, Y13
VADDPD 448(CX), Y14, Y14
VADDPD 480(CX), Y15, Y15
VMOVUPD Y0, (DX)
VMOVUPD Y1, 32(DX)
VMOVUPD Y2, 64(DX)
VMOVUPD Y3, 96(DX)
VMOVUPD Y4, 128(DX)
VMOVUPD Y5, 160(DX)
VMOVUPD Y6, 192(DX)
VMOVUPD Y7, 224(DX)
VMOVUPD Y8, 256(DX)
VMOVUPD Y9, 288(DX)
VMOVUPD Y10, 320(DX)
VMOVUPD Y11, 352(DX)
VMOVUPD Y12, 384(DX)
VMOVUPD Y13, 416(DX)
VMOVUPD Y14, 448(DX)
VMOVUPD Y15, 480(DX)
ADDQ $0x00000200, AX
ADDQ $0x00000200, CX
ADDQ $0x00000200, DX
SUBQ $0x00000040, BX
JMP unrolledLoop
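// Add 4 float64 values (one YMM register) per iteration.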
singleRegisterLoop:
CMPQ BX, $0x00000004
JL tailLoop
VMOVUPD (AX), Y0
VADDPD (CX), Y0, Y0
VMOVUPD Y0, (DX)
ADDQ $0x00000020, AX
ADDQ $0x00000020, CX
ADDQ $0x00000020, DX
SUBQ $0x00000004, BX
JMP singleRegisterLoop
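// Scalar loop for the remaining 0-3 elements.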
tailLoop:
CMPQ BX, $0x00000000
JE end
VMOVSD (AX), X0
VADDSD (CX), X0, X0
VMOVSD X0, (DX)
ADDQ $0x00000008, AX
ADDQ $0x00000008, CX
ADDQ $0x00000008, DX
DECQ BX
JMP tailLoop
end:
RET

// func AddSSE32(x1 []float32, x2 []float32, y []float32)
// Requires: SSE
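// Computes y[i] = x1[i] + x2[i] for i in [0, len(x1)).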
TEXT ·AddSSE32(SB), NOSPLIT, $0-72
MOVQ x1_base+0(FP), AX
MOVQ x2_base+24(FP), CX
MOVQ y_base+48(FP), DX
MOVQ x1_len+8(FP), BX
CMPQ BX, $0x00000000
JE end
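// Add single elements until x2 (CX) is 16-byte aligned, so the packed ADDPS
// memory operands in the loops below use aligned addresses.
// SI = number of leading elements to handle: (16 - (CX mod 16)) / 4.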
MOVQ CX, SI
ANDQ $0x0000000f, SI
JZ unrolledLoop
XORQ $0x0000000f, SI
INCQ SI
SHRQ $0x02, SI
alignmentLoop:
MOVSS (AX), X0
ADDSS (CX), X0
MOVSS X0, (DX)
ADDQ $0x00000004, AX
ADDQ $0x00000004, CX
ADDQ $0x00000004, DX
DECQ BX
JZ end
DECQ SI
JNZ alignmentLoop
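// Main loop: add 64 float32 values per iteration (16 XMM registers x 4 lanes, 256 bytes).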
unrolledLoop:
CMPQ BX, $0x00000040
JL singleRegisterLoop
MOVUPS (AX), X0
MOVUPS 16(AX), X1
MOVUPS 32(AX), X2
MOVUPS 48(AX), X3
MOVUPS 64(AX), X4
MOVUPS 80(AX), X5
MOVUPS 96(AX), X6
MOVUPS 112(AX), X7
MOVUPS 128(AX), X8
MOVUPS 144(AX), X9
MOVUPS 160(AX), X10
MOVUPS 176(AX), X11
MOVUPS 192(AX), X12
MOVUPS 208(AX), X13
MOVUPS 224(AX), X14
MOVUPS 240(AX), X15
ADDPS (CX), X0
ADDPS 16(CX), X1
ADDPS 32(CX), X2
ADDPS 48(CX), X3
ADDPS 64(CX), X4
ADDPS 80(CX), X5
ADDPS 96(CX), X6
ADDPS 112(CX), X7
ADDPS 128(CX), X8
ADDPS 144(CX), X9
ADDPS 160(CX), X10
ADDPS 176(CX), X11
ADDPS 192(CX), X12
ADDPS 208(CX), X13
ADDPS 224(CX), X14
ADDPS 240(CX), X15
MOVUPS X0, (DX)
MOVUPS X1, 16(DX)
MOVUPS X2, 32(DX)
MOVUPS X3, 48(DX)
MOVUPS X4, 64(DX)
MOVUPS X5, 80(DX)
MOVUPS X6, 96(DX)
MOVUPS X7, 112(DX)
MOVUPS X8, 128(DX)
MOVUPS X9, 144(DX)
MOVUPS X10, 160(DX)
MOVUPS X11, 176(DX)
MOVUPS X12, 192(DX)
MOVUPS X13, 208(DX)
MOVUPS X14, 224(DX)
MOVUPS X15, 240(DX)
ADDQ $0x00000100, AX
ADDQ $0x00000100, CX
ADDQ $0x00000100, DX
SUBQ $0x00000040, BX
JMP unrolledLoop
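// Add 4 float32 values (one XMM register) per iteration.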
singleRegisterLoop:
CMPQ BX, $0x00000004
JL tailLoop
MOVUPS (AX), X0
ADDPS (CX), X0
MOVUPS X0, (DX)
ADDQ $0x00000010, AX
ADDQ $0x00000010, CX
ADDQ $0x00000010, DX
SUBQ $0x00000004, BX
JMP singleRegisterLoop
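// Scalar loop for the remaining 0-3 elements.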
tailLoop:
CMPQ BX, $0x00000000
JE end
MOVSS (AX), X0
ADDSS (CX), X0
MOVSS X0, (DX)
ADDQ $0x00000004, AX
ADDQ $0x00000004, CX
ADDQ $0x00000004, DX
DECQ BX
JMP tailLoop
end:
RET

// func AddSSE64(x1 []float64, x2 []float64, y []float64)
// Requires: SSE2
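// Computes y[i] = x1[i] + x2[i] for i in [0, len(x1)).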
TEXT ·AddSSE64(SB), NOSPLIT, $0-72
MOVQ x1_base+0(FP), AX
MOVQ x2_base+24(FP), CX
MOVQ y_base+48(FP), DX
MOVQ x1_len+8(FP), BX
CMPQ BX, $0x00000000
JE end
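// If x2 (CX) is not 16-byte aligned, add one element first so the packed ADDPD
// memory operands below use aligned addresses (a []float64 base is 8-byte
// aligned, so a single 8-byte step reaches a 16-byte boundary).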
MOVQ CX, SI
ANDQ $0x0000000f, SI
JZ unrolledLoop
MOVSD (AX), X0
ADDSD (CX), X0
MOVSD X0, (DX)
ADDQ $0x00000008, AX
ADDQ $0x00000008, CX
ADDQ $0x00000008, DX
DECQ BX
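// Main loop: add 32 float64 values per iteration (16 XMM registers x 2 lanes, 256 bytes).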
unrolledLoop:
CMPQ BX, $0x00000020
JL singleRegisterLoop
MOVUPD (AX), X0
MOVUPD 16(AX), X1
MOVUPD 32(AX), X2
MOVUPD 48(AX), X3
MOVUPD 64(AX), X4
MOVUPD 80(AX), X5
MOVUPD 96(AX), X6
MOVUPD 112(AX), X7
MOVUPD 128(AX), X8
MOVUPD 144(AX), X9
MOVUPD 160(AX), X10
MOVUPD 176(AX), X11
MOVUPD 192(AX), X12
MOVUPD 208(AX), X13
MOVUPD 224(AX), X14
MOVUPD 240(AX), X15
ADDPD (CX), X0
ADDPD 16(CX), X1
ADDPD 32(CX), X2
ADDPD 48(CX), X3
ADDPD 64(CX), X4
ADDPD 80(CX), X5
ADDPD 96(CX), X6
ADDPD 112(CX), X7
ADDPD 128(CX), X8
ADDPD 144(CX), X9
ADDPD 160(CX), X10
ADDPD 176(CX), X11
ADDPD 192(CX), X12
ADDPD 208(CX), X13
ADDPD 224(CX), X14
ADDPD 240(CX), X15
MOVUPD X0, (DX)
MOVUPD X1, 16(DX)
MOVUPD X2, 32(DX)
MOVUPD X3, 48(DX)
MOVUPD X4, 64(DX)
MOVUPD X5, 80(DX)
MOVUPD X6, 96(DX)
MOVUPD X7, 112(DX)
MOVUPD X8, 128(DX)
MOVUPD X9, 144(DX)
MOVUPD X10, 160(DX)
MOVUPD X11, 176(DX)
MOVUPD X12, 192(DX)
MOVUPD X13, 208(DX)
MOVUPD X14, 224(DX)
MOVUPD X15, 240(DX)
ADDQ $0x00000100, AX
ADDQ $0x00000100, CX
ADDQ $0x00000100, DX
SUBQ $0x00000020, BX
JMP unrolledLoop
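// Add 2 float64 values (one XMM register) per iteration.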
singleRegisterLoop:
CMPQ BX, $0x00000002
JL tailLoop
MOVUPD (AX), X0
ADDPD (CX), X0
MOVUPD X0, (DX)
ADDQ $0x00000010, AX
ADDQ $0x00000010, CX
ADDQ $0x00000010, DX
SUBQ $0x00000002, BX
JMP singleRegisterLoop
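// Scalar loop for the last element, if any.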
tailLoop:
CMPQ BX, $0x00000000
JE end
MOVSD (AX), X0
ADDSD (CX), X0
MOVSD X0, (DX)
ADDQ $0x00000008, AX
ADDQ $0x00000008, CX
ADDQ $0x00000008, DX
DECQ BX
JMP tailLoop
end:
RET