nlpodyssey/spago

mat/internal/matfuncs/mulconst_amd64.s

// Code generated by command: go run mulconst_asm.go -out ../../matfuncs/mulconst_amd64.s -stubs ../../matfuncs/mulconst_amd64_stubs.go -pkg matfuncs. DO NOT EDIT.

//go:build amd64 && gc && !purego
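// The build constraints above limit this file to the standard (gc) toolchain
// on amd64; building with the purego tag excludes it, presumably selecting a
// pure-Go fallback instead.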

#include "textflag.h"
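// Each routine below scales a slice by a constant, writing y[i] = c * x[i]
// for i in [0, len(x)) and assuming the caller supplies y with at least
// len(x) elements. A sketch of the equivalent Go loop:
//
//	for i, v := range x {
//		y[i] = c * v
//	}
//
// All four routines share the same shape: c is broadcast into a vector
// register, then progressively narrower unrolled loops process 14, 8, 4, and
// 1 vector registers per iteration, and a scalar tail loop handles whatever
// remains.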

// func MulConstAVX32(c float32, x []float32, y []float32)
// Requires: AVX, AVX2, SSE
TEXT ·MulConstAVX32(SB), NOSPLIT, $0-56
    MOVSS        c+0(FP), X0
    MOVQ         x_base+8(FP), AX
    MOVQ         y_base+32(FP), CX
    MOVQ         x_len+16(FP), DX
    VBROADCASTSS X0, Y1
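    // At this point c has been broadcast into all eight float32 lanes of Y1,
    // AX points at x's backing array, CX at y's, and DX holds the count of
    // elements still to process. Each YMM register covers 8 float32 values,
    // so the 14-way body below consumes 14*8 = 112 (0x70) elements per pass;
    // once fewer than that remain, control falls through to the narrower loops.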

unrolledLoop14:
    CMPQ    DX, $0x00000070
    JL      unrolledLoop8
    VMULPS  (AX), Y1, Y2
    VMULPS  32(AX), Y1, Y3
    VMULPS  64(AX), Y1, Y4
    VMULPS  96(AX), Y1, Y5
    VMULPS  128(AX), Y1, Y6
    VMULPS  160(AX), Y1, Y7
    VMULPS  192(AX), Y1, Y8
    VMULPS  224(AX), Y1, Y9
    VMULPS  256(AX), Y1, Y10
    VMULPS  288(AX), Y1, Y11
    VMULPS  320(AX), Y1, Y12
    VMULPS  352(AX), Y1, Y13
    VMULPS  384(AX), Y1, Y14
    VMULPS  416(AX), Y1, Y15
    VMOVUPS Y2, (CX)
    VMOVUPS Y3, 32(CX)
    VMOVUPS Y4, 64(CX)
    VMOVUPS Y5, 96(CX)
    VMOVUPS Y6, 128(CX)
    VMOVUPS Y7, 160(CX)
    VMOVUPS Y8, 192(CX)
    VMOVUPS Y9, 224(CX)
    VMOVUPS Y10, 256(CX)
    VMOVUPS Y11, 288(CX)
    VMOVUPS Y12, 320(CX)
    VMOVUPS Y13, 352(CX)
    VMOVUPS Y14, 384(CX)
    VMOVUPS Y15, 416(CX)
    ADDQ    $0x000001c0, AX
    ADDQ    $0x000001c0, CX
    SUBQ    $0x00000070, DX
    JMP     unrolledLoop14

unrolledLoop8:
    CMPQ    DX, $0x00000040
    JL      unrolledLoop4
    VMULPS  (AX), Y1, Y2
    VMULPS  32(AX), Y1, Y3
    VMULPS  64(AX), Y1, Y4
    VMULPS  96(AX), Y1, Y5
    VMULPS  128(AX), Y1, Y6
    VMULPS  160(AX), Y1, Y7
    VMULPS  192(AX), Y1, Y8
    VMULPS  224(AX), Y1, Y9
    VMOVUPS Y2, (CX)
    VMOVUPS Y3, 32(CX)
    VMOVUPS Y4, 64(CX)
    VMOVUPS Y5, 96(CX)
    VMOVUPS Y6, 128(CX)
    VMOVUPS Y7, 160(CX)
    VMOVUPS Y8, 192(CX)
    VMOVUPS Y9, 224(CX)
    ADDQ    $0x00000100, AX
    ADDQ    $0x00000100, CX
    SUBQ    $0x00000040, DX
    JMP     unrolledLoop8

unrolledLoop4:
    CMPQ    DX, $0x00000020
    JL      unrolledLoop1
    VMULPS  (AX), Y1, Y2
    VMULPS  32(AX), Y1, Y3
    VMULPS  64(AX), Y1, Y4
    VMULPS  96(AX), Y1, Y5
    VMOVUPS Y2, (CX)
    VMOVUPS Y3, 32(CX)
    VMOVUPS Y4, 64(CX)
    VMOVUPS Y5, 96(CX)
    ADDQ    $0x00000080, AX
    ADDQ    $0x00000080, CX
    SUBQ    $0x00000020, DX
    JMP     unrolledLoop4

unrolledLoop1:
    CMPQ    DX, $0x00000008
    JL      tailLoop
    VMULPS  (AX), Y1, Y2
    VMOVUPS Y2, (CX)
    ADDQ    $0x00000020, AX
    ADDQ    $0x00000020, CX
    SUBQ    $0x00000008, DX
    JMP     unrolledLoop1

tailLoop:
    CMPQ  DX, $0x00000000
    JE    end
    MOVSS (AX), X1
    MULSS X0, X1
    MOVSS X1, (CX)
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    DECQ  DX
    JMP   tailLoop

end:
    RET

// func MulConstAVX64(c float64, x []float64, y []float64)
// Requires: AVX, AVX2, SSE2
TEXT ·MulConstAVX64(SB), NOSPLIT, $0-56
    MOVSD        c+0(FP), X0
    MOVQ         x_base+8(FP), AX
    MOVQ         y_base+32(FP), CX
    MOVQ         x_len+16(FP), DX
    VBROADCASTSD X0, Y1
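    // Same register layout as the float32 variant, but each YMM register
    // covers only 4 float64 values, so the 14-way body consumes 14*4 = 56
    // (0x38) elements per pass and the scalar tail advances in 8-byte steps
    // with MULSD.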

unrolledLoop14:
    CMPQ    DX, $0x00000038
    JL      unrolledLoop8
    VMULPD  (AX), Y1, Y2
    VMULPD  32(AX), Y1, Y3
    VMULPD  64(AX), Y1, Y4
    VMULPD  96(AX), Y1, Y5
    VMULPD  128(AX), Y1, Y6
    VMULPD  160(AX), Y1, Y7
    VMULPD  192(AX), Y1, Y8
    VMULPD  224(AX), Y1, Y9
    VMULPD  256(AX), Y1, Y10
    VMULPD  288(AX), Y1, Y11
    VMULPD  320(AX), Y1, Y12
    VMULPD  352(AX), Y1, Y13
    VMULPD  384(AX), Y1, Y14
    VMULPD  416(AX), Y1, Y15
    VMOVUPD Y2, (CX)
    VMOVUPD Y3, 32(CX)
    VMOVUPD Y4, 64(CX)
    VMOVUPD Y5, 96(CX)
    VMOVUPD Y6, 128(CX)
    VMOVUPD Y7, 160(CX)
    VMOVUPD Y8, 192(CX)
    VMOVUPD Y9, 224(CX)
    VMOVUPD Y10, 256(CX)
    VMOVUPD Y11, 288(CX)
    VMOVUPD Y12, 320(CX)
    VMOVUPD Y13, 352(CX)
    VMOVUPD Y14, 384(CX)
    VMOVUPD Y15, 416(CX)
    ADDQ    $0x000001c0, AX
    ADDQ    $0x000001c0, CX
    SUBQ    $0x00000038, DX
    JMP     unrolledLoop14

unrolledLoop8:
    CMPQ    DX, $0x00000020
    JL      unrolledLoop4
    VMULPD  (AX), Y1, Y2
    VMULPD  32(AX), Y1, Y3
    VMULPD  64(AX), Y1, Y4
    VMULPD  96(AX), Y1, Y5
    VMULPD  128(AX), Y1, Y6
    VMULPD  160(AX), Y1, Y7
    VMULPD  192(AX), Y1, Y8
    VMULPD  224(AX), Y1, Y9
    VMOVUPD Y2, (CX)
    VMOVUPD Y3, 32(CX)
    VMOVUPD Y4, 64(CX)
    VMOVUPD Y5, 96(CX)
    VMOVUPD Y6, 128(CX)
    VMOVUPD Y7, 160(CX)
    VMOVUPD Y8, 192(CX)
    VMOVUPD Y9, 224(CX)
    ADDQ    $0x00000100, AX
    ADDQ    $0x00000100, CX
    SUBQ    $0x00000020, DX
    JMP     unrolledLoop8

unrolledLoop4:
    CMPQ    DX, $0x00000010
    JL      unrolledLoop1
    VMULPD  (AX), Y1, Y2
    VMULPD  32(AX), Y1, Y3
    VMULPD  64(AX), Y1, Y4
    VMULPD  96(AX), Y1, Y5
    VMOVUPD Y2, (CX)
    VMOVUPD Y3, 32(CX)
    VMOVUPD Y4, 64(CX)
    VMOVUPD Y5, 96(CX)
    ADDQ    $0x00000080, AX
    ADDQ    $0x00000080, CX
    SUBQ    $0x00000010, DX
    JMP     unrolledLoop4

unrolledLoop1:
    CMPQ    DX, $0x00000004
    JL      tailLoop
    VMULPD  (AX), Y1, Y2
    VMOVUPD Y2, (CX)
    ADDQ    $0x00000020, AX
    ADDQ    $0x00000020, CX
    SUBQ    $0x00000004, DX
    JMP     unrolledLoop1

tailLoop:
    CMPQ  DX, $0x00000000
    JE    end
    MOVSD (AX), X1
    MULSD X0, X1
    MOVSD X1, (CX)
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    DECQ  DX
    JMP   tailLoop

end:
    RET

// func MulConstSSE32(c float32, x []float32, y []float32)
// Requires: SSE
TEXT ·MulConstSSE32(SB), NOSPLIT, $0-56
    MOVSS  c+0(FP), X0
    MOVQ   x_base+8(FP), AX
    MOVQ   y_base+32(FP), CX
    MOVQ   x_len+16(FP), DX
    SHUFPS $0x00, X0, X0
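    // SHUFPS $0x00 replicates c across all four float32 lanes of X0. Each XMM
    // register covers 4 float32 values, so the 14-way body below consumes
    // 14*4 = 56 (0x38) elements per pass.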

unrolledLoop14:
    CMPQ   DX, $0x00000038
    JL     unrolledLoop8
    MOVUPS (AX), X1
    MOVUPS 16(AX), X2
    MOVUPS 32(AX), X3
    MOVUPS 48(AX), X4
    MOVUPS 64(AX), X5
    MOVUPS 80(AX), X6
    MOVUPS 96(AX), X7
    MOVUPS 112(AX), X8
    MOVUPS 128(AX), X9
    MOVUPS 144(AX), X10
    MOVUPS 160(AX), X11
    MOVUPS 176(AX), X12
    MOVUPS 192(AX), X13
    MOVUPS 208(AX), X14
    MULPS  X0, X1
    MULPS  X0, X2
    MULPS  X0, X3
    MULPS  X0, X4
    MULPS  X0, X5
    MULPS  X0, X6
    MULPS  X0, X7
    MULPS  X0, X8
    MULPS  X0, X9
    MULPS  X0, X10
    MULPS  X0, X11
    MULPS  X0, X12
    MULPS  X0, X13
    MULPS  X0, X14
    MOVUPS X1, (CX)
    MOVUPS X2, 16(CX)
    MOVUPS X3, 32(CX)
    MOVUPS X4, 48(CX)
    MOVUPS X5, 64(CX)
    MOVUPS X6, 80(CX)
    MOVUPS X7, 96(CX)
    MOVUPS X8, 112(CX)
    MOVUPS X9, 128(CX)
    MOVUPS X10, 144(CX)
    MOVUPS X11, 160(CX)
    MOVUPS X12, 176(CX)
    MOVUPS X13, 192(CX)
    MOVUPS X14, 208(CX)
    ADDQ   $0x000000e0, AX
    ADDQ   $0x000000e0, CX
    SUBQ   $0x00000038, DX
    JMP    unrolledLoop14

unrolledLoop8:
    CMPQ   DX, $0x00000020
    JL     unrolledLoop4
    MOVUPS (AX), X1
    MOVUPS 16(AX), X2
    MOVUPS 32(AX), X3
    MOVUPS 48(AX), X4
    MOVUPS 64(AX), X5
    MOVUPS 80(AX), X6
    MOVUPS 96(AX), X7
    MOVUPS 112(AX), X8
    MULPS  X0, X1
    MULPS  X0, X2
    MULPS  X0, X3
    MULPS  X0, X4
    MULPS  X0, X5
    MULPS  X0, X6
    MULPS  X0, X7
    MULPS  X0, X8
    MOVUPS X1, (CX)
    MOVUPS X2, 16(CX)
    MOVUPS X3, 32(CX)
    MOVUPS X4, 48(CX)
    MOVUPS X5, 64(CX)
    MOVUPS X6, 80(CX)
    MOVUPS X7, 96(CX)
    MOVUPS X8, 112(CX)
    ADDQ   $0x00000080, AX
    ADDQ   $0x00000080, CX
    SUBQ   $0x00000020, DX
    JMP    unrolledLoop8

unrolledLoop4:
    CMPQ   DX, $0x00000010
    JL     unrolledLoop1
    MOVUPS (AX), X1
    MOVUPS 16(AX), X2
    MOVUPS 32(AX), X3
    MOVUPS 48(AX), X4
    MULPS  X0, X1
    MULPS  X0, X2
    MULPS  X0, X3
    MULPS  X0, X4
    MOVUPS X1, (CX)
    MOVUPS X2, 16(CX)
    MOVUPS X3, 32(CX)
    MOVUPS X4, 48(CX)
    ADDQ   $0x00000040, AX
    ADDQ   $0x00000040, CX
    SUBQ   $0x00000010, DX
    JMP    unrolledLoop4

unrolledLoop1:
    CMPQ   DX, $0x00000004
    JL     tailLoop
    MOVUPS (AX), X1
    MULPS  X0, X1
    MOVUPS X1, (CX)
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    SUBQ   $0x00000004, DX
    JMP    unrolledLoop1

tailLoop:
    CMPQ  DX, $0x00000000
    JE    end
    MOVSS (AX), X1
    MULSS X0, X1
    MOVSS X1, (CX)
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    DECQ  DX
    JMP   tailLoop

end:
    RET

// func MulConstSSE64(c float64, x []float64, y []float64)
// Requires: SSE2
TEXT ·MulConstSSE64(SB), NOSPLIT, $0-56
    MOVSD  c+0(FP), X0
    MOVQ   x_base+8(FP), AX
    MOVQ   y_base+32(FP), CX
    MOVQ   x_len+16(FP), DX
    SHUFPD $0x00, X0, X0
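    // SHUFPD $0x00 duplicates c into both float64 lanes of X0. Each XMM
    // register covers 2 float64 values, so the 14-way body below consumes
    // 14*2 = 28 (0x1c) elements per pass.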

unrolledLoop14:
    CMPQ   DX, $0x0000001c
    JL     unrolledLoop8
    MOVUPD (AX), X1
    MOVUPD 16(AX), X2
    MOVUPD 32(AX), X3
    MOVUPD 48(AX), X4
    MOVUPD 64(AX), X5
    MOVUPD 80(AX), X6
    MOVUPD 96(AX), X7
    MOVUPD 112(AX), X8
    MOVUPD 128(AX), X9
    MOVUPD 144(AX), X10
    MOVUPD 160(AX), X11
    MOVUPD 176(AX), X12
    MOVUPD 192(AX), X13
    MOVUPD 208(AX), X14
    MULPD  X0, X1
    MULPD  X0, X2
    MULPD  X0, X3
    MULPD  X0, X4
    MULPD  X0, X5
    MULPD  X0, X6
    MULPD  X0, X7
    MULPD  X0, X8
    MULPD  X0, X9
    MULPD  X0, X10
    MULPD  X0, X11
    MULPD  X0, X12
    MULPD  X0, X13
    MULPD  X0, X14
    MOVUPD X1, (CX)
    MOVUPD X2, 16(CX)
    MOVUPD X3, 32(CX)
    MOVUPD X4, 48(CX)
    MOVUPD X5, 64(CX)
    MOVUPD X6, 80(CX)
    MOVUPD X7, 96(CX)
    MOVUPD X8, 112(CX)
    MOVUPD X9, 128(CX)
    MOVUPD X10, 144(CX)
    MOVUPD X11, 160(CX)
    MOVUPD X12, 176(CX)
    MOVUPD X13, 192(CX)
    MOVUPD X14, 208(CX)
    ADDQ   $0x000000e0, AX
    ADDQ   $0x000000e0, CX
    SUBQ   $0x0000001c, DX
    JMP    unrolledLoop14

unrolledLoop8:
    CMPQ   DX, $0x00000010
    JL     unrolledLoop4
    MOVUPD (AX), X1
    MOVUPD 16(AX), X2
    MOVUPD 32(AX), X3
    MOVUPD 48(AX), X4
    MOVUPD 64(AX), X5
    MOVUPD 80(AX), X6
    MOVUPD 96(AX), X7
    MOVUPD 112(AX), X8
    MULPD  X0, X1
    MULPD  X0, X2
    MULPD  X0, X3
    MULPD  X0, X4
    MULPD  X0, X5
    MULPD  X0, X6
    MULPD  X0, X7
    MULPD  X0, X8
    MOVUPD X1, (CX)
    MOVUPD X2, 16(CX)
    MOVUPD X3, 32(CX)
    MOVUPD X4, 48(CX)
    MOVUPD X5, 64(CX)
    MOVUPD X6, 80(CX)
    MOVUPD X7, 96(CX)
    MOVUPD X8, 112(CX)
    ADDQ   $0x00000080, AX
    ADDQ   $0x00000080, CX
    SUBQ   $0x00000010, DX
    JMP    unrolledLoop8

unrolledLoop4:
    CMPQ   DX, $0x00000008
    JL     unrolledLoop1
    MOVUPD (AX), X1
    MOVUPD 16(AX), X2
    MOVUPD 32(AX), X3
    MOVUPD 48(AX), X4
    MULPD  X0, X1
    MULPD  X0, X2
    MULPD  X0, X3
    MULPD  X0, X4
    MOVUPD X1, (CX)
    MOVUPD X2, 16(CX)
    MOVUPD X3, 32(CX)
    MOVUPD X4, 48(CX)
    ADDQ   $0x00000040, AX
    ADDQ   $0x00000040, CX
    SUBQ   $0x00000008, DX
    JMP    unrolledLoop4

unrolledLoop1:
    CMPQ   DX, $0x00000002
    JL     tailLoop
    MOVUPD (AX), X1
    MULPD  X0, X1
    MOVUPD X1, (CX)
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    SUBQ   $0x00000002, DX
    JMP    unrolledLoop1

tailLoop:
    CMPQ  DX, $0x00000000
    JE    end
    MOVSD (AX), X1
    MULSD X0, X1
    MOVSD X1, (CX)
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    DECQ  DX
    JMP   tailLoop

end:
    RET