nlpodyssey/spago
mat/internal/matfuncs/dotprod_amd64.s
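
This file contains amd64 assembly kernels for the dot product of two float32 or float64 slices, with AVX/FMA and plain SSE variants. As a rough orientation, the pure-Go sketch below shows the operation they accelerate; the name dotProdRef32 is hypothetical and not part of the package (the exported Go signatures are generated into dotprod_amd64_stubs.go, as the header comment below states). The vectorized kernels keep several independent partial sums, so the final result can differ from this strict left-to-right loop by ordinary floating-point rounding.

package matfuncs

// dotProdRef32 is a hypothetical scalar reference for the kernels in this
// file: the sum of x1[i]*x2[i] over len(x1) elements, the count the assembly
// takes from x1's slice header. Like the assembly, it assumes
// len(x2) >= len(x1).
func dotProdRef32(x1, x2 []float32) float32 {
	var sum float32
	for i, v := range x1 {
		sum += v * x2[i]
	}
	return sum
}
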
// Code generated by command: go run dotprod_asm.go -out ../../matfuncs/dotprod_amd64.s -stubs ../../matfuncs/dotprod_amd64_stubs.go -pkg matfuncs. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// func DotProdAVX32(x1 []float32, x2 []float32) float32
// Requires: AVX, FMA3, SSE
TEXT ·DotProdAVX32(SB), NOSPLIT, $0-52
    MOVQ x1_base+0(FP), AX
    MOVQ x2_base+24(FP), CX
    MOVQ x1_len+8(FP), DX
    VZEROALL
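    // VZEROALL clears every YMM register; Y0 and Y1 then serve as two
    // independent accumulators so consecutive fused multiply-adds do not
    // form a single dependency chain.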

unrolledLoop14:
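    // Main loop: 112 float32 per iteration (14 YMM loads of 8 lanes each),
    // folded into Y0/Y1 with VFMADD231PS; both pointers advance by 448 bytes.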
    CMPQ        DX, $0x00000070
    JL          unrolledLoop8
    VMOVUPS     (AX), Y2
    VMOVUPS     32(AX), Y3
    VMOVUPS     64(AX), Y4
    VMOVUPS     96(AX), Y5
    VMOVUPS     128(AX), Y6
    VMOVUPS     160(AX), Y7
    VMOVUPS     192(AX), Y8
    VMOVUPS     224(AX), Y9
    VMOVUPS     256(AX), Y10
    VMOVUPS     288(AX), Y11
    VMOVUPS     320(AX), Y12
    VMOVUPS     352(AX), Y13
    VMOVUPS     384(AX), Y14
    VMOVUPS     416(AX), Y15
    VFMADD231PS (CX), Y2, Y0
    VFMADD231PS 32(CX), Y3, Y1
    VFMADD231PS 64(CX), Y4, Y0
    VFMADD231PS 96(CX), Y5, Y1
    VFMADD231PS 128(CX), Y6, Y0
    VFMADD231PS 160(CX), Y7, Y1
    VFMADD231PS 192(CX), Y8, Y0
    VFMADD231PS 224(CX), Y9, Y1
    VFMADD231PS 256(CX), Y10, Y0
    VFMADD231PS 288(CX), Y11, Y1
    VFMADD231PS 320(CX), Y12, Y0
    VFMADD231PS 352(CX), Y13, Y1
    VFMADD231PS 384(CX), Y14, Y0
    VFMADD231PS 416(CX), Y15, Y1
    ADDQ        $0x000001c0, AX
    ADDQ        $0x000001c0, CX
    SUBQ        $0x00000070, DX
    JMP         unrolledLoop14

unrolledLoop8:
    CMPQ        DX, $0x00000040
    JL          unrolledLoop4
    VMOVUPS     (AX), Y2
    VMOVUPS     32(AX), Y3
    VMOVUPS     64(AX), Y4
    VMOVUPS     96(AX), Y5
    VMOVUPS     128(AX), Y6
    VMOVUPS     160(AX), Y7
    VMOVUPS     192(AX), Y8
    VMOVUPS     224(AX), Y9
    VFMADD231PS (CX), Y2, Y0
    VFMADD231PS 32(CX), Y3, Y1
    VFMADD231PS 64(CX), Y4, Y0
    VFMADD231PS 96(CX), Y5, Y1
    VFMADD231PS 128(CX), Y6, Y0
    VFMADD231PS 160(CX), Y7, Y1
    VFMADD231PS 192(CX), Y8, Y0
    VFMADD231PS 224(CX), Y9, Y1
    ADDQ        $0x00000100, AX
    ADDQ        $0x00000100, CX
    SUBQ        $0x00000040, DX
    JMP         unrolledLoop8

unrolledLoop4:
    CMPQ        DX, $0x00000020
    JL          unrolledLoop1
    VMOVUPS     (AX), Y2
    VMOVUPS     32(AX), Y3
    VMOVUPS     64(AX), Y4
    VMOVUPS     96(AX), Y5
    VFMADD231PS (CX), Y2, Y0
    VFMADD231PS 32(CX), Y3, Y1
    VFMADD231PS 64(CX), Y4, Y0
    VFMADD231PS 96(CX), Y5, Y1
    ADDQ        $0x00000080, AX
    ADDQ        $0x00000080, CX
    SUBQ        $0x00000020, DX
    JMP         unrolledLoop4

unrolledLoop1:
    CMPQ        DX, $0x00000008
    JL          tail
    VMOVUPS     (AX), Y2
    VFMADD231PS (CX), Y2, Y0
    ADDQ        $0x00000020, AX
    ADDQ        $0x00000020, CX
    SUBQ        $0x00000008, DX
    JMP         unrolledLoop1

tail:
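    // Fewer than 8 elements remain; X2 accumulates them one scalar at a time.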
    VXORPS X2, X2, X2

tailLoop:
    CMPQ        DX, $0x00000000
    JE          reduce
    VMOVSS      (AX), X3
    VFMADD231SS (CX), X3, X2
    ADDQ        $0x00000004, AX
    ADDQ        $0x00000004, CX
    DECQ        DX
    JMP         tailLoop

reduce:
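    // Add the two vector accumulators, fold the upper 128 bits onto the lower,
    // add the scalar tail, then horizontally reduce the 4 remaining lanes.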
    VADDPS       Y0, Y1, Y0
    VEXTRACTF128 $0x01, Y0, X1
    VADDPS       X0, X1, X0
    VADDPS       X0, X2, X0
    VHADDPS      X0, X0, X0
    VHADDPS      X0, X0, X0
    MOVSS        X0, ret+48(FP)
    RET

// func DotProdAVX64(x1 []float64, x2 []float64) float64
// Requires: AVX, FMA3, SSE2
TEXT ·DotProdAVX64(SB), NOSPLIT, $0-56
    MOVQ x1_base+0(FP), AX
    MOVQ x2_base+24(FP), CX
    MOVQ x1_len+8(FP), DX
    VZEROALL
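    // Same scheme as DotProdAVX32, but each YMM register holds 4 float64 lanes.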

unrolledLoop14:
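    // Main loop: 56 float64 per iteration (14 YMM loads of 4 lanes each).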
    CMPQ        DX, $0x00000038
    JL          unrolledLoop8
    VMOVUPD     (AX), Y2
    VMOVUPD     32(AX), Y3
    VMOVUPD     64(AX), Y4
    VMOVUPD     96(AX), Y5
    VMOVUPD     128(AX), Y6
    VMOVUPD     160(AX), Y7
    VMOVUPD     192(AX), Y8
    VMOVUPD     224(AX), Y9
    VMOVUPD     256(AX), Y10
    VMOVUPD     288(AX), Y11
    VMOVUPD     320(AX), Y12
    VMOVUPD     352(AX), Y13
    VMOVUPD     384(AX), Y14
    VMOVUPD     416(AX), Y15
    VFMADD231PD (CX), Y2, Y0
    VFMADD231PD 32(CX), Y3, Y1
    VFMADD231PD 64(CX), Y4, Y0
    VFMADD231PD 96(CX), Y5, Y1
    VFMADD231PD 128(CX), Y6, Y0
    VFMADD231PD 160(CX), Y7, Y1
    VFMADD231PD 192(CX), Y8, Y0
    VFMADD231PD 224(CX), Y9, Y1
    VFMADD231PD 256(CX), Y10, Y0
    VFMADD231PD 288(CX), Y11, Y1
    VFMADD231PD 320(CX), Y12, Y0
    VFMADD231PD 352(CX), Y13, Y1
    VFMADD231PD 384(CX), Y14, Y0
    VFMADD231PD 416(CX), Y15, Y1
    ADDQ        $0x000001c0, AX
    ADDQ        $0x000001c0, CX
    SUBQ        $0x00000038, DX
    JMP         unrolledLoop14

unrolledLoop8:
    CMPQ        DX, $0x00000020
    JL          unrolledLoop4
    VMOVUPD     (AX), Y2
    VMOVUPD     32(AX), Y3
    VMOVUPD     64(AX), Y4
    VMOVUPD     96(AX), Y5
    VMOVUPD     128(AX), Y6
    VMOVUPD     160(AX), Y7
    VMOVUPD     192(AX), Y8
    VMOVUPD     224(AX), Y9
    VFMADD231PD (CX), Y2, Y0
    VFMADD231PD 32(CX), Y3, Y1
    VFMADD231PD 64(CX), Y4, Y0
    VFMADD231PD 96(CX), Y5, Y1
    VFMADD231PD 128(CX), Y6, Y0
    VFMADD231PD 160(CX), Y7, Y1
    VFMADD231PD 192(CX), Y8, Y0
    VFMADD231PD 224(CX), Y9, Y1
    ADDQ        $0x00000100, AX
    ADDQ        $0x00000100, CX
    SUBQ        $0x00000020, DX
    JMP         unrolledLoop8

unrolledLoop4:
    CMPQ        DX, $0x00000010
    JL          unrolledLoop1
    VMOVUPD     (AX), Y2
    VMOVUPD     32(AX), Y3
    VMOVUPD     64(AX), Y4
    VMOVUPD     96(AX), Y5
    VFMADD231PD (CX), Y2, Y0
    VFMADD231PD 32(CX), Y3, Y1
    VFMADD231PD 64(CX), Y4, Y0
    VFMADD231PD 96(CX), Y5, Y1
    ADDQ        $0x00000080, AX
    ADDQ        $0x00000080, CX
    SUBQ        $0x00000010, DX
    JMP         unrolledLoop4

unrolledLoop1:
    CMPQ        DX, $0x00000004
    JL          tail
    VMOVUPD     (AX), Y2
    VFMADD231PD (CX), Y2, Y0
    ADDQ        $0x00000020, AX
    ADDQ        $0x00000020, CX
    SUBQ        $0x00000004, DX
    JMP         unrolledLoop1

tail:
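    // Fewer than 4 elements remain; X2 accumulates them one scalar at a time.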
    VXORPD X2, X2, X2

tailLoop:
    CMPQ        DX, $0x00000000
    JE          reduce
    VMOVSD      (AX), X3
    VFMADD231SD (CX), X3, X2
    ADDQ        $0x00000008, AX
    ADDQ        $0x00000008, CX
    DECQ        DX
    JMP         tailLoop

reduce:
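    // Add the two accumulators, fold the upper 128 bits onto the lower, add the
    // scalar tail, then a single horizontal add collapses the 2 lanes.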
    VADDPD       Y0, Y1, Y0
    VEXTRACTF128 $0x01, Y0, X1
    VADDPD       X0, X1, X0
    VADDPD       X0, X2, X0
    VHADDPD      X0, X0, X0
    MOVSD        X0, ret+48(FP)
    RET

// func DotProdSSE32(x1 []float32, x2 []float32) float32
// Requires: SSE, SSE3
TEXT ·DotProdSSE32(SB), NOSPLIT, $0-52
    MOVQ  x1_base+0(FP), AX
    MOVQ  x2_base+24(FP), CX
    MOVQ  x1_len+8(FP), DX
    XORPS X0, X0
    XORPS X1, X1
    CMPQ  DX, $0x00000000
    JE    reduce
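    // Legacy SSE arithmetic with a memory operand (MULPS below) needs 16-byte
    // alignment, so compute how many leading elements (up to three, for
    // 4-byte-aligned float32 data) must be handled one at a time before
    // x2 (CX) reaches a 16-byte boundary.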
    MOVQ  CX, BX
    ANDQ  $0x0000000f, BX
    JZ    unrolledLoops
    XORQ  $0x0000000f, BX
    INCQ  BX
    SHRQ  $0x02, BX

alignmentLoop:
    MOVSS (AX), X2
    MULSS (CX), X2
    ADDSS X2, X0
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    DECQ  DX
    JZ    reduce
    DECQ  BX
    JNZ   alignmentLoop

unrolledLoops:
unrolledLoop14:
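    // Main loop: 56 float32 per iteration (14 XMM loads of 4 lanes each); x1 is
    // read with unaligned MOVUPS, x2 directly as an aligned MULPS memory operand.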
    CMPQ   DX, $0x00000038
    JL     unrolledLoop8
    MOVUPS (AX), X2
    MOVUPS 16(AX), X3
    MOVUPS 32(AX), X4
    MOVUPS 48(AX), X5
    MOVUPS 64(AX), X6
    MOVUPS 80(AX), X7
    MOVUPS 96(AX), X8
    MOVUPS 112(AX), X9
    MOVUPS 128(AX), X10
    MOVUPS 144(AX), X11
    MOVUPS 160(AX), X12
    MOVUPS 176(AX), X13
    MOVUPS 192(AX), X14
    MOVUPS 208(AX), X15
    MULPS  (CX), X2
    MULPS  16(CX), X3
    MULPS  32(CX), X4
    MULPS  48(CX), X5
    MULPS  64(CX), X6
    MULPS  80(CX), X7
    MULPS  96(CX), X8
    MULPS  112(CX), X9
    MULPS  128(CX), X10
    MULPS  144(CX), X11
    MULPS  160(CX), X12
    MULPS  176(CX), X13
    MULPS  192(CX), X14
    MULPS  208(CX), X15
    ADDPS  X2, X0
    ADDPS  X3, X1
    ADDPS  X4, X0
    ADDPS  X5, X1
    ADDPS  X6, X0
    ADDPS  X7, X1
    ADDPS  X8, X0
    ADDPS  X9, X1
    ADDPS  X10, X0
    ADDPS  X11, X1
    ADDPS  X12, X0
    ADDPS  X13, X1
    ADDPS  X14, X0
    ADDPS  X15, X1
    ADDQ   $0x000000e0, AX
    ADDQ   $0x000000e0, CX
    SUBQ   $0x00000038, DX
    JMP    unrolledLoop14

unrolledLoop8:
    CMPQ   DX, $0x00000020
    JL     unrolledLoop4
    MOVUPS (AX), X2
    MOVUPS 16(AX), X3
    MOVUPS 32(AX), X4
    MOVUPS 48(AX), X5
    MOVUPS 64(AX), X6
    MOVUPS 80(AX), X7
    MOVUPS 96(AX), X8
    MOVUPS 112(AX), X9
    MULPS  (CX), X2
    MULPS  16(CX), X3
    MULPS  32(CX), X4
    MULPS  48(CX), X5
    MULPS  64(CX), X6
    MULPS  80(CX), X7
    MULPS  96(CX), X8
    MULPS  112(CX), X9
    ADDPS  X2, X0
    ADDPS  X3, X1
    ADDPS  X4, X0
    ADDPS  X5, X1
    ADDPS  X6, X0
    ADDPS  X7, X1
    ADDPS  X8, X0
    ADDPS  X9, X1
    ADDQ   $0x00000080, AX
    ADDQ   $0x00000080, CX
    SUBQ   $0x00000020, DX
    JMP    unrolledLoop8

unrolledLoop4:
    CMPQ   DX, $0x00000010
    JL     unrolledLoop1
    MOVUPS (AX), X2
    MOVUPS 16(AX), X3
    MOVUPS 32(AX), X4
    MOVUPS 48(AX), X5
    MULPS  (CX), X2
    MULPS  16(CX), X3
    MULPS  32(CX), X4
    MULPS  48(CX), X5
    ADDPS  X2, X0
    ADDPS  X3, X1
    ADDPS  X4, X0
    ADDPS  X5, X1
    ADDQ   $0x00000040, AX
    ADDQ   $0x00000040, CX
    SUBQ   $0x00000010, DX
    JMP    unrolledLoop4

unrolledLoop1:
    CMPQ   DX, $0x00000004
    JL     tailLoop
    MOVUPS (AX), X2
    MULPS  (CX), X2
    ADDPS  X2, X0
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    SUBQ   $0x00000004, DX
    JMP    unrolledLoop1

tailLoop:
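    // Fewer than 4 elements remain; finish with scalar math.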
    CMPQ  DX, $0x00000000
    JE    reduce
    MOVSS (AX), X2
    MULSS (CX), X2
    ADDSS X2, X0
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    DECQ  DX
    JMP   tailLoop

reduce:
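    // Combine the two accumulators and horizontally add the 4 lanes into the
    // low element of X0.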
    ADDPS  X1, X0
    HADDPS X0, X0
    HADDPS X0, X0
    MOVSS  X0, ret+48(FP)
    RET

// func DotProdSSE64(x1 []float64, x2 []float64) float64
// Requires: SSE2, SSE3
TEXT ·DotProdSSE64(SB), NOSPLIT, $0-56
    MOVQ  x1_base+0(FP), AX
    MOVQ  x2_base+24(FP), CX
    MOVQ  x1_len+8(FP), DX
    XORPD X0, X0
    XORPD X1, X1
    CMPQ  DX, $0x00000000
    JE    reduce
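    // If x2 (CX) is not 16-byte aligned, a single scalar step aligns it,
    // assuming the float64 data is at least 8-byte aligned.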
    MOVQ  CX, BX
    ANDQ  $0x0000000f, BX
    JZ    unrolledLoops
    MOVSD (AX), X2
    MULSD (CX), X2
    ADDSD X2, X0
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    DECQ  DX

unrolledLoops:
unrolledLoop14:
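    // Main loop: 28 float64 per iteration (14 XMM loads of 2 lanes each).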
    CMPQ   DX, $0x0000001c
    JL     unrolledLoop8
    MOVUPD (AX), X2
    MOVUPD 16(AX), X3
    MOVUPD 32(AX), X4
    MOVUPD 48(AX), X5
    MOVUPD 64(AX), X6
    MOVUPD 80(AX), X7
    MOVUPD 96(AX), X8
    MOVUPD 112(AX), X9
    MOVUPD 128(AX), X10
    MOVUPD 144(AX), X11
    MOVUPD 160(AX), X12
    MOVUPD 176(AX), X13
    MOVUPD 192(AX), X14
    MOVUPD 208(AX), X15
    MULPD  (CX), X2
    MULPD  16(CX), X3
    MULPD  32(CX), X4
    MULPD  48(CX), X5
    MULPD  64(CX), X6
    MULPD  80(CX), X7
    MULPD  96(CX), X8
    MULPD  112(CX), X9
    MULPD  128(CX), X10
    MULPD  144(CX), X11
    MULPD  160(CX), X12
    MULPD  176(CX), X13
    MULPD  192(CX), X14
    MULPD  208(CX), X15
    ADDPD  X2, X0
    ADDPD  X3, X1
    ADDPD  X4, X0
    ADDPD  X5, X1
    ADDPD  X6, X0
    ADDPD  X7, X1
    ADDPD  X8, X0
    ADDPD  X9, X1
    ADDPD  X10, X0
    ADDPD  X11, X1
    ADDPD  X12, X0
    ADDPD  X13, X1
    ADDPD  X14, X0
    ADDPD  X15, X1
    ADDQ   $0x000000e0, AX
    ADDQ   $0x000000e0, CX
    SUBQ   $0x0000001c, DX
    JMP    unrolledLoop14

unrolledLoop8:
    CMPQ   DX, $0x00000010
    JL     unrolledLoop4
    MOVUPD (AX), X2
    MOVUPD 16(AX), X3
    MOVUPD 32(AX), X4
    MOVUPD 48(AX), X5
    MOVUPD 64(AX), X6
    MOVUPD 80(AX), X7
    MOVUPD 96(AX), X8
    MOVUPD 112(AX), X9
    MULPD  (CX), X2
    MULPD  16(CX), X3
    MULPD  32(CX), X4
    MULPD  48(CX), X5
    MULPD  64(CX), X6
    MULPD  80(CX), X7
    MULPD  96(CX), X8
    MULPD  112(CX), X9
    ADDPD  X2, X0
    ADDPD  X3, X1
    ADDPD  X4, X0
    ADDPD  X5, X1
    ADDPD  X6, X0
    ADDPD  X7, X1
    ADDPD  X8, X0
    ADDPD  X9, X1
    ADDQ   $0x00000080, AX
    ADDQ   $0x00000080, CX
    SUBQ   $0x00000010, DX
    JMP    unrolledLoop8

unrolledLoop4:
    CMPQ   DX, $0x00000008
    JL     unrolledLoop1
    MOVUPD (AX), X2
    MOVUPD 16(AX), X3
    MOVUPD 32(AX), X4
    MOVUPD 48(AX), X5
    MULPD  (CX), X2
    MULPD  16(CX), X3
    MULPD  32(CX), X4
    MULPD  48(CX), X5
    ADDPD  X2, X0
    ADDPD  X3, X1
    ADDPD  X4, X0
    ADDPD  X5, X1
    ADDQ   $0x00000040, AX
    ADDQ   $0x00000040, CX
    SUBQ   $0x00000008, DX
    JMP    unrolledLoop4

unrolledLoop1:
    CMPQ   DX, $0x00000002
    JL     tailLoop
    MOVUPD (AX), X2
    MULPD  (CX), X2
    ADDPD  X2, X0
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    SUBQ   $0x00000002, DX
    JMP    unrolledLoop1

tailLoop:
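    // At most one element remains; finish with scalar math.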
    CMPQ  DX, $0x00000000
    JE    reduce
    MOVSD (AX), X2
    MULSD (CX), X2
    ADDSD X2, X0
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    DECQ  DX
    JMP   tailLoop

reduce:
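    // Combine the two accumulators; HADDPD folds the 2 lanes into the low element.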
    ADDPD  X1, X0
    HADDPD X0, X0
    MOVSD  X0, ret+48(FP)
    RET