// Code generated by command: go run div_asm.go -out ../../matfuncs/div_amd64.s -stubs ../../matfuncs/div_amd64_stubs.go -pkg matfuncs. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// func DivAVX32(x1 []float32, x2 []float32, y []float32)
// Requires: AVX, SSE
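//
// Divides x1 by x2 element-wise and stores the result in y (y[i] = x1[i] / x2[i]
// for x1_len elements). unrolledLoop below handles 128 float32 values per
// iteration using all 16 YMM registers, singleRegisterLoop handles 8 at a time,
// and tailLoop finishes the remainder with scalar SSE instructions.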
TEXT ·DivAVX32(SB), NOSPLIT, $0-72
    MOVQ x1_base+0(FP), AX
    MOVQ x2_base+24(FP), CX
    MOVQ y_base+48(FP), DX
    MOVQ x1_len+8(FP), BX

unrolledLoop:
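    // While at least 128 elements (0x80) remain, load, divide, and store
    // 512 bytes (16 YMM registers) per slice in a single iteration.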
    CMPQ    BX, $0x00000080
    JL      singleRegisterLoop
    VMOVUPS (AX), Y0
    VMOVUPS 32(AX), Y1
    VMOVUPS 64(AX), Y2
    VMOVUPS 96(AX), Y3
    VMOVUPS 128(AX), Y4
    VMOVUPS 160(AX), Y5
    VMOVUPS 192(AX), Y6
    VMOVUPS 224(AX), Y7
    VMOVUPS 256(AX), Y8
    VMOVUPS 288(AX), Y9
    VMOVUPS 320(AX), Y10
    VMOVUPS 352(AX), Y11
    VMOVUPS 384(AX), Y12
    VMOVUPS 416(AX), Y13
    VMOVUPS 448(AX), Y14
    VMOVUPS 480(AX), Y15
    VDIVPS  (CX), Y0, Y0
    VDIVPS  32(CX), Y1, Y1
    VDIVPS  64(CX), Y2, Y2
    VDIVPS  96(CX), Y3, Y3
    VDIVPS  128(CX), Y4, Y4
    VDIVPS  160(CX), Y5, Y5
    VDIVPS  192(CX), Y6, Y6
    VDIVPS  224(CX), Y7, Y7
    VDIVPS  256(CX), Y8, Y8
    VDIVPS  288(CX), Y9, Y9
    VDIVPS  320(CX), Y10, Y10
    VDIVPS  352(CX), Y11, Y11
    VDIVPS  384(CX), Y12, Y12
    VDIVPS  416(CX), Y13, Y13
    VDIVPS  448(CX), Y14, Y14
    VDIVPS  480(CX), Y15, Y15
    VMOVUPS Y0, (DX)
    VMOVUPS Y1, 32(DX)
    VMOVUPS Y2, 64(DX)
    VMOVUPS Y3, 96(DX)
    VMOVUPS Y4, 128(DX)
    VMOVUPS Y5, 160(DX)
    VMOVUPS Y6, 192(DX)
    VMOVUPS Y7, 224(DX)
    VMOVUPS Y8, 256(DX)
    VMOVUPS Y9, 288(DX)
    VMOVUPS Y10, 320(DX)
    VMOVUPS Y11, 352(DX)
    VMOVUPS Y12, 384(DX)
    VMOVUPS Y13, 416(DX)
    VMOVUPS Y14, 448(DX)
    VMOVUPS Y15, 480(DX)
    ADDQ    $0x00000200, AX
    ADDQ    $0x00000200, CX
    ADDQ    $0x00000200, DX
    SUBQ    $0x00000080, BX
    JMP     unrolledLoop

singleRegisterLoop:
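    // While at least 8 elements remain, process one YMM register (32 bytes) per iteration.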
    CMPQ    BX, $0x00000008
    JL      tailLoop
    VMOVUPS (AX), Y0
    VDIVPS  (CX), Y0, Y0
    VMOVUPS Y0, (DX)
    ADDQ    $0x00000020, AX
    ADDQ    $0x00000020, CX
    ADDQ    $0x00000020, DX
    SUBQ    $0x00000008, BX
    JMP     singleRegisterLoop

tailLoop:
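    // Process any remaining elements one float32 at a time.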
    CMPQ  BX, $0x00000000
    JE    end
    MOVSS (AX), X0
    DIVSS (CX), X0
    MOVSS X0, (DX)
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    ADDQ  $0x00000004, DX
    DECQ  BX
    JMP   tailLoop

end:
    RET

// func DivAVX64(x1 []float64, x2 []float64, y []float64)
// Requires: AVX, SSE2
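//
// Divides x1 by x2 element-wise and stores the result in y. unrolledLoop
// below handles 64 float64 values per iteration using all 16 YMM registers,
// singleRegisterLoop handles 4 at a time, and tailLoop finishes the
// remainder with scalar SSE2 instructions.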
TEXT ·DivAVX64(SB), NOSPLIT, $0-72
    MOVQ x1_base+0(FP), AX
    MOVQ x2_base+24(FP), CX
    MOVQ y_base+48(FP), DX
    MOVQ x1_len+8(FP), BX

unrolledLoop:
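    // While at least 64 elements (0x40) remain, load, divide, and store
    // 512 bytes (16 YMM registers) per slice in a single iteration.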
    CMPQ    BX, $0x00000040
    JL      singleRegisterLoop
    VMOVUPD (AX), Y0
    VMOVUPD 32(AX), Y1
    VMOVUPD 64(AX), Y2
    VMOVUPD 96(AX), Y3
    VMOVUPD 128(AX), Y4
    VMOVUPD 160(AX), Y5
    VMOVUPD 192(AX), Y6
    VMOVUPD 224(AX), Y7
    VMOVUPD 256(AX), Y8
    VMOVUPD 288(AX), Y9
    VMOVUPD 320(AX), Y10
    VMOVUPD 352(AX), Y11
    VMOVUPD 384(AX), Y12
    VMOVUPD 416(AX), Y13
    VMOVUPD 448(AX), Y14
    VMOVUPD 480(AX), Y15
    VDIVPD  (CX), Y0, Y0
    VDIVPD  32(CX), Y1, Y1
    VDIVPD  64(CX), Y2, Y2
    VDIVPD  96(CX), Y3, Y3
    VDIVPD  128(CX), Y4, Y4
    VDIVPD  160(CX), Y5, Y5
    VDIVPD  192(CX), Y6, Y6
    VDIVPD  224(CX), Y7, Y7
    VDIVPD  256(CX), Y8, Y8
    VDIVPD  288(CX), Y9, Y9
    VDIVPD  320(CX), Y10, Y10
    VDIVPD  352(CX), Y11, Y11
    VDIVPD  384(CX), Y12, Y12
    VDIVPD  416(CX), Y13, Y13
    VDIVPD  448(CX), Y14, Y14
    VDIVPD  480(CX), Y15, Y15
    VMOVUPD Y0, (DX)
    VMOVUPD Y1, 32(DX)
    VMOVUPD Y2, 64(DX)
    VMOVUPD Y3, 96(DX)
    VMOVUPD Y4, 128(DX)
    VMOVUPD Y5, 160(DX)
    VMOVUPD Y6, 192(DX)
    VMOVUPD Y7, 224(DX)
    VMOVUPD Y8, 256(DX)
    VMOVUPD Y9, 288(DX)
    VMOVUPD Y10, 320(DX)
    VMOVUPD Y11, 352(DX)
    VMOVUPD Y12, 384(DX)
    VMOVUPD Y13, 416(DX)
    VMOVUPD Y14, 448(DX)
    VMOVUPD Y15, 480(DX)
    ADDQ    $0x00000200, AX
    ADDQ    $0x00000200, CX
    ADDQ    $0x00000200, DX
    SUBQ    $0x00000040, BX
    JMP     unrolledLoop

singleRegisterLoop:
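    // While at least 4 elements remain, process one YMM register (32 bytes) per iteration.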
    CMPQ    BX, $0x00000004
    JL      tailLoop
    VMOVUPD (AX), Y0
    VDIVPD  (CX), Y0, Y0
    VMOVUPD Y0, (DX)
    ADDQ    $0x00000020, AX
    ADDQ    $0x00000020, CX
    ADDQ    $0x00000020, DX
    SUBQ    $0x00000004, BX
    JMP     singleRegisterLoop

tailLoop:
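    // Process any remaining elements one float64 at a time.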
    CMPQ  BX, $0x00000000
    JE    end
    MOVSD (AX), X0
    DIVSD (CX), X0
    MOVSD X0, (DX)
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    ADDQ  $0x00000008, DX
    DECQ  BX
    JMP   tailLoop

end:
    RET

// func DivSSE32(x1 []float32, x2 []float32, y []float32)
// Requires: SSE
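//
// Divides x1 by x2 element-wise and stores the result in y. A scalar
// alignmentLoop first handles leading elements until x2 reaches a 16-byte
// boundary, since the packed DIVPS instructions below read their memory
// operand directly from x2. unrolledLoop then handles 64 float32 values per
// iteration using all 16 XMM registers, singleRegisterLoop handles 4 at a
// time, and tailLoop finishes the remainder with scalar instructions.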
TEXT ·DivSSE32(SB), NOSPLIT, $0-72
    MOVQ x1_base+0(FP), AX
    MOVQ x2_base+24(FP), CX
    MOVQ y_base+48(FP), DX
    MOVQ x1_len+8(FP), BX
    CMPQ BX, $0x00000000
    JE   end
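    // If x2 (CX) is not 16-byte aligned, compute in SI the number of leading
    // elements to divide with scalar instructions: (16 - CX mod 16) / 4,
    // assuming the float32 data is at least 4-byte aligned.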
    MOVQ CX, SI
    ANDQ $0x0000000f, SI
    JZ   unrolledLoop
    XORQ $0x0000000f, SI
    INCQ SI
    SHRQ $0x02, SI

alignmentLoop:
    MOVSS (AX), X0
    DIVSS (CX), X0
    MOVSS X0, (DX)
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    ADDQ  $0x00000004, DX
    DECQ  BX
    JZ    end
    DECQ  SI
    JNZ   alignmentLoop

unrolledLoop:
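    // While at least 64 elements (0x40) remain, load, divide, and store
    // 256 bytes (16 XMM registers) per slice in a single iteration.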
    CMPQ   BX, $0x00000040
    JL     singleRegisterLoop
    MOVUPS (AX), X0
    MOVUPS 16(AX), X1
    MOVUPS 32(AX), X2
    MOVUPS 48(AX), X3
    MOVUPS 64(AX), X4
    MOVUPS 80(AX), X5
    MOVUPS 96(AX), X6
    MOVUPS 112(AX), X7
    MOVUPS 128(AX), X8
    MOVUPS 144(AX), X9
    MOVUPS 160(AX), X10
    MOVUPS 176(AX), X11
    MOVUPS 192(AX), X12
    MOVUPS 208(AX), X13
    MOVUPS 224(AX), X14
    MOVUPS 240(AX), X15
    DIVPS  (CX), X0
    DIVPS  16(CX), X1
    DIVPS  32(CX), X2
    DIVPS  48(CX), X3
    DIVPS  64(CX), X4
    DIVPS  80(CX), X5
    DIVPS  96(CX), X6
    DIVPS  112(CX), X7
    DIVPS  128(CX), X8
    DIVPS  144(CX), X9
    DIVPS  160(CX), X10
    DIVPS  176(CX), X11
    DIVPS  192(CX), X12
    DIVPS  208(CX), X13
    DIVPS  224(CX), X14
    DIVPS  240(CX), X15
    MOVUPS X0, (DX)
    MOVUPS X1, 16(DX)
    MOVUPS X2, 32(DX)
    MOVUPS X3, 48(DX)
    MOVUPS X4, 64(DX)
    MOVUPS X5, 80(DX)
    MOVUPS X6, 96(DX)
    MOVUPS X7, 112(DX)
    MOVUPS X8, 128(DX)
    MOVUPS X9, 144(DX)
    MOVUPS X10, 160(DX)
    MOVUPS X11, 176(DX)
    MOVUPS X12, 192(DX)
    MOVUPS X13, 208(DX)
    MOVUPS X14, 224(DX)
    MOVUPS X15, 240(DX)
    ADDQ   $0x00000100, AX
    ADDQ   $0x00000100, CX
    ADDQ   $0x00000100, DX
    SUBQ   $0x00000040, BX
    JMP    unrolledLoop

singleRegisterLoop:
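    // While at least 4 elements remain, process one XMM register (16 bytes) per iteration.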
    CMPQ   BX, $0x00000004
    JL     tailLoop
    MOVUPS (AX), X0
    DIVPS  (CX), X0
    MOVUPS X0, (DX)
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    ADDQ   $0x00000010, DX
    SUBQ   $0x00000004, BX
    JMP    singleRegisterLoop

tailLoop:
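    // Process any remaining elements one float32 at a time.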
    CMPQ  BX, $0x00000000
    JE    end
    MOVSS (AX), X0
    DIVSS (CX), X0
    MOVSS X0, (DX)
    ADDQ  $0x00000004, AX
    ADDQ  $0x00000004, CX
    ADDQ  $0x00000004, DX
    DECQ  BX
    JMP   tailLoop

end:
    RET

// func DivSSE64(x1 []float64, x2 []float64, y []float64)
// Requires: SSE2
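//
// Divides x1 by x2 element-wise and stores the result in y. If x2 is not
// 16-byte aligned, a single scalar division first brings it to the boundary
// required by the packed DIVPD memory operands. unrolledLoop then handles
// 32 float64 values per iteration using all 16 XMM registers,
// singleRegisterLoop handles 2 at a time, and tailLoop finishes the
// remainder with scalar instructions.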
TEXT ·DivSSE64(SB), NOSPLIT, $0-72
    MOVQ  x1_base+0(FP), AX
    MOVQ  x2_base+24(FP), CX
    MOVQ  y_base+48(FP), DX
    MOVQ  x1_len+8(FP), BX
    CMPQ  BX, $0x00000000
    JE    end
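    // If x2 (CX) is not on a 16-byte boundary, divide one leading element with
    // scalar instructions; for 8-byte-aligned float64 data, one element is
    // enough to reach alignment.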
    MOVQ  CX, SI
    ANDQ  $0x0000000f, SI
    JZ    unrolledLoop
    MOVSD (AX), X0
    DIVSD (CX), X0
    MOVSD X0, (DX)
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    ADDQ  $0x00000008, DX
    DECQ  BX

unrolledLoop:
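    // While at least 32 elements (0x20) remain, load, divide, and store
    // 256 bytes (16 XMM registers) per slice in a single iteration.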
    CMPQ   BX, $0x00000020
    JL     singleRegisterLoop
    MOVUPD (AX), X0
    MOVUPD 16(AX), X1
    MOVUPD 32(AX), X2
    MOVUPD 48(AX), X3
    MOVUPD 64(AX), X4
    MOVUPD 80(AX), X5
    MOVUPD 96(AX), X6
    MOVUPD 112(AX), X7
    MOVUPD 128(AX), X8
    MOVUPD 144(AX), X9
    MOVUPD 160(AX), X10
    MOVUPD 176(AX), X11
    MOVUPD 192(AX), X12
    MOVUPD 208(AX), X13
    MOVUPD 224(AX), X14
    MOVUPD 240(AX), X15
    DIVPD  (CX), X0
    DIVPD  16(CX), X1
    DIVPD  32(CX), X2
    DIVPD  48(CX), X3
    DIVPD  64(CX), X4
    DIVPD  80(CX), X5
    DIVPD  96(CX), X6
    DIVPD  112(CX), X7
    DIVPD  128(CX), X8
    DIVPD  144(CX), X9
    DIVPD  160(CX), X10
    DIVPD  176(CX), X11
    DIVPD  192(CX), X12
    DIVPD  208(CX), X13
    DIVPD  224(CX), X14
    DIVPD  240(CX), X15
    MOVUPD X0, (DX)
    MOVUPD X1, 16(DX)
    MOVUPD X2, 32(DX)
    MOVUPD X3, 48(DX)
    MOVUPD X4, 64(DX)
    MOVUPD X5, 80(DX)
    MOVUPD X6, 96(DX)
    MOVUPD X7, 112(DX)
    MOVUPD X8, 128(DX)
    MOVUPD X9, 144(DX)
    MOVUPD X10, 160(DX)
    MOVUPD X11, 176(DX)
    MOVUPD X12, 192(DX)
    MOVUPD X13, 208(DX)
    MOVUPD X14, 224(DX)
    MOVUPD X15, 240(DX)
    ADDQ   $0x00000100, AX
    ADDQ   $0x00000100, CX
    ADDQ   $0x00000100, DX
    SUBQ   $0x00000020, BX
    JMP    unrolledLoop

singleRegisterLoop:
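    // While at least 2 elements remain, process one XMM register (16 bytes) per iteration.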
    CMPQ   BX, $0x00000002
    JL     tailLoop
    MOVUPD (AX), X0
    DIVPD  (CX), X0
    MOVUPD X0, (DX)
    ADDQ   $0x00000010, AX
    ADDQ   $0x00000010, CX
    ADDQ   $0x00000010, DX
    SUBQ   $0x00000002, BX
    JMP    singleRegisterLoop

tailLoop:
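    // Process any remaining elements one float64 at a time.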
    CMPQ  BX, $0x00000000
    JE    end
    MOVSD (AX), X0
    DIVSD (CX), X0
    MOVSD X0, (DX)
    ADDQ  $0x00000008, AX
    ADDQ  $0x00000008, CX
    ADDQ  $0x00000008, DX
    DECQ  BX
    JMP   tailLoop

end:
    RET