// Source: nlpodyssey/spago (GitHub), mat/internal/matfuncs/exp_amd64.s

// Code generated by command: go run exp_asm.go -out ../../matfuncs/exp_amd64.s -stubs ../../matfuncs/exp_amd64_stubs.go -pkg matfuncs. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// Scalar float32 constants (stored as IEEE-754 bit patterns) used by ExpAVX32.
// Each symbol is 4 bytes; ExpAVX32 replicates it into every lane of a YMM
// register with VBROADCASTSS / VPBROADCASTD at the point of use.

// LCPI0_0: ~88.37626, upper bound the input is clamped to (VMINPS).
DATA LCPI0_0<>+0(SB)/4, $0x42b0c0a5
GLOBL LCPI0_0<>(SB), RODATA|NOPTR, $4

// LCPI0_1: ~-88.37626, lower bound the input is clamped to (VMAXPS).
DATA LCPI0_1<>+0(SB)/4, $0xc2b0c0a5
GLOBL LCPI0_1<>(SB), RODATA|NOPTR, $4

// LCPI0_2: ~1.442695 (log2 e), multiplied with the clamped input.
DATA LCPI0_2<>+0(SB)/4, $0x3fb8aa3b
GLOBL LCPI0_2<>(SB), RODATA|NOPTR, $4

// LCPI0_3: 0.5, the rounding offset and also the last polynomial term.
DATA LCPI0_3<>+0(SB)/4, $0x3f000000
GLOBL LCPI0_3<>(SB), RODATA|NOPTR, $4

// LCPI0_4: 1.0 as a float; the same bits (0x3f800000 == 127<<23) are also
// added as an integer to build the exponent field of 2^n.
DATA LCPI0_4<>+0(SB)/4, $0x3f800000
GLOBL LCPI0_4<>(SB), RODATA|NOPTR, $4

// LCPI0_5: -0.693359375, first multiplier of n in the range reduction.
DATA LCPI0_5<>+0(SB)/4, $0xbf318000
GLOBL LCPI0_5<>(SB), RODATA|NOPTR, $4

// LCPI0_6: ~2.1219444e-4, second multiplier of n in the range reduction
// (LCPI0_5 + LCPI0_6 is ~-0.6931472, i.e. about -ln 2).
DATA LCPI0_6<>+0(SB)/4, $0x395e8083
GLOBL LCPI0_6<>(SB), RODATA|NOPTR, $4

// LCPI0_7 .. LCPI0_11: polynomial coefficients, applied in this order by a
// Horner-style multiply/add chain in ExpAVX32.
// LCPI0_7: ~1.98757e-4
DATA LCPI0_7<>+0(SB)/4, $0x39506967
GLOBL LCPI0_7<>(SB), RODATA|NOPTR, $4

// LCPI0_8: ~1.39820e-3
DATA LCPI0_8<>+0(SB)/4, $0x3ab743ce
GLOBL LCPI0_8<>(SB), RODATA|NOPTR, $4

// LCPI0_9: ~8.33345e-3
DATA LCPI0_9<>+0(SB)/4, $0x3c088908
GLOBL LCPI0_9<>(SB), RODATA|NOPTR, $4

// LCPI0_10: ~4.16658e-2
DATA LCPI0_10<>+0(SB)/4, $0x3d2aa9c1
GLOBL LCPI0_10<>(SB), RODATA|NOPTR, $4

// LCPI0_11: ~1.66667e-1
DATA LCPI0_11<>+0(SB)/4, $0x3e2aaaaa
GLOBL LCPI0_11<>(SB), RODATA|NOPTR, $4

// func ExpAVX32(x []float32, y []float32)
// Requires: AVX, AVX2
//
// ExpAVX32 writes an approximation of e^x[i] to y[i] for i = 0..7, i.e. for
// exactly one 256-bit vector of eight float32 lanes. Only x_base and y_base
// are read from the argument frame: there is no loop and no length check, so
// 32 bytes are loaded from x and 32 bytes are stored to y unconditionally.
// Both slices must therefore hold at least eight elements; elements past the
// first eight are never touched.
//
// Per lane the code evaluates:
//   x = min(x, LCPI0_0); x = max(x, LCPI0_1)
//   n = floor(x*LCPI0_2 + 0.5)
//   r = x + n*LCPI0_5 + n*LCPI0_6
//   p = ((((LCPI0_7*r + LCPI0_8)*r + LCPI0_9)*r + LCPI0_10)*r + LCPI0_11)*r + 0.5
//   y = (r + r*r*p + 1.0) * 2^n
// where 2^n is built by shifting int32(n) into the float exponent field.
//
// Frame: $0-48 (no locals; two 24-byte slice headers as arguments).
// Clobbers: AX, CX, Y0-Y6. VZEROUPPER is issued before returning so that the
// legacy-SSE code executed by the caller does not run with dirty upper YMM
// halves.
TEXT ·ExpAVX32(SB), NOSPLIT, $0-48
    MOVQ         x_base+0(FP), AX
    MOVQ         y_base+24(FP), CX

    // Y0 = x[0:8], clamped to [LCPI0_1, LCPI0_0].
    VMOVUPS      (AX), Y0
    VBROADCASTSS LCPI0_0<>+0(SB), Y1
    VMINPS       Y1, Y0, Y0
    VBROADCASTSS LCPI0_1<>+0(SB), Y1
    VMAXPS       Y1, Y0, Y0

    // Y1 = x*LCPI0_2 + 0.5; Y2 keeps 0.5 for the polynomial tail.
    VBROADCASTSS LCPI0_2<>+0(SB), Y1
    VMULPS       Y1, Y0, Y1
    VBROADCASTSS LCPI0_3<>+0(SB), Y2
    VADDPS       Y2, Y1, Y1

    // Y3 = Y1 rounded toward -inf (rounding mode 1). The compare (predicate
    // 1 = less-than) yields an all-ones lane where Y1 < Y3; ANDing with 1.0
    // (Y4) and subtracting takes 1.0 off those lanes. Y1 = n afterwards.
    VROUNDPS     $0x01, Y1, Y3
    VCMPPS       $0x01, Y3, Y1, Y1
    VBROADCASTSS LCPI0_4<>+0(SB), Y4
    VANDPS       Y4, Y1, Y1
    VSUBPS       Y1, Y3, Y1

    // Range reduction: Y0 = r = x + n*LCPI0_5 + n*LCPI0_6.
    VBROADCASTSS LCPI0_5<>+0(SB), Y3
    VMULPS       Y3, Y1, Y3
    VBROADCASTSS LCPI0_6<>+0(SB), Y5
    VMULPS       Y5, Y1, Y5
    VADDPS       Y3, Y0, Y0
    VADDPS       Y5, Y0, Y0

    // Y3 = r*r.
    VMULPS       Y0, Y0, Y3

    // Horner chain in Y5:
    // ((((LCPI0_7*r + LCPI0_8)*r + LCPI0_9)*r + LCPI0_10)*r + LCPI0_11)*r.
    VBROADCASTSS LCPI0_7<>+0(SB), Y5
    VMULPS       Y5, Y0, Y5
    VBROADCASTSS LCPI0_8<>+0(SB), Y6
    VADDPS       Y6, Y5, Y5
    VMULPS       Y5, Y0, Y5
    VBROADCASTSS LCPI0_9<>+0(SB), Y6
    VADDPS       Y6, Y5, Y5
    VMULPS       Y5, Y0, Y5
    VBROADCASTSS LCPI0_10<>+0(SB), Y6
    VADDPS       Y6, Y5, Y5
    VMULPS       Y5, Y0, Y5
    VBROADCASTSS LCPI0_11<>+0(SB), Y6
    VADDPS       Y6, Y5, Y5
    VMULPS       Y5, Y0, Y5

    // Y0 = r + r*r*(Y5 + 0.5) + 1.0.
    VADDPS       Y2, Y5, Y2
    VMULPS       Y2, Y3, Y2
    VADDPS       Y2, Y0, Y0
    VADDPS       Y4, Y0, Y0

    // Y1 = 2^n as float bits: int32(n) << 23, plus 0x3f800000 (LCPI0_4).
    VCVTTPS2DQ   Y1, Y1
    VPSLLD       $0x17, Y1, Y1
    VPBROADCASTD LCPI0_4<>+0(SB), Y2
    VPADDD       Y2, Y1, Y1

    // y[0:8] = Y0 * 2^n.
    VMULPS       Y1, Y0, Y0
    VMOVUPS      Y0, (CX)

    // Clear the upper 128 bits of all YMM registers before handing control
    // back to code that uses legacy-encoded SSE instructions.
    VZEROUPPER
    RET

// 16-byte float32 constants used by ExpSSE32. Every symbol holds the same
// 32-bit IEEE-754 bit pattern in all four lanes, so it can be used directly as
// a 128-bit operand (MOVAPS / MINPS / MAXPS / ADDPS / PADDD) without a
// broadcast.
// NOTE(review): ExpSSE32 reads these with MOVAPS and as legacy-SSE memory
// operands, which require a 16-byte-aligned address; nothing in this file
// requests that alignment explicitly, so it relies on how the linker places
// 16-byte symbols - confirm.

// SSE_LCPI0_0: ~88.37626 x4, upper bound the input is clamped to (MINPS).
DATA SSE_LCPI0_0<>+0(SB)/4, $0x42b0c0a5
DATA SSE_LCPI0_0<>+4(SB)/4, $0x42b0c0a5
DATA SSE_LCPI0_0<>+8(SB)/4, $0x42b0c0a5
DATA SSE_LCPI0_0<>+12(SB)/4, $0x42b0c0a5
GLOBL SSE_LCPI0_0<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_1: ~-88.37626 x4, lower bound the input is clamped to (MAXPS).
DATA SSE_LCPI0_1<>+0(SB)/4, $0xc2b0c0a5
DATA SSE_LCPI0_1<>+4(SB)/4, $0xc2b0c0a5
DATA SSE_LCPI0_1<>+8(SB)/4, $0xc2b0c0a5
DATA SSE_LCPI0_1<>+12(SB)/4, $0xc2b0c0a5
GLOBL SSE_LCPI0_1<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_2: ~1.442695 (log2 e) x4, multiplied with the clamped input.
DATA SSE_LCPI0_2<>+0(SB)/4, $0x3fb8aa3b
DATA SSE_LCPI0_2<>+4(SB)/4, $0x3fb8aa3b
DATA SSE_LCPI0_2<>+8(SB)/4, $0x3fb8aa3b
DATA SSE_LCPI0_2<>+12(SB)/4, $0x3fb8aa3b
GLOBL SSE_LCPI0_2<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_3: 0.5 x4, the rounding offset and also the last polynomial term.
DATA SSE_LCPI0_3<>+0(SB)/4, $0x3f000000
DATA SSE_LCPI0_3<>+4(SB)/4, $0x3f000000
DATA SSE_LCPI0_3<>+8(SB)/4, $0x3f000000
DATA SSE_LCPI0_3<>+12(SB)/4, $0x3f000000
GLOBL SSE_LCPI0_3<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_4: 1.0 x4 as floats; the same bits (0x3f800000 == 127<<23) are
// also added as integers to build the exponent field of 2^n.
DATA SSE_LCPI0_4<>+0(SB)/4, $0x3f800000
DATA SSE_LCPI0_4<>+4(SB)/4, $0x3f800000
DATA SSE_LCPI0_4<>+8(SB)/4, $0x3f800000
DATA SSE_LCPI0_4<>+12(SB)/4, $0x3f800000
GLOBL SSE_LCPI0_4<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_5: -0.693359375 x4, first multiplier of n in the range reduction.
DATA SSE_LCPI0_5<>+0(SB)/4, $0xbf318000
DATA SSE_LCPI0_5<>+4(SB)/4, $0xbf318000
DATA SSE_LCPI0_5<>+8(SB)/4, $0xbf318000
DATA SSE_LCPI0_5<>+12(SB)/4, $0xbf318000
GLOBL SSE_LCPI0_5<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_6: ~2.1219444e-4 x4, second multiplier of n in the range
// reduction (SSE_LCPI0_5 + SSE_LCPI0_6 is ~-0.6931472, i.e. about -ln 2).
DATA SSE_LCPI0_6<>+0(SB)/4, $0x395e8083
DATA SSE_LCPI0_6<>+4(SB)/4, $0x395e8083
DATA SSE_LCPI0_6<>+8(SB)/4, $0x395e8083
DATA SSE_LCPI0_6<>+12(SB)/4, $0x395e8083
GLOBL SSE_LCPI0_6<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_7 .. SSE_LCPI0_11: polynomial coefficients, applied in this order
// by a Horner-style multiply/add chain in ExpSSE32.
// SSE_LCPI0_7: ~1.98757e-4 x4
DATA SSE_LCPI0_7<>+0(SB)/4, $0x39506967
DATA SSE_LCPI0_7<>+4(SB)/4, $0x39506967
DATA SSE_LCPI0_7<>+8(SB)/4, $0x39506967
DATA SSE_LCPI0_7<>+12(SB)/4, $0x39506967
GLOBL SSE_LCPI0_7<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_8: ~1.39820e-3 x4
DATA SSE_LCPI0_8<>+0(SB)/4, $0x3ab743ce
DATA SSE_LCPI0_8<>+4(SB)/4, $0x3ab743ce
DATA SSE_LCPI0_8<>+8(SB)/4, $0x3ab743ce
DATA SSE_LCPI0_8<>+12(SB)/4, $0x3ab743ce
GLOBL SSE_LCPI0_8<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_9: ~8.33345e-3 x4
DATA SSE_LCPI0_9<>+0(SB)/4, $0x3c088908
DATA SSE_LCPI0_9<>+4(SB)/4, $0x3c088908
DATA SSE_LCPI0_9<>+8(SB)/4, $0x3c088908
DATA SSE_LCPI0_9<>+12(SB)/4, $0x3c088908
GLOBL SSE_LCPI0_9<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_10: ~4.16658e-2 x4
DATA SSE_LCPI0_10<>+0(SB)/4, $0x3d2aa9c1
DATA SSE_LCPI0_10<>+4(SB)/4, $0x3d2aa9c1
DATA SSE_LCPI0_10<>+8(SB)/4, $0x3d2aa9c1
DATA SSE_LCPI0_10<>+12(SB)/4, $0x3d2aa9c1
GLOBL SSE_LCPI0_10<>(SB), RODATA|NOPTR, $16

// SSE_LCPI0_11: ~1.66667e-1 x4
DATA SSE_LCPI0_11<>+0(SB)/4, $0x3e2aaaaa
DATA SSE_LCPI0_11<>+4(SB)/4, $0x3e2aaaaa
DATA SSE_LCPI0_11<>+8(SB)/4, $0x3e2aaaaa
DATA SSE_LCPI0_11<>+12(SB)/4, $0x3e2aaaaa
GLOBL SSE_LCPI0_11<>(SB), RODATA|NOPTR, $16

// func ExpSSE32(x []float32, y []float32)
// Requires: SSE, SSE2
//
// ExpSSE32 writes an approximation of e^x[i] to y[i] for i = 0..3, i.e. for
// exactly one 128-bit vector of four float32 lanes. Only x_base and y_base are
// read from the argument frame: there is no loop and no length check, so 16
// bytes are loaded from x and 16 bytes are stored to y unconditionally. Both
// slices must therefore hold at least four elements; elements past the first
// four are never touched.
//
// Per lane the code evaluates:
//   x = min(x, SSE_LCPI0_0); x = max(x, SSE_LCPI0_1)
//   n = floor(x*SSE_LCPI0_2 + 0.5)
//   r = x + n*SSE_LCPI0_5 + n*SSE_LCPI0_6
//   p = ((((C7*r + C8)*r + C9)*r + C10)*r + C11)*r + 0.5   (Ck = SSE_LCPI0_k)
//   y = (r + r*r*p + 1.0) * 2^n
// where 2^n is built by shifting int32(n) into the float exponent field.
//
// Frame: $0-48 (no locals; two 24-byte slice headers as arguments).
// Clobbers: AX, CX, X0-X5.
TEXT ·ExpSSE32(SB), NOSPLIT, $0-48
    MOVQ      x_base+0(FP), AX
    MOVQ      y_base+24(FP), CX
    // X0 = x[0:4], clamped to [SSE_LCPI0_1, SSE_LCPI0_0].
    MOVUPS    (AX), X0
    MINPS     SSE_LCPI0_0<>+0(SB), X0
    MAXPS     SSE_LCPI0_1<>+0(SB), X0
    // X4 = x*SSE_LCPI0_2 + 0.5; X2 keeps 0.5 for the polynomial tail.
    MOVAPS    SSE_LCPI0_2<>+0(SB), X4
    MULPS     X0, X4
    MOVAPS    SSE_LCPI0_3<>+0(SB), X2
    ADDPS     X2, X4
    // X1 = X4 truncated toward zero (float -> int32 -> float round trip).
    CVTTPS2PL X4, X1
    CVTPL2PS  X1, X1
    // Turn truncation into floor: predicate 1 (less-than) sets a lane of X4 to
    // all-ones where X4 < X1, i.e. where truncation rounded up; ANDing with
    // 1.0 (X3) and subtracting takes 1.0 off those lanes. X1 = n afterwards.
    CMPPS     X1, X4, $0x01
    MOVAPS    SSE_LCPI0_4<>+0(SB), X3
    ANDPS     X3, X4
    SUBPS     X4, X1
    // Range reduction: X0 = r = x + n*SSE_LCPI0_5 + n*SSE_LCPI0_6.
    MOVAPS    SSE_LCPI0_5<>+0(SB), X4
    MULPS     X1, X4
    MOVAPS    SSE_LCPI0_6<>+0(SB), X5
    MULPS     X1, X5
    ADDPS     X4, X0
    ADDPS     X5, X0
    // X4 starts as a copy of r and becomes r*r two instructions further down;
    // X5 starts the Horner chain with SSE_LCPI0_7*r + SSE_LCPI0_8.
    MOVAPS    X0, X4
    MOVAPS    SSE_LCPI0_7<>+0(SB), X5
    MULPS     X0, X5
    ADDPS     SSE_LCPI0_8<>+0(SB), X5
    MULPS     X0, X4
    // Remaining Horner steps: multiply by r, add the next coefficient.
    MULPS     X0, X5
    ADDPS     SSE_LCPI0_9<>+0(SB), X5
    MULPS     X0, X5
    ADDPS     SSE_LCPI0_10<>+0(SB), X5
    MULPS     X0, X5
    ADDPS     SSE_LCPI0_11<>+0(SB), X5
    MULPS     X0, X5
    // X0 = r + r*r*(X5 + 0.5) + 1.0.
    ADDPS     X2, X5
    MULPS     X4, X5
    ADDPS     X5, X0
    ADDPS     X3, X0
    // X1 = 2^n as float bits: int32(n) << 23, plus 0x3f800000 per lane.
    CVTTPS2PL X1, X1
    PSLLL     $0x17, X1
    PADDD     SSE_LCPI0_4<>+0(SB), X1
    // y[0:4] = X0 * 2^n.
    MULPS     X1, X0
    MOVUPS    X0, (CX)
    RET