mat/internal/f64/asm64/gemvN_amd64.s
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"
#define SIZE 8
#define M_DIM m+0(FP)
#define M CX
#define N_DIM n+8(FP)
#define N BX
#define TMP1 R14
#define TMP2 R15
#define X_PTR SI
#define X x_base+56(FP)
#define INC_X R8
#define INC3_X R9
#define Y_PTR DX
#define Y y_base+96(FP)
#define INC_Y R10
#define INC3_Y R11
#define A_ROW AX
#define A_PTR DI
#define LDA R12
#define LDA3 R13
#define ALPHA X15
#define BETA X14
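
// XMM register plan: ALPHA (X15) and BETA (X14) hold the broadcast scalars,
// X12/X13 carry packed elements of x, X0-X3 accumulate per-row partial sums,
// and X4-X11 are scratch for rows of A.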
#define INIT4 \
XORPS X0, X0 \
XORPS X1, X1 \
XORPS X2, X2 \
XORPS X3, X3
#define INIT2 \
XORPS X0, X0 \
XORPS X1, X1
#define INIT1 \
XORPS X0, X0
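
// KERNEL_LOAD4/KERNEL_LOAD2 load the next 4 (or 2) contiguous elements of x
// into X12/X13 on the unit-stride fast path.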
#define KERNEL_LOAD4 \
MOVUPS (X_PTR), X12 \
MOVUPS 2*SIZE(X_PTR), X13
#define KERNEL_LOAD2 \
MOVUPS (X_PTR), X12
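
// The _INC variants gather strided elements of x instead, filling the low
// lane of each register with MOVSD and the high lane with MOVHPD.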
#define KERNEL_LOAD4_INC \
MOVSD (X_PTR), X12 \
MOVHPD (X_PTR)(INC_X*1), X12 \
MOVSD (X_PTR)(INC_X*2), X13 \
MOVHPD (X_PTR)(INC3_X*1), X13
#define KERNEL_LOAD2_INC \
MOVSD (X_PTR), X12 \
MOVHPD (X_PTR)(INC_X*1), X12
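
// KERNEL_4x4 multiplies 4 columns of 4 consecutive rows of A (rows at A_PTR,
// +LDA, +2*LDA, +3*LDA) by the x values in X12/X13, accumulating the partial
// sums for row i into Xi; the two lanes of each accumulator are reduced later
// with HADDPD. KERNEL_4x2 and KERNEL_4x1 handle the 2- and 1-column tails of
// the same row block.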
#define KERNEL_4x4 \
MOVUPS (A_PTR), X4 \
MOVUPS 2*SIZE(A_PTR), X5 \
MOVUPS (A_PTR)(LDA*1), X6 \
MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \
MOVUPS (A_PTR)(LDA*2), X8 \
MOVUPS 2*SIZE(A_PTR)(LDA*2), X9 \
MOVUPS (A_PTR)(LDA3*1), X10 \
MOVUPS 2*SIZE(A_PTR)(LDA3*1), X11 \
MULPD X12, X4 \
MULPD X13, X5 \
MULPD X12, X6 \
MULPD X13, X7 \
MULPD X12, X8 \
MULPD X13, X9 \
MULPD X12, X10 \
MULPD X13, X11 \
ADDPD X4, X0 \
ADDPD X5, X0 \
ADDPD X6, X1 \
ADDPD X7, X1 \
ADDPD X8, X2 \
ADDPD X9, X2 \
ADDPD X10, X3 \
ADDPD X11, X3 \
ADDQ $4*SIZE, A_PTR
#define KERNEL_4x2 \
MOVUPS (A_PTR), X4 \
MOVUPS (A_PTR)(LDA*1), X5 \
MOVUPS (A_PTR)(LDA*2), X6 \
MOVUPS (A_PTR)(LDA3*1), X7 \
MULPD X12, X4 \
MULPD X12, X5 \
MULPD X12, X6 \
MULPD X12, X7 \
ADDPD X4, X0 \
ADDPD X5, X1 \
ADDPD X6, X2 \
ADDPD X7, X3 \
ADDQ $2*SIZE, A_PTR
#define KERNEL_4x1 \
MOVDDUP (X_PTR), X12 \
MOVSD (A_PTR), X4 \
MOVHPD (A_PTR)(LDA*1), X4 \
MOVSD (A_PTR)(LDA*2), X5 \
MOVHPD (A_PTR)(LDA3*1), X5 \
MULPD X12, X4 \
MULPD X12, X5 \
ADDPD X4, X0 \
ADDPD X5, X2 \
ADDQ $SIZE, A_PTR
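
// STORE4 applies y = alpha*sum + beta*y to 4 contiguous elements of y;
// STORE4_INC does the same for strided y, splitting each vector into
// low/high lane accesses.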
#define STORE4 \
MOVUPS (Y_PTR), X4 \
MOVUPS 2*SIZE(Y_PTR), X5 \
MULPD ALPHA, X0 \
MULPD ALPHA, X2 \
MULPD BETA, X4 \
MULPD BETA, X5 \
ADDPD X0, X4 \
ADDPD X2, X5 \
MOVUPS X4, (Y_PTR) \
MOVUPS X5, 2*SIZE(Y_PTR)
#define STORE4_INC \
MOVSD (Y_PTR), X4 \
MOVHPD (Y_PTR)(INC_Y*1), X4 \
MOVSD (Y_PTR)(INC_Y*2), X5 \
MOVHPD (Y_PTR)(INC3_Y*1), X5 \
MULPD ALPHA, X0 \
MULPD ALPHA, X2 \
MULPD BETA, X4 \
MULPD BETA, X5 \
ADDPD X0, X4 \
ADDPD X2, X5 \
MOVLPD X4, (Y_PTR) \
MOVHPD X4, (Y_PTR)(INC_Y*1) \
MOVLPD X5, (Y_PTR)(INC_Y*2) \
MOVHPD X5, (Y_PTR)(INC3_Y*1)
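
// The 2xC kernels are the two-row analogues: 2x4 and 2x2 accumulate the rows
// into X0 and X1, while 2x1 adds into the HADDPD-reduced X0.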
#define KERNEL_2x4 \
MOVUPS (A_PTR), X8 \
MOVUPS 2*SIZE(A_PTR), X9 \
MOVUPS (A_PTR)(LDA*1), X10 \
MOVUPS 2*SIZE(A_PTR)(LDA*1), X11 \
MULPD X12, X8 \
MULPD X13, X9 \
MULPD X12, X10 \
MULPD X13, X11 \
ADDPD X8, X0 \
ADDPD X10, X1 \
ADDPD X9, X0 \
ADDPD X11, X1 \
ADDQ $4*SIZE, A_PTR
#define KERNEL_2x2 \
MOVUPS (A_PTR), X8 \
MOVUPS (A_PTR)(LDA*1), X9 \
MULPD X12, X8 \
MULPD X12, X9 \
ADDPD X8, X0 \
ADDPD X9, X1 \
ADDQ $2*SIZE, A_PTR
#define KERNEL_2x1 \
MOVDDUP (X_PTR), X12 \
MOVSD (A_PTR), X8 \
MOVHPD (A_PTR)(LDA*1), X8 \
MULPD X12, X8 \
ADDPD X8, X0 \
ADDQ $SIZE, A_PTR
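
// STORE2/STORE2_INC write the alpha/beta update back to two elements of y.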
#define STORE2 \
MOVUPS (Y_PTR), X4 \
MULPD ALPHA, X0 \
MULPD BETA, X4 \
ADDPD X0, X4 \
MOVUPS X4, (Y_PTR)
#define STORE2_INC \
MOVSD (Y_PTR), X4 \
MOVHPD (Y_PTR)(INC_Y*1), X4 \
MULPD ALPHA, X0 \
MULPD BETA, X4 \
ADDPD X0, X4 \
MOVSD X4, (Y_PTR) \
MOVHPD X4, (Y_PTR)(INC_Y*1)
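
// The 1xC kernels handle a single remaining row, accumulating into X0 only.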
#define KERNEL_1x4 \
MOVUPS (A_PTR), X8 \
MOVUPS 2*SIZE(A_PTR), X9 \
MULPD X12, X8 \
MULPD X13, X9 \
ADDPD X8, X0 \
ADDPD X9, X0 \
ADDQ $4*SIZE, A_PTR
#define KERNEL_1x2 \
MOVUPS (A_PTR), X8 \
MULPD X12, X8 \
ADDPD X8, X0 \
ADDQ $2*SIZE, A_PTR
#define KERNEL_1x1 \
MOVSD (X_PTR), X12 \
MOVSD (A_PTR), X8 \
MULSD X12, X8 \
ADDSD X8, X0 \
ADDQ $SIZE, A_PTR
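
// STORE1 reduces the two lanes of X0 with HADDPD, then applies the scalar
// alpha/beta update to one element of y.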
#define STORE1 \
HADDPD X0, X0 \
MOVSD (Y_PTR), X4 \
MULSD ALPHA, X0 \
MULSD BETA, X4 \
ADDSD X0, X4 \
MOVSD X4, (Y_PTR)
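
// As a reference for the kernels above, a rough Go sketch of the same
// computation (unoptimized, and assuming positive strides for clarity; the
// assembly below also handles negative incX/incY and special-cases
// beta == 0 by clearing y):
//
//	for i := 0; i < m; i++ {
//		var sum float64
//		for j := 0; j < n; j++ {
//			sum += a[i*lda+j] * x[j*incX]
//		}
//		y[i*incY] = alpha*sum + beta*y[i*incY]
//	}
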
// func GemvN(m, n int,
// alpha float64,
// a []float64, lda int,
// x []float64, incX int,
// beta float64,
// y []float64, incY int)
TEXT ·GemvN(SB), NOSPLIT, $32-128
MOVQ M_DIM, M
MOVQ N_DIM, N
CMPQ M, $0
JE end
CMPQ N, $0
JE end
MOVDDUP alpha+16(FP), ALPHA
MOVDDUP beta+88(FP), BETA
MOVQ x_base+56(FP), X_PTR
MOVQ y_base+96(FP), Y_PTR
MOVQ a_base+24(FP), A_ROW
MOVQ incY+120(FP), INC_Y
MOVQ lda+48(FP), LDA // LDA = LDA * sizeof(float64)
SHLQ $3, LDA
LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3
MOVQ A_ROW, A_PTR
XORQ TMP2, TMP2 // TMP2 = 0
MOVQ M, TMP1 // TMP1 = M
SUBQ $1, TMP1 // TMP1 -= 1
IMULQ INC_Y, TMP1 // TMP1 *= incY
NEGQ TMP1 // TMP1 = -TMP1
CMPQ INC_Y, $0 // if incY < 0 { TMP2 = TMP1 }
CMOVQLT TMP1, TMP2
LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR // Y_PTR = &y[TMP2]
MOVQ Y_PTR, Y // Save adjusted Y_PTR for the compute pass
SHLQ $3, INC_Y // INC_Y = incY * sizeof(float64)
LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
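
// In Go terms, the base adjustment above is roughly:
//
//	off := 0
//	if incY < 0 {
//		off = -(m - 1) * incY // start at the far end so negative steps stay in bounds
//	}
//	// Y_PTR = &y[off]
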
MOVSD $0.0, X0
COMISD BETA, X0
JNE gemv_start // if beta != 0 { goto gemv_start }
gemv_clear: // beta == 0 is special-cased to clear y directly (no NaN handling)
XORPS X0, X0
XORPS X1, X1
XORPS X2, X2
XORPS X3, X3
CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path)
JNE inc_clear
SHRQ $3, M // M /= 8; clear 8 elements per iteration
JZ clear4
clear8:
MOVUPS X0, (Y_PTR)
MOVUPS X1, 16(Y_PTR)
MOVUPS X2, 32(Y_PTR)
MOVUPS X3, 48(Y_PTR)
ADDQ $8*SIZE, Y_PTR
DECQ M
JNZ clear8
clear4:
TESTQ $4, M_DIM
JZ clear2
MOVUPS X0, (Y_PTR)
MOVUPS X1, 16(Y_PTR)
ADDQ $4*SIZE, Y_PTR
clear2:
TESTQ $2, M_DIM
JZ clear1
MOVUPS X0, (Y_PTR)
ADDQ $2*SIZE, Y_PTR
clear1:
TESTQ $1, M_DIM
JZ prep_end
MOVSD X0, (Y_PTR)
JMP prep_end
inc_clear:
SHRQ $2, M
JZ inc_clear2
inc_clear4:
MOVSD X0, (Y_PTR)
MOVSD X1, (Y_PTR)(INC_Y*1)
MOVSD X2, (Y_PTR)(INC_Y*2)
MOVSD X3, (Y_PTR)(INC3_Y*1)
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ M
JNZ inc_clear4
inc_clear2:
TESTQ $2, M_DIM
JZ inc_clear1
MOVSD X0, (Y_PTR)
MOVSD X1, (Y_PTR)(INC_Y*1)
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_clear1:
TESTQ $1, M_DIM
JZ prep_end
MOVSD X0, (Y_PTR)
prep_end:
MOVQ Y, Y_PTR
MOVQ M_DIM, M
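
// Main loop: rows of A are processed in blocks of 4, then 2, then 1; within
// each row block, columns (and x) are consumed 4, 2 and 1 at a time, with
// partial sums kept in registers until the block's STORE* macro.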
gemv_start:
CMPQ incX+80(FP), $1 // Check for dense vector X (fast-path)
JNE inc
SHRQ $2, M
JZ r2
r4:
// LOAD 4
INIT4
MOVQ N_DIM, N
SHRQ $2, N
JZ r4c2
r4c4:
// 4x4 KERNEL
KERNEL_LOAD4
KERNEL_4x4
ADDQ $4*SIZE, X_PTR
DECQ N
JNZ r4c4
r4c2:
TESTQ $2, N_DIM
JZ r4c1
// 4x2 KERNEL
KERNEL_LOAD2
KERNEL_4x2
ADDQ $2*SIZE, X_PTR
r4c1:
HADDPD X1, X0
HADDPD X3, X2
TESTQ $1, N_DIM
JZ r4end
// 4x1 KERNEL
KERNEL_4x1
ADDQ $SIZE, X_PTR
r4end:
CMPQ INC_Y, $SIZE
JNE r4st_inc
STORE4
ADDQ $4*SIZE, Y_PTR
JMP r4inc
r4st_inc:
STORE4_INC
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
r4inc:
MOVQ X, X_PTR
LEAQ (A_ROW)(LDA*4), A_ROW
MOVQ A_ROW, A_PTR
DECQ M
JNZ r4
r2:
TESTQ $2, M_DIM
JZ r1
// LOAD 2
INIT2
MOVQ N_DIM, N
SHRQ $2, N
JZ r2c2
r2c4:
// 2x4 KERNEL
KERNEL_LOAD4
KERNEL_2x4
ADDQ $4*SIZE, X_PTR
DECQ N
JNZ r2c4
r2c2:
TESTQ $2, N_DIM
JZ r2c1
// 2x2 KERNEL
KERNEL_LOAD2
KERNEL_2x2
ADDQ $2*SIZE, X_PTR
r2c1:
HADDPD X1, X0
TESTQ $1, N_DIM
JZ r2end
// 2x1 KERNEL
KERNEL_2x1
ADDQ $SIZE, X_PTR
r2end:
CMPQ INC_Y, $SIZE
JNE r2st_inc
STORE2
ADDQ $2*SIZE, Y_PTR
JMP r2inc
r2st_inc:
STORE2_INC
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
r2inc:
MOVQ X, X_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
r1:
TESTQ $1, M_DIM
JZ end
// LOAD 1
INIT1
MOVQ N_DIM, N
SHRQ $2, N
JZ r1c2
r1c4:
// 1x4 KERNEL
KERNEL_LOAD4
KERNEL_1x4
ADDQ $4*SIZE, X_PTR
DECQ N
JNZ r1c4
r1c2:
TESTQ $2, N_DIM
JZ r1c1
// 1x2 KERNEL
KERNEL_LOAD2
KERNEL_1x2
ADDQ $2*SIZE, X_PTR
r1c1:
TESTQ $1, N_DIM
JZ r1end
// 1x1 KERNEL
KERNEL_1x1
r1end:
STORE1
end:
RET
inc: // Algorithm for incX != 1 (split loads in kernel)
MOVQ incX+80(FP), INC_X // INC_X = incX
XORQ TMP2, TMP2 // TMP2 = 0
MOVQ N, TMP1 // TMP1 = N
SUBQ $1, TMP1 // TMP1 -= 1
NEGQ TMP1 // TMP1 = -TMP1
IMULQ INC_X, TMP1 // TMP1 *= INC_X
CMPQ INC_X, $0 // if INC_X < 0 { TMP2 = TMP1 }
CMOVQLT TMP1, TMP2
LEAQ (X_PTR)(TMP2*SIZE), X_PTR // X_PTR = X_PTR[TMP2]
MOVQ X_PTR, X // X = X_PTR
SHLQ $3, INC_X
LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
SHRQ $2, M
JZ inc_r2
inc_r4:
// LOAD 4
INIT4
MOVQ N_DIM, N
SHRQ $2, N
JZ inc_r4c2
inc_r4c4:
// 4x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_4x4
LEAQ (X_PTR)(INC_X*4), X_PTR
DECQ N
JNZ inc_r4c4
inc_r4c2:
TESTQ $2, N_DIM
JZ inc_r4c1
// 4x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_4x2
LEAQ (X_PTR)(INC_X*2), X_PTR
inc_r4c1:
HADDPD X1, X0
HADDPD X3, X2
TESTQ $1, N_DIM
JZ inc_r4end
// 4x1 KERNEL
KERNEL_4x1
ADDQ INC_X, X_PTR
inc_r4end:
CMPQ INC_Y, $SIZE
JNE inc_r4st_inc
STORE4
ADDQ $4*SIZE, Y_PTR
JMP inc_r4inc
inc_r4st_inc:
STORE4_INC
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r4inc:
MOVQ X, X_PTR
LEAQ (A_ROW)(LDA*4), A_ROW
MOVQ A_ROW, A_PTR
DECQ M
JNZ inc_r4
inc_r2:
TESTQ $2, M_DIM
JZ inc_r1
// LOAD 2
INIT2
MOVQ N_DIM, N
SHRQ $2, N
JZ inc_r2c2
inc_r2c4:
// 2x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_2x4
LEAQ (X_PTR)(INC_X*4), X_PTR
DECQ N
JNZ inc_r2c4
inc_r2c2:
TESTQ $2, N_DIM
JZ inc_r2c1
// 2x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_2x2
LEAQ (X_PTR)(INC_X*2), X_PTR
inc_r2c1:
HADDPD X1, X0
TESTQ $1, N_DIM
JZ inc_r2end
// 2x1 KERNEL
KERNEL_2x1
ADDQ INC_X, X_PTR
inc_r2end:
CMPQ INC_Y, $SIZE
JNE inc_r2st_inc
STORE2
ADDQ $2*SIZE, Y_PTR
JMP inc_r2inc
inc_r2st_inc:
STORE2_INC
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r2inc:
MOVQ X, X_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
inc_r1:
TESTQ $1, M_DIM
JZ inc_end
// LOAD 1
INIT1
MOVQ N_DIM, N
SHRQ $2, N
JZ inc_r1c2
inc_r1c4:
// 1x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_1x4
LEAQ (X_PTR)(INC_X*4), X_PTR
DECQ N
JNZ inc_r1c4
inc_r1c2:
TESTQ $2, N_DIM
JZ inc_r1c1
// 1x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_1x2
LEAQ (X_PTR)(INC_X*2), X_PTR
inc_r1c1:
TESTQ $1, N_DIM
JZ inc_r1end
// 1x1 KERNEL
KERNEL_1x1
inc_r1end:
STORE1
inc_end:
RET