mat/internal/f32/asm32/sum_amd64.s
// Copyright ©2021 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Register aliases used by Sum.
// X_PTR holds the address of the first element of x.
#define X_PTR SI
// IDX is the element index; operands are addressed as (X_PTR)(IDX*4).
#define IDX AX
// LEN starts as len(x), is decremented while aligning, and is then reused as
// the number of 16-element iterations of the main loop.
#define LEN CX
// TAIL first holds (address % 16) - 16 and is stepped by 4 towards zero while
// aligning; afterwards it holds the element count left after alignment, whose
// bits 8, 4, 2 and 1 select the tail steps.
#define TAIL BX
// SUM and SUM_1..SUM_3 are the four packed float32 accumulators; the returned
// value is taken from the low lane of SUM.
#define SUM X0
#define SUM_1 X1
#define SUM_2 X2
#define SUM_3 X3
// func Sum(x []float32) float32
//
// Sum returns the sum of the elements of x; an empty x yields 0.
//
// Elements are added one at a time until the read address is 16-byte
// aligned, then 16 per iteration into four independent 4-lane accumulators,
// and finally in steps of 8, 4, 2 and 1 selected by the low bits of the
// remaining count. Because partial sums are kept per lane, the rounding can
// differ from a strictly left-to-right scalar sum.
//
// The argument size is declared explicitly ($0-28): the slice header occupies
// bytes 0-23 and the 4-byte result is stored at ret+24(FP).
//
// HADDPS is an SSE3 instruction, so this routine needs SSE3 in addition to
// the SSE/SSE2 instructions it uses.
TEXT ·Sum(SB), NOSPLIT, $0-28
	MOVQ  x_base+0(FP), X_PTR // X_PTR = &x[0]
	MOVQ  x_len+8(FP), LEN    // LEN = len(x)
	XORQ  IDX, IDX            // i = 0
	PXOR  SUM, SUM            // SUM = 0 (also the value returned for empty x)
	TESTQ LEN, LEN            // if LEN == 0 { return 0 }
	JZ    sum_end

	PXOR SUM_1, SUM_1
	PXOR SUM_2, SUM_2
	PXOR SUM_3, SUM_3

	// The packed ADDPS memory operands below need 16-byte aligned addresses,
	// so leading elements are consumed one at a time until X_PTR+4*IDX is
	// aligned.
	MOVQ X_PTR, TAIL
	ANDQ $15, TAIL   // TAIL = &x[0] % 16
	JZ   no_trim     // if TAIL == 0 { goto no_trim }
	SUBQ $16, TAIL   // TAIL -= 16: negated byte distance to the next boundary

sum_align: // Align on 16-byte boundary do {
	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
	INCQ  IDX                 // i++
	DECQ  LEN                 // LEN--
	JZ    sum_end             // if LEN == 0 { return }
	ADDQ  $4, TAIL            // TAIL += 4
	JNZ   sum_align           // } while TAIL != 0
	// Note: TAIL reaches 0 only when &x[0] is a multiple of 4; otherwise the
	// loop above ends through LEN == 0 and the whole slice is summed here.

no_trim:
	MOVQ LEN, TAIL // TAIL = elements left after alignment
	SHRQ $4, LEN   // LEN = floor( TAIL / 16 )
	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }

sum_loop: // sum 16x wide do {
	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDPS 32(X_PTR)(IDX*4), SUM_2 // SUM_2 += x[i+8:i+12]
	ADDPS 48(X_PTR)(IDX*4), SUM_3 // SUM_3 += x[i+12:i+16]

	ADDQ $16, IDX // i += 16
	DECQ LEN
	JNZ  sum_loop // } while --LEN > 0

sum_tail8:
	// Fold four accumulators into two.
	ADDPS SUM_3, SUM
	ADDPS SUM_2, SUM_1

	TESTQ $8, TAIL // if TAIL & 8 == 0 { goto sum_tail4 }
	JZ    sum_tail4

	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDQ  $8, IDX                 // i += 8

sum_tail4:
	ADDPS SUM_1, SUM // fold two accumulators into one

	TESTQ $4, TAIL // if TAIL & 4 == 0 { goto sum_tail2 }
	JZ    sum_tail2

	ADDPS (X_PTR)(IDX*4), SUM // SUM += x[i:i+4]
	ADDQ  $4, IDX             // i += 4

sum_tail2:
	HADDPS SUM, SUM // SUM = { s0+s1, s2+s3, s0+s1, s2+s3 }

	TESTQ $2, TAIL // if TAIL & 2 == 0 { goto sum_tail1 }
	JZ    sum_tail1

	MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1: low lanes = x[i:i+2], high lanes = 0
	ADDPS SUM_1, SUM            // SUM[0:2] += x[i:i+2]
	ADDQ  $2, IDX               // i += 2

sum_tail1:
	HADDPS SUM, SUM // SUM[0] = SUM[0] + SUM[1]

	TESTQ $1, TAIL // if TAIL & 1 == 0 { return }
	JZ    sum_end

	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]

sum_end: // return SUM[0]
	MOVSS SUM, ret+24(FP)
	RET