#include "textflag.h"

// func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
//
// Reads the `length` bytes starting at `data` as consecutive big-endian
// 16-bit words and returns their plain sum (no carry folding is applied).
// A trailing odd byte is counted as the high byte of one more word whose
// low byte is zero. A zero `length` returns 0 without touching memory.
//
// args (8 bytes aligned):
//   data   unsafe.Pointer - 8 bytes - 0 offset
//   length uintptr        - 8 bytes - 8 offset
//   result uintptr        - 8 bytes - 16 offset
//
// Layout of the routine:
//   BATCH_32  - 32 bytes per iteration through four 8-byte NEON registers
//   BATCH_16  - at most one 16-byte step
//   BATCH_8   - at most one 8-byte step
//   BATCH_2   - scalar loop, one 16-bit word per iteration
//   BATCH_1   - optional trailing byte
//   COLLECT   - fold the vector lanes into the scalar total and return
//
// In the vector stages every 8-byte register is byte-swapped per halfword
// (big-endian -> lane value), widened to four 32-bit lanes and added to
// VSUM, so each 8 bytes of input contribute one 16-bit word to each lane.
//
// Overflow: a 32-bit lane can absorb 65537 words of value 0xFFFF
// (65537 * 0xFFFF == 2^32 - 1). BATCH_32 adds 4 words per lane per
// iteration and BATCH_16/BATCH_8 add at most 3 more in total, so the lanes
// are drained into the 64-bit RESULT every MAX_BATCHES iterations, with
// 4 * MAX_BATCHES + 3 <= 65537.

// Scalar registers: read cursor, bytes left, 64-bit running total,
// two temporaries, and the number of BATCH_32 iterations still allowed
// before the vector lanes must be drained.
#define PDATA      R0
#define LENGTH     R1
#define RESULT     R2
#define SCRATCH    R3
#define SCRATCH_HI R4
#define BUDGET     R5

// Vector registers: 4 x uint32 accumulator and the four load targets.
#define VSUM  V0
#define VIN_0 V1
#define VIN_1 V2
#define VIN_2 V3
#define VIN_3 V4

#define MAX_BATCHES 16383

// Adds the four 32-bit lanes of VSUM to RESULT. Clobbers SCRATCH.
// Does not clear VSUM.
#define DRAIN_VSUM \
    VMOV VSUM.S[0], SCRATCH; \
    ADD  SCRATCH, RESULT;    \
    VMOV VSUM.S[1], SCRATCH; \
    ADD  SCRATCH, RESULT;    \
    VMOV VSUM.S[2], SCRATCH; \
    ADD  SCRATCH, RESULT;    \
    VMOV VSUM.S[3], SCRATCH; \
    ADD  SCRATCH, RESULT

TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
    MOVD data+0(FP), PDATA
    MOVD length+8(FP), LENGTH
    MOVD $0, RESULT
    VEOR VSUM.B16, VSUM.B16, VSUM.B16
    MOVD $MAX_BATCHES, BUDGET

BATCH_32:
    CMP  $32, LENGTH
    BLO  BATCH_16
    CBNZ BUDGET, BATCH_32_LOAD
    // Budget exhausted and more data pending: move the lane totals into
    // the 64-bit scalar so the 32-bit lanes cannot wrap, then start over.
    DRAIN_VSUM
    VEOR VSUM.B16, VSUM.B16, VSUM.B16
    MOVD $MAX_BATCHES, BUDGET
BATCH_32_LOAD:
    VLD1 (PDATA), [VIN_0.B8, VIN_1.B8, VIN_2.B8, VIN_3.B8]
    // Swap the two bytes of every halfword: big-endian word -> lane value.
    VREV16 VIN_0.B8, VIN_0.B8
    VREV16 VIN_1.B8, VIN_1.B8
    VREV16 VIN_2.B8, VIN_2.B8
    VREV16 VIN_3.B8, VIN_3.B8
    // Zero-extend 4 x uint16 to 4 x uint32 (shift amount 0).
    VUSHLL $0, VIN_0.H4, VIN_0.S4
    VUSHLL $0, VIN_1.H4, VIN_1.S4
    VUSHLL $0, VIN_2.H4, VIN_2.S4
    VUSHLL $0, VIN_3.H4, VIN_3.S4
    VADD VIN_0.S4, VSUM.S4, VSUM.S4
    VADD VIN_1.S4, VSUM.S4, VSUM.S4
    VADD VIN_2.S4, VSUM.S4, VSUM.S4
    VADD VIN_3.S4, VSUM.S4, VSUM.S4
    SUB  $1, BUDGET
    SUB  $32, LENGTH
    ADD  $32, PDATA
    B    BATCH_32

BATCH_16:
    // LENGTH < 32 here, so this stage runs at most once.
    CMP  $16, LENGTH
    BLO  BATCH_8
    VLD1 (PDATA), [VIN_0.B8, VIN_1.B8]
    VREV16 VIN_0.B8, VIN_0.B8
    VREV16 VIN_1.B8, VIN_1.B8
    VUSHLL $0, VIN_0.H4, VIN_0.S4
    VUSHLL $0, VIN_1.H4, VIN_1.S4
    VADD VIN_0.S4, VSUM.S4, VSUM.S4
    VADD VIN_1.S4, VSUM.S4, VSUM.S4
    SUB  $16, LENGTH
    ADD  $16, PDATA

BATCH_8:
    // LENGTH < 16 here, so this stage runs at most once.
    CMP  $8, LENGTH
    BLO  BATCH_2
    VLD1 (PDATA), [VIN_0.B8]
    VREV16 VIN_0.B8, VIN_0.B8
    VUSHLL $0, VIN_0.H4, VIN_0.S4
    VADD VIN_0.S4, VSUM.S4, VSUM.S4
    SUB  $8, LENGTH
    ADD  $8, PDATA

BATCH_2:
    // LENGTH < 8 here: assemble each remaining word from two byte loads.
    CMP  $2, LENGTH
    BLO  BATCH_1
    MOVBU (PDATA), SCRATCH_HI
    MOVBU 1(PDATA), SCRATCH
    LSL  $8, SCRATCH_HI
    ORR  SCRATCH_HI, SCRATCH, SCRATCH
    ADD  SCRATCH, RESULT, RESULT
    ADD  $2, PDATA
    SUB  $2, LENGTH
    B    BATCH_2

BATCH_1:
    // LENGTH is 0 or 1 here. A lone byte is the high half of a final word.
    CBZ  LENGTH, COLLECT
    MOVBU (PDATA), SCRATCH
    LSL  $8, SCRATCH
    ADD  SCRATCH, RESULT, RESULT

COLLECT:
    DRAIN_VSUM
    MOVD RESULT, result+16(FP)
    RET

#undef DRAIN_VSUM
#undef MAX_BATCHES
#undef VIN_3
#undef VIN_2
#undef VIN_1
#undef VIN_0
#undef VSUM
#undef BUDGET
#undef SCRATCH_HI
#undef SCRATCH
#undef RESULT
#undef LENGTH
#undef PDATA
