mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-01-01 12:06:30 +00:00
9d0df426da
* feat: vendor minio client * feat: introduce storage package with s3 support * feat: serve s3 files directly; this saves a lot of bandwidth as the files are fetched from the object store directly * fix: use explicit local storage in tests * feat: integrate s3 storage with the main server * fix: add s3 config to cli tests * docs: explicitly set values in example config; also adds license header to the storage package * fix: use better http status code on s3 redirect. HTTP 302 Found is the best fit, as it signifies that the resource requested was found but not under its presumed URL. 307/TemporaryRedirect would mean that this resource is usually located here, which is not the case. 303/SeeOther indicates that the redirection does not link to the requested resource but to another page * refactor: use context in storage driver interface
15679 lines
393 KiB
ArmAsm
15679 lines
393 KiB
ArmAsm
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
|
|
|
|
// +build !appengine
|
|
// +build !noasm
|
|
// +build gc
|
|
|
|
#include "textflag.h"
|
|
|
|
// func encodeBlockAsm(dst []byte, src []byte) int
//
// Compresses src into dst and returns the number of bytes written to dst
// (computed at the end as the final write pointer minus dst_base).
// Returns 0 when the output would reach the destination limit stored at (SP).
//
// Stack frame layout as used by this routine:
//   0(SP)   8 bytes  destination limit pointer
//   8(SP)   4 bytes  source search limit (src_len - 8)
//   12(SP)  4 bytes  source index of the first byte not yet emitted
//   16(SP)  4 bytes  current repeat offset
//   20(SP)  4 bytes  next search position
//   24(SP)  64 KiB   hash table of 32-bit source positions
//
// Register roles on leaving this prologue:
//   AX = destination write pointer, CX = current source index (1),
//   DX = source base pointer.
|
|
// Requires: SSE2
|
|
TEXT ·encodeBlockAsm(SB), $65560-56
|
|
// AX = dst write pointer, starting at dst_base.
MOVQ dst_base+0(FP), AX
|
|
// 0x200 iterations of 128 bytes each clear the 65536-byte table at 24(SP).
MOVQ $0x00000200, CX
|
|
LEAQ 24(SP), DX
|
|
// X0 = 0, the value stored by the zeroing loop.
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm:
|
|
// Eight unaligned 16-byte stores clear 128 bytes per iteration.
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBlockAsm
|
|
// No literals are pending yet: first unemitted source index = 0.
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
// DX = src_len - 9, SI = src_len - 8.
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
// 8(SP) = src_len - 8: the search loop stops once a position reaches this.
MOVL SI, 8(SP)
|
|
// Destination limit = dst_base + (src_len - 9 - src_len/32), stored at (SP).
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
// Start searching at source index 1 and use 1 as the initial repeat offset.
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
// DX = source base pointer for the rest of the routine.
MOVQ src_base+24(FP), DX
|
|
|
|
// Main search loop. On entry CX = current source index s, DX = source base,
// AX = destination write pointer.
search_loop_encodeBlockAsm:
|
|
// SI = s + 4 + ((s - pending-literal start) >> 6): the next position to try.
// The step grows with the distance since the last emitted byte.
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
// Stop searching once the next position reaches the limit in 8(SP).
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm
|
|
// DI = 8 source bytes at s; remember the next position in 20(SP).
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
// R9 = hash multiplier. Each hash is ((x << 16) * R9) >> 50: the left shift
// drops the top two bytes, so the low 6 bytes are hashed to a 14-bit index.
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
// R10 = hash of the bytes at s.
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
// R11 = hash of the bytes at s+1.
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x32, R11
|
|
// Load the previous table entries: SI = candidate for s, R8 = candidate for s+1.
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
// Record s and s+1 in the table.
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
// R10 = hash of the bytes at s+2 (consumed on the no-repeat path).
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
// Repeat probe: compare the 4 bytes at s+1 with the 4 bytes at
// s+1 - repeat offset (offset read from 16(SP)).
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm
|
|
// Repeat found. DI = s+1 (match start), R8 = pending-literal start,
// SI = s+1 - repeat offset (position being repeated).
LEAL 1(CX), DI
|
|
MOVL 12(SP), R8
|
|
MOVL DI, SI
|
|
SUBL 16(SP), SI
|
|
// Skip backward extension when the repeated position is index 0.
JZ repeat_extend_back_end_encodeBlockAsm
|
|
|
|
// Extend the repeat match backwards one byte at a time.
// DI = match start in src, SI = position being repeated, R8 = pending-literal start.
repeat_extend_back_loop_encodeBlockAsm:
|
|
// Do not extend back past the first unemitted byte.
CMPL DI, R8
|
|
JLE repeat_extend_back_end_encodeBlockAsm
|
|
// Compare the bytes just before both positions; stop on the first mismatch.
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeBlockAsm
|
|
// Step both positions back by one. The loop ends by fall-through when SI reaches 0.
LEAL -1(DI), DI
|
|
DECL SI
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm
|
|
|
|
repeat_extend_back_end_encodeBlockAsm:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeBlockAsm
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeBlockAsm
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_repeat_emit_encodeBlockAsm
|
|
CMPL SI, $0x01000000
|
|
JLT four_bytes_repeat_emit_encodeBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm
|
|
|
|
four_bytes_repeat_emit_encodeBlockAsm:
|
|
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm
|
|
|
|
three_bytes_repeat_emit_encodeBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm
|
|
|
|
two_bytes_repeat_emit_encodeBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeBlockAsm
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm
|
|
|
|
one_byte_repeat_emit_encodeBlockAsm:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBlockAsm:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeBlockAsm
|
|
|
|
memmove_long_repeat_emit_encodeBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R11, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R13*1), R11
|
|
LEAQ -32(AX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R13*1), X4
|
|
MOVOU -16(R10)(R13*1), X5
|
|
MOVOA X4, -32(AX)(R13*1)
|
|
MOVOA X5, -16(AX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R9, R13
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
// Pending literals before the repeat have been emitted (or there were none).
// Now extend the repeat match forwards.
emit_literal_done_repeat_emit_encodeBlockAsm:
|
|
// Advance s by 5: past the byte at s plus the 4 bytes at s+1 already compared equal.
ADDL $0x05, CX
|
|
// SI = s - repeat offset, R9 = bytes remaining in src after s (32-bit arithmetic).
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R9
|
|
SUBL CX, R9
|
|
// R10 = &src[s], SI = &src[s - repeat offset].
LEAQ (DX)(CX*1), R10
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
// Counts into R12 how many leading bytes at R10 and SI are equal, bounded by R9.
|
|
XORL R12, R12
|
|
// Fewer than 8 bytes left: go straight to the byte-wise loop.
CMPL R9, $0x08
|
|
JL matchlen_single_repeat_extend_encodeBlockAsm
|
|
|
|
matchlen_loopback_repeat_extend_encodeBlockAsm:
|
|
// XOR 8 bytes from each side; a zero result means all 8 bytes match.
MOVQ (R10)(R12*1), R11
|
|
XORQ (SI)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_repeat_extend_encodeBlockAsm
|
|
// Lowest set bit index / 8 = number of matching bytes within this 8-byte word.
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP repeat_extend_forward_end_encodeBlockAsm
|
|
|
|
matchlen_loop_repeat_extend_encodeBlockAsm:
|
|
// All 8 bytes matched: remaining -= 8, length += 8, loop while >= 8 bytes remain.
LEAL -8(R9), R9
|
|
LEAL 8(R12), R12
|
|
CMPL R9, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeBlockAsm
|
|
|
|
matchlen_single_repeat_extend_encodeBlockAsm:
|
|
// Nothing left to compare.
TESTL R9, R9
|
|
JZ repeat_extend_forward_end_encodeBlockAsm
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeBlockAsm:
|
|
// Byte-wise tail comparison for the last < 8 bytes.
MOVB (R10)(R12*1), R11
|
|
CMPB (SI)(R12*1), R11
|
|
JNE repeat_extend_forward_end_encodeBlockAsm
|
|
LEAL 1(R12), R12
|
|
DECL R9
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm:
|
|
// s += matched length; SI = s - DI, where DI still holds the (back-extended)
// match start, so SI is the total match length.
ADDL R12, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
// DI = repeat offset.
MOVL 16(SP), DI
|
|
// R8 holds the pending-literal start loaded before literal emission; when it
// is 0 the match is emitted through the copy path instead of the repeat path.
TESTL R8, R8
|
|
JZ repeat_as_copy_encodeBlockAsm
|
|
|
|
// emitRepeat
// Writes a repeat operation at AX for a match of length SI with offset DI,
// choosing among 2-, 3-, 4- and 5-byte encodings by length, and advances AX
// by the number of bytes written. All exits go to repeat_end_emit_encodeBlockAsm.
|
|
emit_repeat_again_match_repeat_encodeBlockAsm:
|
|
// R8 = original length, SI = length - 4.
MOVL SI, R8
|
|
LEAL -4(SI), SI
|
|
// length <= 8: two-byte form.
CMPL R8, $0x08
|
|
JLE repeat_two_match_repeat_encodeBlockAsm
|
|
// length < 12 and offset < 2048: two-byte form that also carries the offset.
CMPL R8, $0x0c
|
|
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
|
|
CMPL DI, $0x00000800
|
|
JLT repeat_two_offset_match_repeat_encodeBlockAsm
|
|
|
|
cant_repeat_two_offset_match_repeat_encodeBlockAsm:
|
|
// Select the encoding by (length - 4): < 0x104 three bytes, < 0x10100 four
// bytes, < 0x0100ffff five bytes.
CMPL SI, $0x00000104
|
|
JLT repeat_three_match_repeat_encodeBlockAsm
|
|
CMPL SI, $0x00010100
|
|
JLT repeat_four_match_repeat_encodeBlockAsm
|
|
CMPL SI, $0x0100ffff
|
|
JLT repeat_five_match_repeat_encodeBlockAsm
|
|
// Too long for one operation: subtract 16842747 (0x0100fffb) from SI, write
// the fixed five bytes 1d 00 fb ff ff, and loop to encode what is left.
LEAL -16842747(SI), SI
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_repeat_encodeBlockAsm
|
|
|
|
repeat_five_match_repeat_encodeBlockAsm:
|
|
// Five-byte form: 16-bit word 0x001d, then the 24-bit value (SI - 65536),
// low 16 bits at offset 2 and bits 16..23 at offset 4. DI is reused as scratch.
LEAL -65536(SI), SI
|
|
MOVL SI, DI
|
|
MOVW $0x001d, (AX)
|
|
MOVW SI, 2(AX)
|
|
SARL $0x10, DI
|
|
MOVB DI, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_four_match_repeat_encodeBlockAsm:
|
|
// Four-byte form: 16-bit word 0x0019, then the 16-bit value (SI - 256).
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm:
|
|
// Three-byte form: 16-bit word 0x0015, then the 8-bit value (SI - 4).
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm:
|
|
// Two-byte form: the 16-bit word ((length - 4) << 2) | 1.
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_offset_match_repeat_encodeBlockAsm:
|
|
// Two-byte form with offset: byte 0 = 1 + (length - 4)*4 + ((offset >> 8) << 5),
// byte 1 = low 8 bits of the offset. R8 is zeroed only to serve as the LEAL base.
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_as_copy_encodeBlockAsm:
|
|
// emitCopy
|
|
CMPL DI, $0x00010000
|
|
JL two_byte_offset_repeat_as_copy_encodeBlockAsm
|
|
|
|
four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
|
|
CMPL SI, $0x40
|
|
JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL DI, 1(AX)
|
|
LEAL -64(SI), SI
|
|
ADDQ $0x05, AX
|
|
CMPL SI, $0x04
|
|
JL four_bytes_remain_repeat_as_copy_encodeBlockAsm
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
MOVL SI, R8
|
|
LEAL -4(SI), SI
|
|
CMPL R8, $0x08
|
|
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
CMPL R8, $0x0c
|
|
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
CMPL DI, $0x00000800
|
|
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
|
|
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
CMPL SI, $0x00010100
|
|
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
CMPL SI, $0x0100ffff
|
|
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
LEAL -16842747(SI), SI
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
|
|
|
|
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
LEAL -65536(SI), SI
|
|
MOVL SI, DI
|
|
MOVW $0x001d, (AX)
|
|
MOVW SI, 2(AX)
|
|
SARL $0x10, DI
|
|
MOVB DI, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
|
|
|
|
four_bytes_remain_repeat_as_copy_encodeBlockAsm:
|
|
TESTL SI, SI
|
|
JZ repeat_end_emit_encodeBlockAsm
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
two_byte_offset_repeat_as_copy_encodeBlockAsm:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
MOVL SI, R8
|
|
LEAL -4(SI), SI
|
|
CMPL R8, $0x08
|
|
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
CMPL R8, $0x0c
|
|
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
CMPL DI, $0x00000800
|
|
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
|
|
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
CMPL SI, $0x00010100
|
|
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
CMPL SI, $0x0100ffff
|
|
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
LEAL -16842747(SI), SI
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
|
|
|
|
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
LEAL -65536(SI), SI
|
|
MOVL SI, DI
|
|
MOVW $0x001d, (AX)
|
|
MOVW SI, 2(AX)
|
|
SARL $0x10, DI
|
|
MOVB DI, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm
|
|
|
|
emit_copy_three_repeat_as_copy_encodeBlockAsm:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeBlockAsm:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
// No repeat at s+1. Check the hash-table candidates for s, s+1 and s+2.
// On entry: CX = s, DI = 8 source bytes at s, SI = candidate for s,
// R8 = candidate for s+1, R10 = hash of the bytes at s+2.
no_repeat_found_encodeBlockAsm:
|
|
// Candidate for s: compare its 4 bytes with the 4 bytes at s.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm
|
|
// Shift DI so its low 4 bytes are the bytes at s+1; fetch the s+2 candidate into SI.
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
// Candidate for s+1.
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm
|
|
// Record s+2 in the table, then test the s+2 candidate.
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm
|
|
// No candidate matched: continue from the next position saved in 20(SP).
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
candidate3_match_encodeBlockAsm:
|
|
// Match starts at s+2; SI already holds its candidate.
ADDL $0x02, CX
|
|
JMP candidate_match_encodeBlockAsm
|
|
|
|
candidate2_match_encodeBlockAsm:
|
|
// Match starts at s+1: still record s+2 in the table, then use R8 as the candidate.
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
// A 4-byte match was found: CX = match start in src, SI = candidate position.
// Extend it backwards, then verify the output still fits.
candidate_match_encodeBlockAsm:
|
|
// DI = pending-literal start; skip extension when the candidate is at index 0.
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm
|
|
|
|
match_extend_back_loop_encodeBlockAsm:
|
|
// Do not extend back past the first unemitted byte.
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBlockAsm
|
|
// Compare the bytes just before both positions; stop on mismatch.
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBlockAsm
|
|
// Step both positions back by one; stop when the candidate reaches index 0.
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm
|
|
JMP match_extend_back_loop_encodeBlockAsm
|
|
|
|
match_extend_back_end_encodeBlockAsm:
|
|
// DI = AX + (number of pending literal bytes) + 5; this must stay below the
// destination limit in (SP).
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 5(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBlockAsm
|
|
// Output would reach the limit: return 0.
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBlockAsm:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeBlockAsm
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBlockAsm
|
|
CMPL R8, $0x00010000
|
|
JLT three_bytes_match_emit_encodeBlockAsm
|
|
CMPL R8, $0x01000000
|
|
JLT four_bytes_match_emit_encodeBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL R8, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
four_bytes_match_emit_encodeBlockAsm:
|
|
MOVL R8, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (AX)
|
|
MOVW R8, 1(AX)
|
|
MOVB R10, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
three_bytes_match_emit_encodeBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
two_bytes_match_emit_encodeBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeBlockAsm
|
|
JMP memmove_long_match_emit_encodeBlockAsm
|
|
|
|
one_byte_match_emit_encodeBlockAsm:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBlockAsm:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeBlockAsm
|
|
|
|
memmove_long_match_emit_encodeBlockAsm:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
// Literals before the match have been emitted (or there were none).
emit_literal_done_match_emit_encodeBlockAsm:
|
|
// Entry point for a match with no preceding literals.
// CX = match start in src, SI = candidate position.
match_nolit_loop_encodeBlockAsm:
|
|
// New repeat offset = CX - SI, saved in 16(SP).
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
// Skip the 4 bytes already known to match on both sides.
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
// DI = bytes remaining in src after CX (32-bit arithmetic).
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
// R8 = &src[CX], SI = &src[candidate + 4].
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
// Counts into R10 how many leading bytes at R8 and SI are equal, bounded by DI.
|
|
XORL R10, R10
|
|
// Fewer than 8 bytes left: go straight to the byte-wise loop.
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeBlockAsm
|
|
|
|
matchlen_loopback_match_nolit_encodeBlockAsm:
|
|
// XOR 8 bytes from each side; a zero result means all 8 bytes match.
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeBlockAsm
|
|
// Lowest set bit index / 8 = number of matching bytes within this 8-byte word.
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeBlockAsm
|
|
|
|
matchlen_loop_match_nolit_encodeBlockAsm:
|
|
// All 8 bytes matched: remaining -= 8, length += 8, loop while >= 8 bytes remain.
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBlockAsm
|
|
|
|
matchlen_single_match_nolit_encodeBlockAsm:
|
|
// Nothing left to compare.
TESTL DI, DI
|
|
JZ match_nolit_end_encodeBlockAsm
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBlockAsm:
|
|
// Byte-wise tail comparison for the last < 8 bytes.
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeBlockAsm
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm
|
|
|
|
match_nolit_end_encodeBlockAsm:
|
|
// CX = end of match. SI = offset, R10 = total match length (extension + the
// initial 4 bytes). The pending-literal start in 12(SP) moves to the match end.
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL SI, $0x00010000
|
|
JL two_byte_offset_match_nolit_encodeBlockAsm
|
|
|
|
four_bytes_loop_back_match_nolit_encodeBlockAsm:
|
|
CMPL R10, $0x40
|
|
JLE four_bytes_remain_match_nolit_encodeBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL SI, 1(AX)
|
|
LEAL -64(R10), R10
|
|
ADDQ $0x05, AX
|
|
CMPL R10, $0x04
|
|
JL four_bytes_remain_match_nolit_encodeBlockAsm
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
|
|
MOVL R10, DI
|
|
LEAL -4(R10), R10
|
|
CMPL DI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
|
|
CMPL DI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
|
|
CMPL SI, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
|
|
CMPL R10, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
|
|
CMPL R10, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
|
|
CMPL R10, $0x0100ffff
|
|
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
|
|
LEAL -16842747(R10), R10
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
|
|
|
|
repeat_five_match_nolit_encodeBlockAsm_emit_copy:
|
|
LEAL -65536(R10), R10
|
|
MOVL R10, SI
|
|
MOVW $0x001d, (AX)
|
|
MOVW R10, 2(AX)
|
|
SARL $0x10, SI
|
|
MOVB SI, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_four_match_nolit_encodeBlockAsm_emit_copy:
|
|
LEAL -256(R10), R10
|
|
MOVW $0x0019, (AX)
|
|
MOVW R10, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_three_match_nolit_encodeBlockAsm_emit_copy:
|
|
LEAL -4(R10), R10
|
|
MOVW $0x0015, (AX)
|
|
MOVB R10, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_match_nolit_encodeBlockAsm_emit_copy:
|
|
SHLL $0x02, R10
|
|
ORL $0x01, R10
|
|
MOVW R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
|
|
XORQ DI, DI
|
|
LEAL 1(DI)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SARL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
|
|
|
|
four_bytes_remain_match_nolit_encodeBlockAsm:
|
|
TESTL R10, R10
|
|
JZ match_nolit_emitcopy_end_encodeBlockAsm
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
two_byte_offset_match_nolit_encodeBlockAsm:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
MOVL R10, DI
|
|
LEAL -4(R10), R10
|
|
CMPL DI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
|
|
CMPL DI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
|
|
CMPL SI, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
CMPL R10, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
|
|
CMPL R10, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
|
|
CMPL R10, $0x0100ffff
|
|
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
|
|
LEAL -16842747(R10), R10
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
|
|
|
|
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
LEAL -65536(R10), R10
|
|
MOVL R10, SI
|
|
MOVW $0x001d, (AX)
|
|
MOVW R10, 2(AX)
|
|
SARL $0x10, SI
|
|
MOVB SI, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
LEAL -256(R10), R10
|
|
MOVW $0x0019, (AX)
|
|
MOVW R10, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
LEAL -4(R10), R10
|
|
MOVW $0x0015, (AX)
|
|
MOVB R10, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
SHLL $0x02, R10
|
|
ORL $0x01, R10
|
|
MOVW R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
|
|
XORQ DI, DI
|
|
LEAL 1(DI)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SARL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
JMP two_byte_offset_match_nolit_encodeBlockAsm
|
|
|
|
two_byte_offset_short_match_nolit_encodeBlockAsm:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBlockAsm
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBlockAsm
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm
|
|
|
|
emit_copy_three_match_nolit_encodeBlockAsm:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
// The copy for the match has been written. CX = source index just past the match.
match_nolit_emitcopy_end_encodeBlockAsm:
|
|
// Past the search limit: emit whatever is left as literals.
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm
|
|
// DI = 8 source bytes starting at CX-2.
MOVQ -2(DX)(CX*1), DI
|
|
// Bail out with 0 if the write pointer has reached the destination limit.
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm:
|
|
// Same hash as in the search loop: ((x << 16) * R9) >> 50.
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
// R8 will hash the bytes at CX-2; DI >>= 16 leaves the bytes at CX, hashed via SI.
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x10, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x32, R8
|
|
SHLQ $0x10, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x32, SI
|
|
// R9 = CX-2, R10 = address of the table slot for the bytes at CX.
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
// SI = previous candidate for the bytes at CX; then record CX-2 and CX in the table.
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
// If the candidate's 4 bytes equal the 4 bytes at CX, encode another match
// immediately with no literals in between.
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeBlockAsm
|
|
// Otherwise resume the normal search at CX+1.
INCL CX
|
|
JMP search_loop_encodeBlockAsm
|
|
|
|
// Search finished: the bytes from the pending-literal start to the end of src
// remain to be emitted as literals. First check that they fit.
emit_remainder_encodeBlockAsm:
|
|
// CX = AX + (src_len - pending-literal start) + 5; this must stay below the
// destination limit in (SP).
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 5(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBlockAsm
|
|
// Output would reach the limit: return 0.
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBlockAsm
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBlockAsm
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBlockAsm
|
|
CMPL DX, $0x00010000
|
|
JLT three_bytes_emit_remainder_encodeBlockAsm
|
|
CMPL DX, $0x01000000
|
|
JLT four_bytes_emit_remainder_encodeBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL DX, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
four_bytes_emit_remainder_encodeBlockAsm:
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (AX)
|
|
MOVW DX, 1(AX)
|
|
MOVB BL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBlockAsm:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm4MB(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst
// (final AX minus dst_base). Returns 0 from any of the three size checks
// below when the output would reach the limit
// dst_base + len(src) - 9 - len(src)/32.
//
// Frame layout (65560 bytes of locals):
//   0(SP)   8 bytes      dst limit pointer used by the size checks
//   8(SP)   4 bytes      len(src) - 8, upper bound for the search position
//   12(SP)  4 bytes      nextEmit: index of the first src byte not yet written
//   16(SP)  4 bytes      offset of the most recent match (initialised to 1)
//   20(SP)  4 bytes      position to resume at when no candidate matches
//   24(SP)  65536 bytes  hash table: 16384 uint32 src indexes
//
// Long-lived registers: AX = dst write pointer, CX = current src index,
// DX = src base pointer (DX and CX are reused as scratch once
// emit_remainder_ok is reached).
TEXT ·encodeBlockAsm4MB(SB), $65560-56
	MOVQ dst_base+0(FP), AX

	// Clear the hash table: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBlockAsm4MB

	// nextEmit = 0; 8(SP) = len(src)-8; (SP) = dst + len(src) - 9 - len(src)>>5;
	// repeat offset = 1; CX (current index) = 1; DX = src base.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBlockAsm4MB:
	// Next position to try = CX + 4 + (CX - nextEmit)>>6; stop searching once
	// it reaches the limit stored in 8(SP).
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x06, SI
	LEAL 4(CX)(SI*1), SI
	CMPL SI, 8(SP)
	JGE emit_remainder_encodeBlockAsm4MB
	MOVQ (DX)(CX*1), DI
	MOVL SI, 20(SP)

	// Hash the 6 low bytes at CX (R10) and at CX+1 (R11): SHLQ $0x10 drops the
	// two high bytes, SHRQ $0x32 keeps the top 14 bits of the product.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10
	SHLQ $0x10, R11
	IMULQ R9, R11
	SHRQ $0x32, R11

	// Fetch the previous table entries (SI, R8) and store CX and CX+1.
	MOVL 24(SP)(R10*4), SI
	MOVL 24(SP)(R11*4), R8
	MOVL CX, 24(SP)(R10*4)
	LEAL 1(CX), R10
	MOVL R10, 24(SP)(R11*4)

	// R10 = hash of the bytes at CX+2, used by the later candidate checks.
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10

	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes one repeat
	// offset earlier.
	MOVL CX, R9
	SUBL 16(SP), R9
	MOVL 1(DX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeBlockAsm4MB

	// DI = CX+1 (start of the repeat), R8 = nextEmit, SI = DI - repeat offset.
	LEAL 1(CX), DI
	MOVL 12(SP), R8
	MOVL DI, SI
	SUBL 16(SP), SI
	JZ repeat_extend_back_end_encodeBlockAsm4MB

repeat_extend_back_loop_encodeBlockAsm4MB:
	// Move the start backwards while the preceding bytes agree, without going
	// below nextEmit (R8) or past index 0 of the earlier copy (SI).
	CMPL DI, R8
	JLE repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DI), DI
	DECL SI
	JNZ repeat_extend_back_loop_encodeBlockAsm4MB

repeat_extend_back_end_encodeBlockAsm4MB:
	// Write src[nextEmit:DI] as a literal run: R10 = source pointer,
	// R9 = length, SI = length-1 (selects the header size).
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00000100
	JLT two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00010000
	JLT three_bytes_repeat_emit_encodeBlockAsm4MB

	// Header 0xf8 followed by three bytes of length-1.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

three_bytes_repeat_emit_encodeBlockAsm4MB:
	// Header 0xf4 followed by two bytes of length-1.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

two_bytes_repeat_emit_encodeBlockAsm4MB:
	// Header 0xf0 followed by one byte of length-1.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL memmove_repeat_emit_encodeBlockAsm4MB
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

one_byte_repeat_emit_encodeBlockAsm4MB:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ SI, AX
	JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB

memmove_long_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0-X3) and stored
	// last; the middle is copied 32 bytes at a time with MOVOA stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R12
	SHRQ $0x05, R12
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(R10)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R9, R13
	JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	// Skip the 1+4 bytes already known to match, then extend forward:
	// R10 = src+CX, SI = src+CX-repeat offset, R9 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 counts matching bytes: 8 at a time while at least 8 remain, then
	// byte by byte.
	XORL R12, R12
	CMPL R9, $0x08
	JL matchlen_single_repeat_extend_encodeBlockAsm4MB

matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	TESTQ R11, R11
	JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB

	// First differing bit index / 8 = number of equal leading bytes.
	BSFQ R11, R11
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_loop_repeat_extend_encodeBlockAsm4MB:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB

matchlen_single_repeat_extend_encodeBlockAsm4MB:
	TESTL R9, R9
	JZ repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R12), R12
	DECL R9
	JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB

repeat_extend_forward_end_encodeBlockAsm4MB:
	// SI = total match length (CX - start), DI = repeat offset. R8 still holds
	// the nextEmit value read before the literals were written; when it is 0
	// the match is written as a copy instead of a repeat.
	ADDL R12, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI
	TESTL R8, R8
	JZ repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB

cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL SI, $0x00000104
	JLT repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00010100
	JLT repeat_four_match_repeat_encodeBlockAsm4MB
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	// SI = match length, DI = offset. Offsets below 65536 use the two-byte
	// offset encodings.
	CMPL DI, $0x00010000
	JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// 5-byte element: 0xff + 32-bit offset, covering 64 bytes of the match.
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL SI, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00010100
	JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

	// Unreachable (follows an unconditional JMP); kept so the label above
	// stays referenced.
	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB

four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	TESTL SI, SI
	JZ repeat_end_emit_encodeBlockAsm4MB

	// 5-byte element: tag ((length-1)<<2 | 3) + 32-bit offset.
	MOVB $0x03, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB

	// 3-byte element: 0xee + 16-bit offset, covering 60 bytes of the match;
	// the rest is written as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL SI, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00010100
	JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

	// Unreachable (follows an unconditional JMP).
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	// Length < 12 and offset < 2048 fit the 2-byte element; everything else
	// uses the 3-byte element.
	CMPL SI, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm4MB

emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm4MB:
	// Everything up to CX has been written; resume searching there.
	MOVL CX, 12(SP)
	JMP search_loop_encodeBlockAsm4MB

no_repeat_found_encodeBlockAsm4MB:
	// Try the table candidates for CX (SI), CX+1 (R8) and CX+2 (slot R10) by
	// comparing 4 bytes; DI is shifted one byte per step.
	CMPL (DX)(SI*1), DI
	JEQ candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ candidate2_match_encodeBlockAsm4MB
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ candidate3_match_encodeBlockAsm4MB

	// No candidate matched: continue from the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeBlockAsm4MB

candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, CX
	JMP candidate_match_encodeBlockAsm4MB

candidate2_match_encodeBlockAsm4MB:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm4MB:
	// CX = match position, SI = candidate position, DI = nextEmit.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBlockAsm4MB

match_extend_back_loop_encodeBlockAsm4MB:
	// Step both positions backwards while the preceding bytes agree, stopping
	// at nextEmit or when the candidate reaches index 0.
	CMPL CX, DI
	JLE match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(CX), CX
	DECL SI
	JZ match_extend_back_end_encodeBlockAsm4MB
	JMP match_extend_back_loop_encodeBlockAsm4MB

match_extend_back_end_encodeBlockAsm4MB:
	// Size check: AX + pending literal bytes + 4 must stay below the dst
	// limit, otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 4(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm4MB:
	// Write src[nextEmit:CX] as a literal run: DI = source pointer,
	// R9 = length, R8 = length-1 (selects the header size).
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT one_byte_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00000100
	JLT two_bytes_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00010000
	JLT three_bytes_match_emit_encodeBlockAsm4MB
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_match_emit_encodeBlockAsm4MB

three_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeBlockAsm4MB

two_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL memmove_match_emit_encodeBlockAsm4MB
	JMP memmove_long_match_emit_encodeBlockAsm4MB

one_byte_match_emit_encodeBlockAsm4MB:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ R8, AX
	JMP emit_literal_done_match_emit_encodeBlockAsm4MB

memmove_long_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ R9, R11
	SHRQ $0x05, R11
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(DI)(R12*1), R10
	LEAQ -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R9, R12
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R8, AX

emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	// Record the new offset (CX - SI) in 16(SP), skip the 4 bytes already
	// compared and measure how much further the match runs:
	// R8 = src+CX, SI = src+candidate, DI = bytes left in src.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL matchlen_single_match_nolit_encodeBlockAsm4MB

matchlen_loopback_match_nolit_encodeBlockAsm4MB:
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
	BSFQ R9, R9
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP match_nolit_end_encodeBlockAsm4MB

matchlen_loop_match_nolit_encodeBlockAsm4MB:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB

matchlen_single_match_nolit_encodeBlockAsm4MB:
	TESTL DI, DI
	JZ match_nolit_end_encodeBlockAsm4MB

matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R10), R10
	DECL DI
	JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB

match_nolit_end_encodeBlockAsm4MB:
	// CX moves past the match; R10 = full match length (extension + 4),
	// SI = offset; nextEmit = CX.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	CMPL SI, $0x00010000
	JL two_byte_offset_match_nolit_encodeBlockAsm4MB

four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R10, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R10, $0x00010100
	JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

	// Unreachable (follows an unconditional JMP); kept so the label above
	// stays referenced.
	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB

four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	TESTL R10, R10
	JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
	MOVB $0x03, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R10, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R10, $0x00010100
	JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

	// Unreachable (follows an unconditional JMP).
	JMP two_byte_offset_match_nolit_encodeBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x0c
	JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

emit_copy_three_match_nolit_encodeBlockAsm4MB:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm4MB:
	// Past the search limit: flush the tail. Otherwise load the 8 bytes at
	// CX-2 and verify that dst still has room (return 0 if not).
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeBlockAsm4MB
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm4MB:
	// Store CX-2 under the hash of the bytes at CX-2 (R8), then look up and
	// replace the entry for the bytes at CX (SI). If that entry's 4 bytes
	// equal the 4 bytes at CX, start another match immediately.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, SI
	SHLQ $0x10, R8
	IMULQ R9, R8
	SHRQ $0x32, R8
	SHLQ $0x10, SI
	IMULQ R9, SI
	SHRQ $0x32, SI
	LEAL -2(CX), R9
	LEAQ 24(SP)(SI*4), R10
	MOVL (R10), SI
	MOVL R9, 24(SP)(R8*4)
	MOVL CX, (R10)
	CMPL (DX)(SI*1), DI
	JEQ match_nolit_loop_encodeBlockAsm4MB
	INCL CX
	JMP search_loop_encodeBlockAsm4MB

emit_remainder_encodeBlockAsm4MB:
	// Size check for the tail: AX + (len(src) - nextEmit) + 4 must stay below
	// the dst limit, otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm4MB:
	// Write src[nextEmit:] as a final literal run: CX = source pointer,
	// SI = length, DX = length-1 (selects the header size).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JLT three_bytes_emit_remainder_encodeBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

three_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

two_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeBlockAsm4MB
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

one_byte_emit_remainder_encodeBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The BX bytes being copied end exactly at src_base+src_len, so lengths
	// below 8 must not use an 8-byte load: it would read past the end of src.
	// BX is at least 1 here (the zero-length case branched away above).
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JEQ emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	// Two possibly overlapping 4-byte moves: first 4 and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: first 8 and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB

memmove_long_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
|
|
|
|
// func encodeBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst using a 4096-entry (12-bit) hash table kept on the
// stack and returns the number of bytes written. Returns 0 as soon as the
// output would reach the limit computed at entry:
//   dst_base + (src_len - 9) - (src_len >> 5)
//
// Frame layout:
//   0(SP)   dst limit pointer (see above)
//   8(SP)   sLimit   = src_len - 8; searching stops once the next probe
//                      position reaches it
//   12(SP)  nextEmit = index of the first src byte not yet emitted
//   16(SP)  repeat   = offset of the most recent match (starts at 1)
//   20(SP)  nextS    = position to resume from when no candidate matches
//   24(SP)  table    = 4096 x uint32 src positions (16384 bytes)
//
// Register roles in the main loop:
//   AX = dst write cursor, CX = current src index (s), DX = src_base.
// emit_remainder reuses CX, DX, SI and BX for the final literal copy.
//
// Hashing: (v << 24) * 0xcf1bbcdcbb >> 52, i.e. a 12-bit hash of the low
// five bytes of v.
//
// The short literal copies in the repeat/match paths move 8 bytes even for
// shorter literals. That is only done while nextS < src_len - 8, so the
// loads stay inside src. The final literal (emit_remainder) has no such
// slack and therefore uses exact-size moves.
//
// NOTE(review): this file is generated (see the DO NOT EDIT header). The
// exact-size remainder copy below is a hand-applied fix; carry it into the
// generator / vendored dependency as well or it will be lost on regeneration.
// NOTE(review): dst is presumably sized by the Go caller so that the small
// over-writes of the 8-byte short copies stay in bounds - confirm there.
TEXT ·encodeBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x80 iterations x 128 bytes = 16384 bytes.
zero_loop_encodeBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBlockAsm12B

	// nextEmit = 0, sLimit = src_len-8, dst limit, s = 1, repeat = 1.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBlockAsm12B:
	// nextS = s + 4 + ((s - nextEmit) >> 5); stop searching at sLimit.
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x05, SI
	LEAL 4(CX)(SI*1), SI
	CMPL SI, 8(SP)
	JGE emit_remainder_encodeBlockAsm12B

	// Load 8 bytes at s, hash s and s+1, fetch both candidates and store
	// s and s+1 in their table slots. R10 is left holding hash(s+2).
	MOVQ (DX)(CX*1), DI
	MOVL SI, 20(SP)
	MOVQ $0x000000cf1bbcdcbb, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x18, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	SHLQ $0x18, R11
	IMULQ R9, R11
	SHRQ $0x34, R11
	MOVL 24(SP)(R10*4), SI
	MOVL 24(SP)(R11*4), R8
	MOVL CX, 24(SP)(R10*4)
	LEAL 1(CX), R10
	MOVL R10, 24(SP)(R11*4)
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x18, R10
	IMULQ R9, R10
	SHRQ $0x34, R10

	// Repeat check: compare the 4 bytes at s+1 with those at s+1-repeat.
	MOVL CX, R9
	SUBL 16(SP), R9
	MOVL 1(DX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeBlockAsm12B

	// Repeat found. DI = base = s+1, R8 = nextEmit, SI = base - repeat.
	LEAL 1(CX), DI
	MOVL 12(SP), R8
	MOVL DI, SI
	SUBL 16(SP), SI
	JZ repeat_extend_back_end_encodeBlockAsm12B

	// Extend the repeat backwards while bytes match and base > nextEmit.
repeat_extend_back_loop_encodeBlockAsm12B:
	CMPL DI, R8
	JLE repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(DI), DI
	DECL SI
	JNZ repeat_extend_back_loop_encodeBlockAsm12B

repeat_extend_back_end_encodeBlockAsm12B:
	// Emit src[nextEmit:base] as a literal (skipped when empty).
	// R9 = literal length, R10 = literal source, SI = length - 1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT one_byte_repeat_emit_encodeBlockAsm12B
	CMPL SI, $0x00000100
	JLT two_bytes_repeat_emit_encodeBlockAsm12B

	// Three-byte literal header: 0xf4, then 16-bit (length-1).
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm12B

two_bytes_repeat_emit_encodeBlockAsm12B:
	// Two-byte literal header: 0xf0, then 8-bit (length-1).
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL memmove_repeat_emit_encodeBlockAsm12B
	JMP memmove_long_repeat_emit_encodeBlockAsm12B

one_byte_repeat_emit_encodeBlockAsm12B:
	// One-byte literal header: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm12B:
	// SI = dst cursor after the literal.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
	// Always moves 8 bytes, also for shorter literals.
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ SI, AX
	JMP emit_literal_done_repeat_emit_encodeBlockAsm12B

memmove_long_repeat_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Head (X0,X1) and tail (X2,X3) are loaded up front and stored last;
	// the loop in between stores 32 bytes at a time to 32-byte aligned
	// destinations (R13 starts at 64 - (AX & 31)).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R12
	SHRQ $0x05, R12
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(R10)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R9, R13
	JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// Skip the 4 bytes already compared (s += 5) and extend forwards.
	// R9 = bytes left in src, R10 = src+s, SI = src+s-repeat.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of equal bytes; 8 at a time, then byte by byte.
	XORL R12, R12
	CMPL R9, $0x08
	JL matchlen_single_repeat_extend_encodeBlockAsm12B

matchlen_loopback_repeat_extend_encodeBlockAsm12B:
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	TESTQ R11, R11
	JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
	BSFQ R11, R11
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP repeat_extend_forward_end_encodeBlockAsm12B

matchlen_loop_repeat_extend_encodeBlockAsm12B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B

matchlen_single_repeat_extend_encodeBlockAsm12B:
	TESTL R9, R9
	JZ repeat_extend_forward_end_encodeBlockAsm12B

matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R12), R12
	DECL R9
	JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B

repeat_extend_forward_end_encodeBlockAsm12B:
	// SI = match length (s - base), DI = repeat offset. When nextEmit was 0
	// on entry to this path (R8), the match is emitted as a copy instead.
	ADDL R12, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI
	TESTL R8, R8
	JZ repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE repeat_two_match_repeat_encodeBlockAsm12B
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JLT repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	CMPL SI, $0x00000104
	JLT repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_three_match_repeat_encodeBlockAsm12B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_two_match_repeat_encodeBlockAsm12B:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_match_repeat_encodeBlockAsm12B:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
	CMPL SI, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B

	// Long match: a 60-byte copy (0xee + 16-bit offset), rest as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm12B

	// Unreachable (follows an unconditional JMP); kept as generated.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	CMPL SI, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B

	// Two-byte copy: length < 12 and offset < 2048.
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm12B

emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	// Three-byte copy: tag byte followed by a 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm12B:
	// nextEmit = s, then keep searching.
	MOVL CX, 12(SP)
	JMP search_loop_encodeBlockAsm12B

no_repeat_found_encodeBlockAsm12B:
	// Try the candidates for s (SI), s+1 (R8) and s+2 (table[R10]); the
	// table slot for s+2 is updated with s+2 either way.
	CMPL (DX)(SI*1), DI
	JEQ candidate_match_encodeBlockAsm12B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ candidate2_match_encodeBlockAsm12B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ candidate3_match_encodeBlockAsm12B
	MOVL 20(SP), CX
	JMP search_loop_encodeBlockAsm12B

candidate3_match_encodeBlockAsm12B:
	ADDL $0x02, CX
	JMP candidate_match_encodeBlockAsm12B

candidate2_match_encodeBlockAsm12B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm12B:
	// CX = s, SI = candidate. Extend backwards while bytes match, s stays
	// above nextEmit and the candidate stays above 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBlockAsm12B

match_extend_back_loop_encodeBlockAsm12B:
	CMPL CX, DI
	JLE match_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE match_extend_back_end_encodeBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ match_extend_back_end_encodeBlockAsm12B
	JMP match_extend_back_loop_encodeBlockAsm12B

match_extend_back_end_encodeBlockAsm12B:
	// Give up (return 0) unless dst + 3 + pending literal length stays
	// below the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Emit src[nextEmit:s] as a literal (skipped when empty).
	// R9 = literal length, DI = literal source, R8 = length - 1.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT one_byte_match_emit_encodeBlockAsm12B
	CMPL R8, $0x00000100
	JLT two_bytes_match_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeBlockAsm12B

two_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL memmove_match_emit_encodeBlockAsm12B
	JMP memmove_long_match_emit_encodeBlockAsm12B

one_byte_match_emit_encodeBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm12B:
	// R8 = dst cursor after the literal.
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
	// Always moves 8 bytes, also for shorter literals.
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ R8, AX
	JMP emit_literal_done_match_emit_encodeBlockAsm12B

memmove_long_match_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Same scheme as the repeat path: head/tail stored last, aligned
	// 32-byte stores in between.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ R9, R11
	SHRQ $0x05, R11
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(DI)(R12*1), R10
	LEAQ -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R9, R12
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R8, AX

emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// repeat = s - candidate; skip the 4 matched bytes and extend forwards.
	// DI = bytes left in src, R8 = src+s, SI = src+candidate.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL matchlen_single_match_nolit_encodeBlockAsm12B

matchlen_loopback_match_nolit_encodeBlockAsm12B:
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_match_nolit_encodeBlockAsm12B
	BSFQ R9, R9
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP match_nolit_end_encodeBlockAsm12B

matchlen_loop_match_nolit_encodeBlockAsm12B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE matchlen_loopback_match_nolit_encodeBlockAsm12B

matchlen_single_match_nolit_encodeBlockAsm12B:
	TESTL DI, DI
	JZ match_nolit_end_encodeBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE match_nolit_end_encodeBlockAsm12B
	LEAL 1(R10), R10
	DECL DI
	JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B

match_nolit_end_encodeBlockAsm12B:
	// s += extension; R10 = total match length; SI = offset; nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBlockAsm12B:
	CMPL R10, $0x40
	JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B

	// Long match: a 60-byte copy (0xee + 16-bit offset), rest as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm12B

	// Unreachable (follows an unconditional JMP); kept as generated.
	JMP two_byte_offset_match_nolit_encodeBlockAsm12B

two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	CMPL R10, $0x0c
	JGE emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JGE emit_copy_three_match_nolit_encodeBlockAsm12B

	// Two-byte copy: length < 12 and offset < 2048.
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm12B

emit_copy_three_match_nolit_encodeBlockAsm12B:
	// Three-byte copy: tag byte followed by a 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm12B:
	// Past sLimit: only the trailing literal remains. Otherwise return 0 if
	// the dst cursor has reached the limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeBlockAsm12B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// Store s-2 under hash(src[s-2:]), fetch the candidate for s and store
	// s in its slot; on a 4-byte match go straight to the next copy.
	MOVQ $0x000000cf1bbcdcbb, R9
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, SI
	SHLQ $0x18, R8
	IMULQ R9, R8
	SHRQ $0x34, R8
	SHLQ $0x18, SI
	IMULQ R9, SI
	SHRQ $0x34, SI
	LEAL -2(CX), R9
	LEAQ 24(SP)(SI*4), R10
	MOVL (R10), SI
	MOVL R9, 24(SP)(R8*4)
	MOVL CX, (R10)
	CMPL (DX)(SI*1), DI
	JEQ match_nolit_loop_encodeBlockAsm12B
	INCL CX
	JMP search_loop_encodeBlockAsm12B

emit_remainder_encodeBlockAsm12B:
	// Return 0 unless dst + 3 + (src_len - nextEmit) stays below the limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Emit src[nextEmit:] as the final literal (skipped when empty).
	// SI = literal length, CX = literal source, DX = length - 1.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm12B

two_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeBlockAsm12B
	JMP memmove_long_emit_remainder_encodeBlockAsm12B

one_byte_emit_remainder_encodeBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm12B:
	// DX = dst cursor after the literal, BX = literal length (1..64).
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The literal ends exactly at src_len, so every load here must stay
	// inside [CX, CX+BX). An unconditional 8-byte move would read up to 7
	// bytes past src for a 1..7 byte remainder, hence the exact-size cases.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
	JEQ emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
	// Two possibly overlapping 4-byte moves: first four and last four.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm12B

memmove_long_emit_remainder_encodeBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as above: head/tail stored last, aligned 32-byte stores
	// in between.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm12B:
	// Return the number of bytes written: dst cursor - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
|
|
|
|
// func encodeBlockAsm10B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeBlockAsm10B(SB), $4120-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000020, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm10B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBlockAsm10B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBlockAsm10B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm10B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x36, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm10B
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), R8
|
|
MOVL DI, SI
|
|
SUBL 16(SP), SI
|
|
JZ repeat_extend_back_end_encodeBlockAsm10B
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm10B:
|
|
CMPL DI, R8
|
|
JLE repeat_extend_back_end_encodeBlockAsm10B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeBlockAsm10B
|
|
LEAL -1(DI), DI
|
|
DECL SI
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm10B
|
|
|
|
repeat_extend_back_end_encodeBlockAsm10B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeBlockAsm10B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm10B
|
|
|
|
two_bytes_repeat_emit_encodeBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeBlockAsm10B
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm10B
|
|
|
|
one_byte_repeat_emit_encodeBlockAsm10B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBlockAsm10B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
|
|
|
|
memmove_long_repeat_emit_encodeBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R11, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R13*1), R11
|
|
LEAQ -32(AX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R13*1), X4
|
|
MOVOU -16(R10)(R13*1), X5
|
|
MOVOA X4, -32(AX)(R13*1)
|
|
MOVOA X5, -16(AX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R9, R13
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeBlockAsm10B:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R9
|
|
SUBL CX, R9
|
|
LEAQ (DX)(CX*1), R10
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R9, $0x08
|
|
JL matchlen_single_repeat_extend_encodeBlockAsm10B
|
|
|
|
matchlen_loopback_repeat_extend_encodeBlockAsm10B:
|
|
MOVQ (R10)(R12*1), R11
|
|
XORQ (SI)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP repeat_extend_forward_end_encodeBlockAsm10B
|
|
|
|
matchlen_loop_repeat_extend_encodeBlockAsm10B:
|
|
LEAL -8(R9), R9
|
|
LEAL 8(R12), R12
|
|
CMPL R9, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
|
|
|
|
matchlen_single_repeat_extend_encodeBlockAsm10B:
|
|
TESTL R9, R9
|
|
JZ repeat_extend_forward_end_encodeBlockAsm10B
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
|
|
MOVB (R10)(R12*1), R11
|
|
CMPB (SI)(R12*1), R11
|
|
JNE repeat_extend_forward_end_encodeBlockAsm10B
|
|
LEAL 1(R12), R12
|
|
DECL R9
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm10B:
|
|
ADDL R12, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
TESTL R8, R8
|
|
JZ repeat_as_copy_encodeBlockAsm10B
|
|
|
|
// emitRepeat
|
|
MOVL SI, R8
|
|
LEAL -4(SI), SI
|
|
CMPL R8, $0x08
|
|
JLE repeat_two_match_repeat_encodeBlockAsm10B
|
|
CMPL R8, $0x0c
|
|
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
|
|
CMPL DI, $0x00000800
|
|
JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
|
|
|
|
cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_match_repeat_encodeBlockAsm10B
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm10B:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm10B:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_two_offset_match_repeat_encodeBlockAsm10B:
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_as_copy_encodeBlockAsm10B:
|
|
// emitCopy
|
|
two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL SI, R8
|
|
LEAL -4(SI), SI
|
|
CMPL R8, $0x08
|
|
JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
|
|
CMPL R8, $0x0c
|
|
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
|
|
CMPL DI, $0x00000800
|
|
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm10B
|
|
|
|
emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeBlockAsm10B:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm10B
|
|
|
|
no_repeat_found_encodeBlockAsm10B:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm10B
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm10B
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm10B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBlockAsm10B
|
|
|
|
candidate3_match_encodeBlockAsm10B:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeBlockAsm10B
|
|
|
|
candidate2_match_encodeBlockAsm10B:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm10B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm10B
|
|
|
|
match_extend_back_loop_encodeBlockAsm10B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBlockAsm10B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBlockAsm10B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm10B
|
|
JMP match_extend_back_loop_encodeBlockAsm10B
|
|
|
|
match_extend_back_end_encodeBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBlockAsm10B:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeBlockAsm10B
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBlockAsm10B
|
|
|
|
two_bytes_match_emit_encodeBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeBlockAsm10B
|
|
JMP memmove_long_match_emit_encodeBlockAsm10B
|
|
|
|
one_byte_match_emit_encodeBlockAsm10B:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBlockAsm10B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm10B:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeBlockAsm10B
|
|
|
|
memmove_long_match_emit_encodeBlockAsm10B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeBlockAsm10B:
|
|
match_nolit_loop_encodeBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeBlockAsm10B
|
|
|
|
matchlen_loopback_match_nolit_encodeBlockAsm10B:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeBlockAsm10B
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeBlockAsm10B
|
|
|
|
matchlen_loop_match_nolit_encodeBlockAsm10B:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
|
|
|
|
matchlen_single_match_nolit_encodeBlockAsm10B:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeBlockAsm10B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeBlockAsm10B
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B
|
|
|
|
match_nolit_end_encodeBlockAsm10B:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeBlockAsm10B:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL R10, DI
|
|
LEAL -4(R10), R10
|
|
CMPL DI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
|
|
CMPL DI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
|
|
CMPL SI, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
|
|
CMPL R10, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
|
|
LEAL -256(R10), R10
|
|
MOVW $0x0019, (AX)
|
|
MOVW R10, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
|
|
|
|
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
|
|
LEAL -4(R10), R10
|
|
MOVW $0x0015, (AX)
|
|
MOVB R10, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
|
|
|
|
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
|
|
SHLL $0x02, R10
|
|
ORL $0x01, R10
|
|
MOVW R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
|
|
|
|
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
|
|
XORQ DI, DI
|
|
LEAL 1(DI)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SARL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
|
|
JMP two_byte_offset_match_nolit_encodeBlockAsm10B
|
|
|
|
two_byte_offset_short_match_nolit_encodeBlockAsm10B:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
|
|
|
|
emit_copy_three_match_nolit_encodeBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm10B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm10B
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm10B:
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x20, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x36, R8
|
|
SHLQ $0x20, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x36, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeBlockAsm10B
|
|
INCL CX
|
|
JMP search_loop_encodeBlockAsm10B
|
|
|
|
emit_remainder_encodeBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBlockAsm10B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm10B
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBlockAsm10B
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm10B
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm10B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm10B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBlockAsm10B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBlockAsm8B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeBlockAsm8B(SB), $1048-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000008, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBlockAsm8B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBlockAsm8B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBlockAsm8B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x04, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm8B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x38, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x38, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x38, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeBlockAsm8B
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), R8
|
|
MOVL DI, SI
|
|
SUBL 16(SP), SI
|
|
JZ repeat_extend_back_end_encodeBlockAsm8B
|
|
|
|
repeat_extend_back_loop_encodeBlockAsm8B:
|
|
CMPL DI, R8
|
|
JLE repeat_extend_back_end_encodeBlockAsm8B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeBlockAsm8B
|
|
LEAL -1(DI), DI
|
|
DECL SI
|
|
JNZ repeat_extend_back_loop_encodeBlockAsm8B
|
|
|
|
repeat_extend_back_end_encodeBlockAsm8B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeBlockAsm8B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm8B
|
|
|
|
two_bytes_repeat_emit_encodeBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeBlockAsm8B
|
|
JMP memmove_long_repeat_emit_encodeBlockAsm8B
|
|
|
|
one_byte_repeat_emit_encodeBlockAsm8B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeBlockAsm8B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeBlockAsm8B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
|
|
|
|
memmove_long_repeat_emit_encodeBlockAsm8B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R12
|
|
SHRQ $0x05, R12
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R13
|
|
SUBQ R11, R13
|
|
DECQ R12
|
|
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R13*1), R11
|
|
LEAQ -32(AX)(R13*1), R14
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R14)
|
|
MOVOA X5, 16(R14)
|
|
ADDQ $0x20, R14
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R13
|
|
DECQ R12
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R13*1), X4
|
|
MOVOU -16(R10)(R13*1), X5
|
|
MOVOA X4, -32(AX)(R13*1)
|
|
MOVOA X5, -16(AX)(R13*1)
|
|
ADDQ $0x20, R13
|
|
CMPQ R9, R13
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeBlockAsm8B:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R9
|
|
SUBL CX, R9
|
|
LEAQ (DX)(CX*1), R10
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R9, $0x08
|
|
JL matchlen_single_repeat_extend_encodeBlockAsm8B
|
|
|
|
matchlen_loopback_repeat_extend_encodeBlockAsm8B:
|
|
MOVQ (R10)(R12*1), R11
|
|
XORQ (SI)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP repeat_extend_forward_end_encodeBlockAsm8B
|
|
|
|
matchlen_loop_repeat_extend_encodeBlockAsm8B:
|
|
LEAL -8(R9), R9
|
|
LEAL 8(R12), R12
|
|
CMPL R9, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
|
|
|
|
matchlen_single_repeat_extend_encodeBlockAsm8B:
|
|
TESTL R9, R9
|
|
JZ repeat_extend_forward_end_encodeBlockAsm8B
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
|
|
MOVB (R10)(R12*1), R11
|
|
CMPB (SI)(R12*1), R11
|
|
JNE repeat_extend_forward_end_encodeBlockAsm8B
|
|
LEAL 1(R12), R12
|
|
DECL R9
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
|
|
|
|
repeat_extend_forward_end_encodeBlockAsm8B:
|
|
ADDL R12, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
TESTL R8, R8
|
|
JZ repeat_as_copy_encodeBlockAsm8B
|
|
|
|
// emitRepeat
|
|
MOVL SI, DI
|
|
LEAL -4(SI), SI
|
|
CMPL DI, $0x08
|
|
JLE repeat_two_match_repeat_encodeBlockAsm8B
|
|
CMPL DI, $0x0c
|
|
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
|
|
|
|
cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_match_repeat_encodeBlockAsm8B
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
repeat_three_match_repeat_encodeBlockAsm8B:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
repeat_two_match_repeat_encodeBlockAsm8B:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
repeat_as_copy_encodeBlockAsm8B:
|
|
// emitCopy
|
|
two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL SI, DI
|
|
LEAL -4(SI), SI
|
|
CMPL DI, $0x08
|
|
JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
|
|
CMPL DI, $0x0c
|
|
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
|
|
CMPL SI, $0x00000104
|
|
JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
|
|
LEAL -256(SI), SI
|
|
MOVW $0x0019, (AX)
|
|
MOVW SI, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
|
|
LEAL -4(SI), SI
|
|
MOVW $0x0015, (AX)
|
|
MOVB SI, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
|
|
SHLL $0x02, SI
|
|
ORL $0x01, SI
|
|
MOVW SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
XORQ R8, R8
|
|
LEAL 1(R8)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SARL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeBlockAsm8B
|
|
|
|
emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeBlockAsm8B:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeBlockAsm8B
|
|
|
|
no_repeat_found_encodeBlockAsm8B:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBlockAsm8B
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeBlockAsm8B
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeBlockAsm8B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBlockAsm8B
|
|
|
|
candidate3_match_encodeBlockAsm8B:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeBlockAsm8B
|
|
|
|
candidate2_match_encodeBlockAsm8B:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBlockAsm8B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBlockAsm8B
|
|
|
|
match_extend_back_loop_encodeBlockAsm8B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBlockAsm8B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBlockAsm8B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBlockAsm8B
|
|
JMP match_extend_back_loop_encodeBlockAsm8B
|
|
|
|
match_extend_back_end_encodeBlockAsm8B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBlockAsm8B:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeBlockAsm8B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeBlockAsm8B
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBlockAsm8B
|
|
|
|
two_bytes_match_emit_encodeBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeBlockAsm8B
|
|
JMP memmove_long_match_emit_encodeBlockAsm8B
|
|
|
|
one_byte_match_emit_encodeBlockAsm8B:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBlockAsm8B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBlockAsm8B:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeBlockAsm8B
|
|
|
|
memmove_long_match_emit_encodeBlockAsm8B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeBlockAsm8B:
|
|
match_nolit_loop_encodeBlockAsm8B:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeBlockAsm8B
|
|
|
|
matchlen_loopback_match_nolit_encodeBlockAsm8B:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeBlockAsm8B
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeBlockAsm8B
|
|
|
|
matchlen_loop_match_nolit_encodeBlockAsm8B:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
|
|
|
|
matchlen_single_match_nolit_encodeBlockAsm8B:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeBlockAsm8B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeBlockAsm8B
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B
|
|
|
|
match_nolit_end_encodeBlockAsm8B:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeBlockAsm8B:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL R10, SI
|
|
LEAL -4(R10), R10
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
|
|
CMPL R10, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
|
|
LEAL -256(R10), R10
|
|
MOVW $0x0019, (AX)
|
|
MOVW R10, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
|
|
|
|
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
|
|
LEAL -4(R10), R10
|
|
MOVW $0x0015, (AX)
|
|
MOVB R10, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
|
|
|
|
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
|
|
SHLL $0x02, R10
|
|
ORL $0x01, R10
|
|
MOVW R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
|
|
XORQ DI, DI
|
|
LEAL 1(DI)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SARL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
|
|
JMP two_byte_offset_match_nolit_encodeBlockAsm8B
|
|
|
|
two_byte_offset_short_match_nolit_encodeBlockAsm8B:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBlockAsm8B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
|
|
|
|
emit_copy_three_match_nolit_encodeBlockAsm8B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeBlockAsm8B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBlockAsm8B
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBlockAsm8B:
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x20, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x38, R8
|
|
SHLQ $0x20, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x38, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeBlockAsm8B
|
|
INCL CX
|
|
JMP search_loop_encodeBlockAsm8B
|
|
|
|
emit_remainder_encodeBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBlockAsm8B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm8B
|
|
|
|
two_bytes_emit_remainder_encodeBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBlockAsm8B
|
|
JMP memmove_long_emit_remainder_encodeBlockAsm8B
|
|
|
|
one_byte_emit_remainder_encodeBlockAsm8B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBlockAsm8B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBlockAsm8B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
|
|
|
|
memmove_long_emit_remainder_encodeBlockAsm8B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBlockAsm8B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeBetterBlockAsm(SB), $327704-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000a00, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBetterBlockAsm
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -6(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
CMPL SI, $0x63
|
|
JLE check_maxskip_ok_encodeBetterBlockAsm
|
|
LEAL 100(CX), SI
|
|
JMP check_maxskip_cont_encodeBetterBlockAsm
|
|
|
|
check_maxskip_ok_encodeBetterBlockAsm:
|
|
LEAL 1(CX)(SI*1), SI
|
|
|
|
check_maxskip_cont_encodeBetterBlockAsm:
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x32, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 262168(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 262168(SP)(R11*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
candidateS_match_encodeBetterBlockAsm:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBetterBlockAsm
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 5(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeBetterBlockAsm
|
|
|
|
matchlen_loopback_match_nolit_encodeBetterBlockAsm:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeBetterBlockAsm
|
|
|
|
matchlen_loop_match_nolit_encodeBetterBlockAsm:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
|
|
|
|
matchlen_single_match_nolit_encodeBetterBlockAsm:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeBetterBlockAsm
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeBetterBlockAsm
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
|
|
|
|
match_nolit_end_encodeBetterBlockAsm:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
|
|
CMPL 16(SP), R8
|
|
JEQ match_is_repeat_encodeBetterBlockAsm
|
|
CMPL R12, $0x01
|
|
JG match_length_ok_encodeBetterBlockAsm
|
|
CMPL R8, $0x0000ffff
|
|
JLE match_length_ok_encodeBetterBlockAsm
|
|
MOVL 20(SP), CX
|
|
INCL CX
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
match_length_ok_encodeBetterBlockAsm:
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeBetterBlockAsm
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBetterBlockAsm
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_match_emit_encodeBetterBlockAsm
|
|
CMPL SI, $0x01000000
|
|
JLT four_bytes_match_emit_encodeBetterBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
four_bytes_match_emit_encodeBetterBlockAsm:
|
|
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
three_bytes_match_emit_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeBetterBlockAsm
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBetterBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeBetterBlockAsm
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeBetterBlockAsm:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL R8, $0x00010000
|
|
JL two_byte_offset_match_nolit_encodeBetterBlockAsm
|
|
|
|
four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R12, $0x40
|
|
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL R8, 1(AX)
|
|
LEAL -64(R12), R12
|
|
ADDQ $0x05, AX
|
|
CMPL R12, $0x04
|
|
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
CMPL R12, $0x0100ffff
|
|
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
LEAL -16842747(R12), R12
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
|
|
|
|
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
|
|
|
|
four_bytes_remain_match_nolit_encodeBetterBlockAsm:
|
|
TESTL R12, R12
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVL R8, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
CMPL R12, $0x0100ffff
|
|
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
LEAL -16842747(R12), R12
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
|
|
|
|
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
|
|
|
|
two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
emit_copy_three_match_nolit_encodeBetterBlockAsm:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
match_is_repeat_encodeBetterBlockAsm:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
|
|
CMPL SI, $0x01000000
|
|
JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
four_bytes_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_repeat_encodeBetterBlockAsm
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_repeat_encodeBetterBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitRepeat
|
|
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
|
|
|
|
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
|
|
CMPL R12, $0x0100ffff
|
|
JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
|
|
LEAL -16842747(R12), R12
|
|
MOVW $0x001d, (AX)
|
|
MOVW $0xfffb, 2(AX)
|
|
MOVB $0xff, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
|
|
|
|
repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
|
|
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
|
|
|
|
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm:
|
|
MOVQ $0x00cf1bbcdcbfa563, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x32, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 262168(SP)(R11*4)
|
|
MOVL R15, 262168(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 262168(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeBetterBlockAsm
|
|
|
|
emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 5(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBetterBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBetterBlockAsm
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBetterBlockAsm
|
|
CMPL DX, $0x00010000
|
|
JLT three_bytes_emit_remainder_encodeBetterBlockAsm
|
|
CMPL DX, $0x01000000
|
|
JLT four_bytes_emit_remainder_encodeBetterBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL DX, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
four_bytes_emit_remainder_encodeBetterBlockAsm:
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (AX)
|
|
MOVW DX, 1(AX)
|
|
MOVB BL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBetterBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBetterBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x04
|
|
JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
|
|
CMPQ BX, $0x08
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
|
|
MOVL (CX), SI
|
|
MOVL SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
|
|
MOVL (CX), SI
|
|
MOVL -4(CX)(BX*1), CX
|
|
MOVL SI, (AX)
|
|
MOVL CX, -4(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
//
// Encodes src into dst as a stream of literal, copy and repeat operations and
// returns the number of bytes written to dst. Returns 0 as soon as the output
// would reach the destination limit computed in the prologue.
// NOTE(review): the tag bytes look like the S2 block format (the generator is
// run with -pkg=s2) - confirm against the Go reference encoder.
//
// Register roles for most of the function:
//   AX = dst write cursor, DX = src base pointer, CX = current src position.
// Stack frame (327704 bytes of locals, 56 bytes of arguments and result):
//   0(SP)       dst limit pointer (8 bytes)
//   8(SP)       src_len-8, position limit for the search loop
//   12(SP)      src position of the first literal byte not yet emitted
//   16(SP)      offset of the last match emitted as a copy (the repeat offset)
//   20(SP)      next search position
//   24(SP)      65536 4-byte positions, indexed by a 16-bit hash of 7 bytes
//   262168(SP)  16384 4-byte positions, indexed by a 14-bit hash of 4 bytes
TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
|
|
MOVQ dst_base+0(FP), AX
|
|
// Zero both tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
MOVQ $0x00000a00, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm4MB:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBetterBlockAsm4MB
|
|
// No literals pending yet.
MOVL $0x00000000, 12(SP)
|
|
// 8(SP) = src_len-8; (SP) = dst_base + src_len - 6 - src_len/32.
MOVQ src_len+32(FP), CX
|
|
LEAQ -6(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
// Start searching at position 1 with repeat offset 0; DX = src base from here on.
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
// Main search loop. The next position is CX + min(1 + ((CX - 12(SP)) >> 7), 100),
// so the step grows with the number of bytes scanned since the last emit.
search_loop_encodeBetterBlockAsm4MB:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
CMPL SI, $0x63
|
|
JLE check_maxskip_ok_encodeBetterBlockAsm4MB
|
|
LEAL 100(CX), SI
|
|
JMP check_maxskip_cont_encodeBetterBlockAsm4MB
|
|
|
|
check_maxskip_ok_encodeBetterBlockAsm4MB:
|
|
LEAL 1(CX)(SI*1), SI
|
|
|
|
check_maxskip_cont_encodeBetterBlockAsm4MB:
|
|
// Stop searching once the next position reaches src_len-8.
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm4MB
|
|
// DI = 8 source bytes at CX; 20(SP) = next position.
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
// R10 = top 16 bits of (DI<<8)*R9: hash of the low 7 bytes of DI.
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
// R11 = top 14 bits of (DI<<32)*SI: hash of the low 4 bytes of DI.
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x32, R11
|
|
// Load both candidates (SI from the 7-byte table, R8 from the 4-byte table),
// then record CX in both tables.
MOVL 24(SP)(R10*4), SI
|
|
MOVL 262168(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 262168(SP)(R11*4)
|
|
// A candidate is accepted when its first 4 bytes equal the low 4 bytes of DI.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm4MB
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm4MB
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBetterBlockAsm4MB
|
|
|
|
// The 4-byte-hash candidate (R8) matched. First try the 7-byte table for the
// bytes at CX+1: if that entry matches 4 bytes there, use it (CX stays
// incremented); otherwise undo the increment and fall back to R8.
candidateS_match_encodeBetterBlockAsm4MB:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm4MB
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
// SI = candidate position, CX = current position, DI = 12(SP).
// Extend the match backwards while CX > DI, SI != 0 and the preceding bytes agree.
candidate_match_encodeBetterBlockAsm4MB:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm4MB
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm4MB:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBetterBlockAsm4MB
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm4MB
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm4MB
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm4MB
|
|
|
|
// Return 0 unless AX + 4 + (CX - 12(SP)) is still below the dst limit.
match_extend_back_end_encodeBetterBlockAsm4MB:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 4(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBetterBlockAsm4MB
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// DI = match start. Skip the 4 bytes already compared; R8 = bytes left in src,
// R9 and R10 point at the current and the candidate data.
match_dst_size_check_encodeBetterBlockAsm4MB:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen: R12 = number of further matching bytes. Compares 8 bytes per
// iteration; on a difference BSFQ finds the lowest differing bit and the
// shift by 3 turns that bit index into a byte count.
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeBetterBlockAsm4MB
|
|
|
|
matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// Fewer than 8 bytes left: compare one byte at a time.
matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeBetterBlockAsm4MB
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeBetterBlockAsm4MB
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// R8 = match offset (CX - SI).
match_nolit_end_encodeBetterBlockAsm4MB:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat: same offset as stored in 16(SP).
CMPL 16(SP), R8
|
|
JEQ match_is_repeat_encodeBetterBlockAsm4MB
|
|
// Reject matches with R12 <= 1 whose offset is above 65535 and resume the
// search at 20(SP)+1.
CMPL R12, $0x01
|
|
JG match_length_ok_encodeBetterBlockAsm4MB
|
|
CMPL R8, $0x0000ffff
|
|
JLE match_length_ok_encodeBetterBlockAsm4MB
|
|
MOVL 20(SP), CX
|
|
INCL CX
|
|
JMP search_loop_encodeBetterBlockAsm4MB
|
|
|
|
// Store the new repeat offset, then emit src[12(SP):DI] as a literal when it is
// non-empty. R9 = literal length, R10 = literal source, SI = length-1.
match_length_ok_encodeBetterBlockAsm4MB:
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
// Literal header size depends on length-1: <60, <256, <65536, otherwise 3 length bytes.
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeBetterBlockAsm4MB
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
|
|
// Tag 0xf8 followed by the low 3 bytes of length-1.
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
// Tag 0xf4 followed by length-1 as 2 bytes.
three_bytes_match_emit_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
// Tag 0xf0 followed by length-1 as 1 byte; lengths up to 64 use the short copy.
two_bytes_match_emit_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeBetterBlockAsm4MB
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
// Single tag byte: (length-1) << 2.
one_byte_match_emit_encodeBetterBlockAsm4MB:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
// SI = dst cursor after the literal bytes.
memmove_match_emit_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort: copy R9 bytes (at most 64 on this path) from R10 to AX by
// moving the first and the last chunk of each size class, which may overlap.
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
|
|
|
|
// Lengths of 4 or less always move 4 bytes.
// NOTE(review): this touches up to 3 bytes beyond the literal on both sides;
// presumably covered by the limits set in the prologue - confirm.
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
|
|
|
|
// Long literal copy. SI = dst cursor after the literal bytes.
memmove_long_match_emit_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong: X0-X3 keep the first and last 32 source bytes and are stored
// after the loops. R13 = length/32, R14 = 64 - (AX & 31), which makes
// AX + R14 - 32 a multiple of 32 so the loops can use aligned MOVOA stores.
// NOTE(review): DECQ does not write CF, so the JA/JNA below depend on ZF from
// DECQ plus the carry left by the preceding SUBQ/ADDQ - confirm with the generator.
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
|
|
|
|
// Copy 32 bytes per iteration while R9 >= R14 after the increment.
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
// Store the saved head and tail, then advance the dst cursor.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
// CX = end of match, R12 = total match length (extension + 4), 12(SP) = CX.
emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy: R8 = offset, R12 = length. Offsets below 65536 take the 2-byte
// offset path; larger ones are written with a 4-byte offset.
CMPL R8, $0x00010000
|
|
JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// Length above 64: write tag 0xff + 4-byte offset, which accounts for 64 bytes.
// The rest goes to emitRepeat, or to four_bytes_remain when fewer than 4 bytes are left.
four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
|
|
CMPL R12, $0x40
|
|
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
|
|
MOVB $0xff, (AX)
|
|
MOVL R8, 1(AX)
|
|
LEAL -64(R12), R12
|
|
ADDQ $0x05, AX
|
|
CMPL R12, $0x04
|
|
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// emitRepeat: SI = length, R12 = length-4.
// length <= 8 uses the 2-byte form; length < 12 with offset < 2048 uses the
// 2-byte form that also carries the offset; otherwise 3, 4 or 5 bytes.
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
|
|
// 5-byte form: word 0x001d, then the low 3 bytes of R12-65536.
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// 4-byte form: word 0x0019, then R12-256 as 2 bytes.
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// 3-byte form: word 0x0015, then R12-4 as 1 byte.
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// 2-byte form: word (R12 << 2) | 1.
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// 2-byte form with offset: tag = 1 + R12*4 + ((offset >> 8) << 5), then the
// low offset byte.
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
// NOTE(review): unreachable - it directly follows an unconditional JMP.
JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// Remaining length with a 4-byte offset: nothing to do when R12 is 0, otherwise
// tag = 3 + (length-1)*4 (only the low byte is stored) followed by the offset.
four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
|
|
TESTL R12, R12
|
|
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVL R8, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// Offset below 65536. Length above 64: write tag 0xee + 2-byte offset, which
// accounts for 60 bytes, and encode the rest with emitRepeat.
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat: same selection of the 2/3/4/5-byte forms as in the 4-byte-offset
// path above (SI = length, R12 = length-4).
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
|
|
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
// NOTE(review): unreachable - it directly follows an unconditional JMP.
JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
|
|
|
|
// Length of at most 64 with a 2-byte offset. Length < 12 and offset < 2048 fit
// a 2-byte copy: tag = 1 + (length-4)*4 + ((offset >> 8) << 5), then the low
// offset byte. Everything else uses the 3-byte copy.
two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// 3-byte copy: tag = 2 + (length-1)*4 (low byte stored), then the 2-byte offset.
emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
// The match offset equals 16(SP): emit the pending literal src[12(SP):DI] with
// the same header and copy logic as in match_length_ok, then a repeat.
match_is_repeat_encodeBetterBlockAsm4MB:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort: copy R9 bytes (at most 64 on this path) from R10 to AX.
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong: head and tail are kept in X0-X3 and stored last; the loops in
// between write 32 bytes at a time to 32-byte aligned dst addresses.
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
// CX = end of match, R12 = total match length (extension + 4), 12(SP) = CX.
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitRepeat: SI = length, R12 = length-4; picks the 2/3/4/5-byte form.
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
|
|
|
|
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
|
|
CMPL R12, $0x00010100
|
|
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
|
|
LEAL -65536(R12), R12
|
|
MOVL R12, R8
|
|
MOVW $0x001d, (AX)
|
|
MOVW R12, 2(AX)
|
|
SARL $0x10, R8
|
|
MOVB R8, 4(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
|
|
|
|
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
|
|
// After every copy/repeat: finish with the remainder once CX reached 8(SP),
// and return 0 if the dst cursor reached the dst limit.
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm4MB
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBetterBlockAsm4MB
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Index extra positions before resuming the search. With DI = match start + 1:
// the 7-byte table gets DI and DI+1, the 4-byte table gets DI+1 and DI+2.
// From the end of the match: the 7-byte table gets CX-2 and CX-1, the 4-byte
// table gets CX-1.
match_nolit_dst_ok_encodeBetterBlockAsm4MB:
|
|
MOVQ $0x00cf1bbcdcbfa563, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
// R9 is reloaded with the 8 bytes at CX-2 for the second group of inserts.
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x32, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 262168(SP)(R11*4)
|
|
MOVL R15, 262168(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 262168(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeBetterBlockAsm4MB
|
|
|
|
// Emit src[12(SP):src_len] as the final literal. Return 0 unless
// AX + 4 + (src_len - 12(SP)) is still below the dst limit.
emit_remainder_encodeBetterBlockAsm4MB:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 4(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBetterBlockAsm4MB
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// BX = literal start, SI = literal length, CX = literal source pointer,
// DX = length-1 (DX stops being the src base here). Nothing is emitted when
// 12(SP) already equals src_len.
emit_remainder_ok_encodeBetterBlockAsm4MB:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
// Same literal headers as for match literals: 1, 2, 3 or 4 header bytes.
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB
|
|
CMPL DX, $0x00010000
|
|
JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (AX)
|
|
MOVW DX, 1(AX)
|
|
MOVB BL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBetterBlockAsm4MB
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm4MB:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
// DX = dst cursor after the literal bytes, BX = literal length.
memmove_emit_remainder_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort: copy BX bytes (at most 64 on this path) from CX to AX.
CMPQ BX, $0x04
|
|
JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
|
|
CMPQ BX, $0x08
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
|
|
MOVL (CX), SI
|
|
MOVL SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
|
|
MOVL (CX), SI
|
|
MOVL -4(CX)(BX*1), CX
|
|
MOVL SI, (AX)
|
|
MOVL CX, -4(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
|
|
|
|
// Long copy of the final literal. DX = dst cursor after it, BX = its length.
memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong: head and tail are kept in X0-X3 and stored last; DI = BX/32,
// R8 = 64 - (AX & 31) so that the MOVOA stores hit 32-byte aligned addresses.
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
// Result = number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000280, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm12B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBetterBlockAsm12B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -6(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm12B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm12B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x34, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 65560(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 65560(SP)(R11*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm12B
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm12B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBetterBlockAsm12B
|
|
|
|
candidateS_match_encodeBetterBlockAsm12B:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm12B
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm12B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm12B
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm12B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBetterBlockAsm12B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm12B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm12B
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm12B
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm12B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm12B:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeBetterBlockAsm12B
|
|
|
|
matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeBetterBlockAsm12B
|
|
|
|
matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
|
|
|
|
matchlen_single_match_nolit_encodeBetterBlockAsm12B:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeBetterBlockAsm12B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeBetterBlockAsm12B
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
|
|
|
|
match_nolit_end_encodeBetterBlockAsm12B:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
|
|
CMPL 16(SP), R8
|
|
JEQ match_is_repeat_encodeBetterBlockAsm12B
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeBetterBlockAsm12B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBetterBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeBetterBlockAsm12B
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm12B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeBetterBlockAsm12B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
|
|
|
|
two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
match_is_repeat_encodeBetterBlockAsm12B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitRepeat
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
|
|
|
|
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
|
|
|
|
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm12B
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm12B:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x32, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x34, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x34, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 65560(SP)(R11*4)
|
|
MOVL R15, 65560(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x34, R11
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x32, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 65560(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeBetterBlockAsm12B
|
|
|
|
emit_remainder_encodeBetterBlockAsm12B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm12B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBetterBlockAsm12B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBetterBlockAsm12B
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm12B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x04
|
|
JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
|
|
CMPQ BX, $0x08
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
|
|
MOVL (CX), SI
|
|
MOVL SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
|
|
MOVL (CX), SI
|
|
MOVL -4(CX)(BX*1), CX
|
|
MOVL SI, (AX)
|
|
MOVL CX, -4(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm12B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x000000a0, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeBetterBlockAsm10B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeBetterBlockAsm10B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -6(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeBetterBlockAsm10B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm10B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x36, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 16408(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 16408(SP)(R11*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm10B
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeBetterBlockAsm10B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeBetterBlockAsm10B
|
|
|
|
candidateS_match_encodeBetterBlockAsm10B:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeBetterBlockAsm10B
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeBetterBlockAsm10B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm10B
|
|
|
|
match_extend_back_loop_encodeBetterBlockAsm10B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeBetterBlockAsm10B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeBetterBlockAsm10B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeBetterBlockAsm10B
|
|
JMP match_extend_back_loop_encodeBetterBlockAsm10B
|
|
|
|
match_extend_back_end_encodeBetterBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeBetterBlockAsm10B:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeBetterBlockAsm10B
|
|
|
|
matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeBetterBlockAsm10B
|
|
|
|
matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
|
|
|
|
matchlen_single_match_nolit_encodeBetterBlockAsm10B:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeBetterBlockAsm10B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeBetterBlockAsm10B
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
|
|
|
|
match_nolit_end_encodeBetterBlockAsm10B:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
|
|
CMPL 16(SP), R8
|
|
JEQ match_is_repeat_encodeBetterBlockAsm10B
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeBetterBlockAsm10B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeBetterBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
|
|
|
|
two_bytes_match_emit_encodeBetterBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeBetterBlockAsm10B
|
|
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
|
|
|
|
one_byte_match_emit_encodeBetterBlockAsm10B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
|
|
|
|
memmove_long_match_emit_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeBetterBlockAsm10B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
|
|
// emitRepeat
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
|
|
|
|
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
|
|
|
|
two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
match_is_repeat_encodeBetterBlockAsm10B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_repeat_encodeBetterBlockAsm10B
|
|
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x04
|
|
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
|
|
CMPQ R9, $0x08
|
|
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
|
|
MOVL (R10), R11
|
|
MOVL R11, (AX)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
|
|
MOVL (R10), R11
|
|
MOVL -4(R10)(R9*1), R10
|
|
MOVL R11, (AX)
|
|
MOVL R10, -4(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
|
|
|
|
memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitRepeat
|
|
MOVL R12, SI
|
|
LEAL -4(R12), R12
|
|
CMPL SI, $0x08
|
|
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
|
|
CMPL SI, $0x0c
|
|
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
|
|
CMPL R8, $0x00000800
|
|
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
|
|
|
|
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
|
|
CMPL R12, $0x00000104
|
|
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
|
|
LEAL -256(R12), R12
|
|
MOVW $0x0019, (AX)
|
|
MOVW R12, 2(AX)
|
|
ADDQ $0x04, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
|
|
LEAL -4(R12), R12
|
|
MOVW $0x0015, (AX)
|
|
MOVB R12, 2(AX)
|
|
ADDQ $0x03, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
|
|
SHLL $0x02, R12
|
|
ORL $0x01, R12
|
|
MOVW R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
|
|
|
|
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
|
|
XORQ SI, SI
|
|
LEAL 1(SI)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SARL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
|
|
match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeBetterBlockAsm10B
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeBetterBlockAsm10B:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x34, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x36, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x36, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 16408(SP)(R11*4)
|
|
MOVL R15, 16408(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x36, R11
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x34, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 16408(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeBetterBlockAsm10B
|
|
|
|
emit_remainder_encodeBetterBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeBetterBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeBetterBlockAsm10B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
two_bytes_emit_remainder_encodeBetterBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeBetterBlockAsm10B
|
|
JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
one_byte_emit_remainder_encodeBetterBlockAsm10B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x04
|
|
JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
|
|
CMPQ BX, $0x08
|
|
JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
|
|
MOVL (CX), SI
|
|
MOVL SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
|
|
MOVL (CX), SI
|
|
MOVL -4(CX)(BX*1), CX
|
|
MOVL SI, (AX)
|
|
MOVL CX, -4(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
|
|
|
|
memmove_long_emit_remainder_encodeBetterBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
//
// encodeBetterBlockAsm8B encodes src into dst and returns the number of bytes
// written. It returns 0 as soon as the output would reach the destination
// limit computed at entry: dst_base + len(src) - 6 - len(src)/32.
//
// Frame layout (all offsets are from SP):
//      0  dst limit pointer (8 bytes)
//      8  sLimit = len(src) - 8; the search stops once s reaches it
//     12  nextEmit: src index of the first literal byte not yet written
//     16  offset of the previous match (starts at 0)
//     20  nextS: position to try after a failed probe
//     24  long table, 1024 x uint32, index = top 10 bits of a hash of 6 bytes
//   4120  short table, 256 x uint32, index = top 8 bits of a hash of 4 bytes
//
// Long-lived registers: AX = dst write cursor, DX = src base, CX = current
// src index s.
//
// Not checked here, presumably guaranteed by the Go caller (TODO confirm):
// len(src) is at least 8 and dst is large enough for every store made below
// the limit pointer.
//
// NOTE(review): this file carries a "Code generated ... DO NOT EDIT" header.
// The hand edits in this function should be mirrored in the generator:
//   * the trailing literal copy no longer loads 4 bytes when only 1..3 bytes
//     of src remain (that load read past the end of src);
//   * address comparisons against the dst limit use unsigned JB, not JL;
//   * unreachable instructions after unconditional JMPs and compare/jump
//     pairs whose target was the very next instruction were removed.
TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
	MOVQ  dst_base+0(FP), AX
	MOVQ  $0x00000028, CX
	LEAQ  24(SP), DX
	PXOR  X0, X0

	// Clear both tables: 0x28 iterations x 0x80 bytes = 5120 bytes.
zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm8B

	// nextEmit = 0, sLimit = len(src)-8, limit = dst + len(src)-6-len(src)>>5,
	// s = 1, previous offset = 0, DX = src base.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm8B:
	// nextS = s + 1 + (s-nextEmit)>>4; leave the loop once it reaches sLimit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm8B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// R10 = long-table index, R11 = short-table index for the bytes at s.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x38, R11

	// Fetch both candidates, record s in both tables, then compare 4 bytes.
	MOVL  24(SP)(R10*4), SI
	MOVL  4120(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 4120(SP)(R11*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm8B
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm8B

candidateS_match_encodeBetterBlockAsm8B:
	// The short candidate matched. Probe the long table for s+1 as well and
	// take that candidate if it matches; otherwise fall back to the short one.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm8B:
	// Extend the match backwards while the preceding bytes agree, stopping at
	// nextEmit (DI) or when the candidate index reaches 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm8B

match_extend_back_loop_encodeBetterBlockAsm8B:
	CMPL  CX, DI
	JLE   match_extend_back_end_encodeBetterBlockAsm8B
	MOVB  -1(DX)(SI*1), BL
	MOVB  -1(DX)(CX*1), R8
	CMPB  BL, R8
	JNE   match_extend_back_end_encodeBetterBlockAsm8B
	LEAL  -1(CX), CX
	DECL  SI
	JZ    match_extend_back_end_encodeBetterBlockAsm8B
	JMP   match_extend_back_loop_encodeBetterBlockAsm8B

match_extend_back_end_encodeBetterBlockAsm8B:
	// Give up (return 0) unless dst + 3 + pending literal bytes is below the
	// limit. Both operands are addresses, hence the unsigned jump.
	MOVL  CX, DI
	SUBL  12(SP), DI
	LEAQ  3(AX)(DI*1), DI
	CMPQ  DI, (SP)
	JB    match_dst_size_check_encodeBetterBlockAsm8B
	MOVQ  $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm8B:
	// DI = match start. The first 4 bytes are already known to match, so the
	// comparison starts 4 bytes in; R8 = bytes left in src from there.
	MOVL  CX, DI
	ADDL  $0x04, CX
	ADDL  $0x04, SI
	MOVQ  src_len+32(FP), R8
	SUBL  CX, R8
	LEAQ  (DX)(CX*1), R9
	LEAQ  (DX)(SI*1), R10

	// matchLen: R12 = number of additional matching bytes.
	XORL  R12, R12
	CMPL  R8, $0x08
	JL    matchlen_single_match_nolit_encodeBetterBlockAsm8B

matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
	// Compare 8 bytes at a time; on a difference the lowest set bit of the
	// XOR, divided by 8, is the count of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm8B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm8B

matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
	LEAL  -8(R8), R8
	LEAL  8(R12), R12
	CMPL  R8, $0x08
	JGE   matchlen_loopback_match_nolit_encodeBetterBlockAsm8B

matchlen_single_match_nolit_encodeBetterBlockAsm8B:
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm8B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
	MOVB  (R9)(R12*1), R11
	CMPB  (R10)(R12*1), R11
	JNE   match_nolit_end_encodeBetterBlockAsm8B
	LEAL  1(R12), R12
	DECL  R8
	JNZ   matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B

match_nolit_end_encodeBetterBlockAsm8B:
	// R8 = offset between the current position and the candidate.
	MOVL  CX, R8
	SUBL  SI, R8

	// Check if repeat
	CMPL  16(SP), R8
	JEQ   match_is_repeat_encodeBetterBlockAsm8B
	MOVL  R8, 16(SP)

	// Emit the literals src[nextEmit:DI], if there are any.
	// R9 = literal length, R10 = literal source, SI = length - 1.
	MOVL  12(SP), SI
	CMPL  SI, DI
	JEQ   emit_literal_done_match_emit_encodeBetterBlockAsm8B
	MOVL  DI, R9
	MOVL  DI, 12(SP)
	LEAQ  (DX)(SI*1), R10
	SUBL  SI, R9
	LEAL  -1(R9), SI
	CMPL  SI, $0x3c
	JLT   one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL  SI, $0x00000100
	JLT   two_bytes_match_emit_encodeBetterBlockAsm8B
	MOVB  $0xf4, (AX)
	MOVW  SI, 1(AX)
	ADDQ  $0x03, AX
	JMP   memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB  $0xf0, (AX)
	MOVB  SI, 1(AX)
	ADDQ  $0x02, AX
	CMPL  SI, $0x40
	JL    memmove_match_emit_encodeBetterBlockAsm8B
	JMP   memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	SHLB  $0x02, SI
	MOVB  SI, (AX)
	ADDQ  $0x01, AX

memmove_match_emit_encodeBetterBlockAsm8B:
	LEAQ  (AX)(R9*1), SI

	// genMemMoveShort
	// Lengths up to 4 are copied as one 4-byte move. The extra source bytes
	// are inside src here because at least 4 match bytes follow the literals.
	CMPQ  R9, $0x04
	JLE   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ  R9, $0x08
	JB    emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ  R9, $0x10
	JBE   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ  R9, $0x20
	JBE   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL  (R10), R11
	MOVL  R11, (AX)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL  (R10), R11
	MOVL  -4(R10)(R9*1), R10
	MOVL  R11, (AX)
	MOVL  R10, -4(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ  (R10), R11
	MOVQ  -8(R10)(R9*1), R10
	MOVQ  R11, (AX)
	MOVQ  R10, -8(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	MOVQ  SI, AX
	JMP   emit_literal_done_match_emit_encodeBetterBlockAsm8B

memmove_long_match_emit_encodeBetterBlockAsm8B:
	LEAQ  (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0-X3) and stored
	// last; the loops in between use 16-byte aligned stores (MOVOA) to dst.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	// s += matched bytes, R12 = total match length, nextEmit = s.
	ADDL  R12, CX
	ADDL  $0x04, R12
	MOVL  CX, 12(SP)

	// emitCopy
	// A match longer than 64 bytes is written as a 3-byte copy (tag 0xee plus
	// the 16-bit offset) that accounts for 60 bytes; the rest is written by
	// the emitRepeat code below.
	CMPL  R12, $0x40
	JLE   two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
	MOVB  $0xee, (AX)
	MOVW  R8, 1(AX)
	LEAL  -60(R12), R12
	ADDQ  $0x03, AX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4.
	MOVL  R12, SI
	LEAL  -4(R12), R12
	CMPL  SI, $0x08
	JLE   repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	CMPL  R12, $0x00000104
	JLT   repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL  -256(R12), R12
	MOVW  $0x0019, (AX)
	MOVW  R12, 2(AX)
	ADDQ  $0x04, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	LEAL  -4(R12), R12
	MOVW  $0x0015, (AX)
	MOVB  R12, 2(AX)
	ADDQ  $0x03, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	SHLL  $0x02, R12
	ORL   $0x01, R12
	MOVW  R12, (AX)
	ADDQ  $0x02, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	// Length below 12: 2-byte copy. Otherwise: 3-byte copy, 16-bit offset.
	CMPL  R12, $0x0c
	JGE   emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	MOVB  $0x01, BL
	LEAL  -16(BX)(R12*4), R12
	MOVB  R8, 1(AX)
	SHRL  $0x08, R8
	SHLL  $0x05, R8
	ORL   R8, R12
	MOVB  R12, (AX)
	ADDQ  $0x02, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	MOVB  $0x02, BL
	LEAL  -4(BX)(R12*4), R12
	MOVB  R12, (AX)
	MOVW  R8, 1(AX)
	ADDQ  $0x03, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

match_is_repeat_encodeBetterBlockAsm8B:
	// Same offset as the previous match. Emit pending literals (R8 is reused
	// as the literal length, R9 as the literal source), then a repeat.
	MOVL  12(SP), SI
	CMPL  SI, DI
	JEQ   emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	MOVL  DI, R8
	MOVL  DI, 12(SP)
	LEAQ  (DX)(SI*1), R9
	SUBL  SI, R8
	LEAL  -1(R8), SI
	CMPL  SI, $0x3c
	JLT   one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL  SI, $0x00000100
	JLT   two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	MOVB  $0xf4, (AX)
	MOVW  SI, 1(AX)
	ADDQ  $0x03, AX
	JMP   memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB  $0xf0, (AX)
	MOVB  SI, 1(AX)
	ADDQ  $0x02, AX
	CMPL  SI, $0x40
	JL    memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP   memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	SHLB  $0x02, SI
	MOVB  SI, (AX)
	ADDQ  $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ  (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ  R8, $0x04
	JLE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ  R8, $0x08
	JB    emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ  R8, $0x10
	JBE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ  R8, $0x20
	JBE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL  (R9), R10
	MOVL  R10, (AX)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL  (R9), R10
	MOVL  -4(R9)(R8*1), R9
	MOVL  R10, (AX)
	MOVL  R9, -4(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ  (R9), R10
	MOVQ  -8(R9)(R8*1), R9
	MOVQ  R10, (AX)
	MOVQ  R9, -8(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVQ  SI, AX
	JMP   emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B

memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ  (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	// s += matched bytes, R12 = total match length, nextEmit = s.
	ADDL  R12, CX
	ADDL  $0x04, R12
	MOVL  CX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4.
	MOVL  R12, SI
	LEAL  -4(R12), R12
	CMPL  SI, $0x08
	JLE   repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
	CMPL  R12, $0x00000104
	JLT   repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL  -256(R12), R12
	MOVW  $0x0019, (AX)
	MOVW  R12, 2(AX)
	ADDQ  $0x04, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	LEAL  -4(R12), R12
	MOVW  $0x0015, (AX)
	MOVB  R12, 2(AX)
	ADDQ  $0x03, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	SHLL  $0x02, R12
	ORL   $0x01, R12
	MOVW  R12, (AX)
	ADDQ  $0x02, AX

	// Falls through.
match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	// Finish once s has reached sLimit; return 0 if the write cursor has
	// reached the dst limit (unsigned compare of two addresses).
	CMPL  CX, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm8B
	CMPQ  AX, (SP)
	JB    match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ  $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm8B:
	// Index a few positions inside the match before searching again:
	// start+1 (long and short), start+2 (long and short), start+3 (short),
	// s-2 (long) and s-1 (long and short), where start = DI on entry.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x38, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 4120(SP)(R11*4)
	MOVL  R15, 4120(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 4120(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
	// Return 0 unless dst + 3 + remaining literal bytes is below the limit
	// (unsigned compare of two addresses).
	MOVQ  src_len+32(FP), CX
	SUBL  12(SP), CX
	LEAQ  3(AX)(CX*1), CX
	CMPQ  CX, (SP)
	JB    emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ  $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm8B:
	// Emit src[nextEmit:] as one final literal, if it is not empty.
	// CX = literal source, SI = literal length, DX = length - 1.
	MOVQ  src_len+32(FP), CX
	MOVL  12(SP), BX
	CMPL  BX, CX
	JEQ   emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
	MOVL  CX, SI
	MOVL  CX, 12(SP)
	LEAQ  (DX)(BX*1), CX
	SUBL  BX, SI
	LEAL  -1(SI), DX
	CMPL  DX, $0x3c
	JLT   one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL  DX, $0x00000100
	JLT   two_bytes_emit_remainder_encodeBetterBlockAsm8B
	MOVB  $0xf4, (AX)
	MOVW  DX, 1(AX)
	ADDQ  $0x03, AX
	JMP   memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB  $0xf0, (AX)
	MOVB  DL, 1(AX)
	ADDQ  $0x02, AX
	CMPL  DX, $0x40
	JL    memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP   memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	SHLB  $0x02, DL
	MOVB  DL, (AX)
	ADDQ  $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ  (AX)(SI*1), DX
	MOVL  SI, BX

	// genMemMoveShort
	// These literals run to the very end of src, so nothing beyond BX bytes
	// may be loaded. Lengths 1..3 get exact-size copies; length 4 is handled
	// by the 4..7 path, whose two 4-byte moves then coincide. BX is never 0
	// here because the empty case jumped to emit_literal_done above.
	CMPQ  BX, $0x03
	JB    emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
	JE    emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ  BX, $0x08
	JB    emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ  BX, $0x10
	JBE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ  BX, $0x20
	JBE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
	// First and last byte; for a length of 1 they are the same byte.
	MOVB  (CX), SI
	MOVB  -1(CX)(BX*1), CL
	MOVB  SI, (AX)
	MOVB  CL, -1(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
	MOVW  (CX), SI
	MOVB  2(CX), CL
	MOVW  SI, (AX)
	MOVB  CL, 2(AX)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL  (CX), SI
	MOVL  -4(CX)(BX*1), CX
	MOVL  SI, (AX)
	MOVL  CX, -4(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ  (CX), SI
	MOVQ  -8(CX)(BX*1), CX
	MOVQ  SI, (AX)
	MOVQ  CX, -8(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ  DX, AX
	JMP   emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ  (AX)(SI*1), DX
	MOVL  SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	// Return the number of bytes written: write cursor - dst base.
	MOVQ  dst_base+0(FP), CX
	SUBQ  CX, AX
	MOVQ  AX, ret+48(FP)
	RET
|
|
|
|
// func encodeSnappyBlockAsm(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000200, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBlockAsm:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBlockAsm
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBlockAsm:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x32, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeSnappyBlockAsm
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeSnappyBlockAsm
|
|
|
|
repeat_extend_back_loop_encodeSnappyBlockAsm:
|
|
CMPL DI, SI
|
|
JLE repeat_extend_back_end_encodeSnappyBlockAsm
|
|
MOVB -1(DX)(R8*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeSnappyBlockAsm
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
|
|
|
|
repeat_extend_back_end_encodeSnappyBlockAsm:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
|
|
MOVL DI, R8
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R9
|
|
SUBL SI, R8
|
|
LEAL -1(R8), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeSnappyBlockAsm
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
|
|
CMPL SI, $0x01000000
|
|
JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
four_bytes_repeat_emit_encodeSnappyBlockAsm:
|
|
MOVL SI, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R10, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
three_bytes_repeat_emit_encodeSnappyBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
two_bytes_repeat_emit_encodeSnappyBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeSnappyBlockAsm
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
one_byte_repeat_emit_encodeSnappyBlockAsm:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R8, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
|
|
MOVQ (R9), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
|
|
MOVQ (R9), R10
|
|
MOVQ -8(R9)(R8*1), R9
|
|
MOVQ R10, (AX)
|
|
MOVQ R9, -8(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
|
|
MOVOU (R9), X0
|
|
MOVOU -16(R9)(R8*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
|
|
|
|
memmove_long_repeat_emit_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVQ R8, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R9)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R9)(R12*1), X4
|
|
MOVOU -16(R9)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R8, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_repeat_extend_encodeSnappyBlockAsm
|
|
|
|
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
TESTQ R10, R10
|
|
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
|
|
BSFQ R10, R10
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeSnappyBlockAsm
|
|
|
|
matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
|
|
|
|
matchlen_single_repeat_extend_encodeSnappyBlockAsm:
|
|
TESTL R8, R8
|
|
JZ repeat_extend_forward_end_encodeSnappyBlockAsm
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeSnappyBlockAsm
|
|
LEAL 1(R11), R11
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
|
|
|
|
repeat_extend_forward_end_encodeSnappyBlockAsm:
|
|
ADDL R11, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitCopy
|
|
CMPL DI, $0x00010000
|
|
JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
|
|
|
|
four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
|
|
CMPL SI, $0x40
|
|
JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL DI, 1(AX)
|
|
LEAL -64(SI), SI
|
|
ADDQ $0x05, AX
|
|
CMPL SI, $0x04
|
|
JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
|
|
JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
|
|
|
|
four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
|
|
TESTL SI, SI
|
|
JZ repeat_end_emit_encodeSnappyBlockAsm
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVL DI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm
|
|
|
|
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm
|
|
|
|
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeSnappyBlockAsm:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeSnappyBlockAsm
|
|
|
|
no_repeat_found_encodeSnappyBlockAsm:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBlockAsm
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeSnappyBlockAsm
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeSnappyBlockAsm
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBlockAsm
|
|
|
|
candidate3_match_encodeSnappyBlockAsm:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeSnappyBlockAsm
|
|
|
|
candidate2_match_encodeSnappyBlockAsm:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBlockAsm:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm
|
|
|
|
match_extend_back_loop_encodeSnappyBlockAsm:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBlockAsm
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBlockAsm
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm
|
|
JMP match_extend_back_loop_encodeSnappyBlockAsm
|
|
|
|
match_extend_back_end_encodeSnappyBlockAsm:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 5(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBlockAsm:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBlockAsm
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBlockAsm
|
|
CMPL R8, $0x00010000
|
|
JLT three_bytes_match_emit_encodeSnappyBlockAsm
|
|
CMPL R8, $0x01000000
|
|
JLT four_bytes_match_emit_encodeSnappyBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL R8, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm
|
|
|
|
four_bytes_match_emit_encodeSnappyBlockAsm:
|
|
MOVL R8, R10
|
|
SHRL $0x10, R10
|
|
MOVB $0xf8, (AX)
|
|
MOVW R8, 1(AX)
|
|
MOVB R10, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm
|
|
|
|
three_bytes_match_emit_encodeSnappyBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm
|
|
|
|
two_bytes_match_emit_encodeSnappyBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeSnappyBlockAsm
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm
|
|
|
|
one_byte_match_emit_encodeSnappyBlockAsm:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBlockAsm:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
|
|
|
|
memmove_long_match_emit_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBlockAsm:
|
|
match_nolit_loop_encodeSnappyBlockAsm:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBlockAsm
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeSnappyBlockAsm
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBlockAsm:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBlockAsm:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeSnappyBlockAsm
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeSnappyBlockAsm
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
|
|
|
|
match_nolit_end_encodeSnappyBlockAsm:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
CMPL SI, $0x00010000
|
|
JL two_byte_offset_match_nolit_encodeSnappyBlockAsm
|
|
|
|
four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
|
|
CMPL R10, $0x40
|
|
JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL SI, 1(AX)
|
|
LEAL -64(R10), R10
|
|
ADDQ $0x05, AX
|
|
CMPL R10, $0x04
|
|
JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
|
|
JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
|
|
|
|
four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
|
|
TESTL R10, R10
|
|
JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
|
|
MOVB $0x03, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
|
|
|
|
two_byte_offset_match_nolit_encodeSnappyBlockAsm:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBlockAsm:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBlockAsm:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBlockAsm:
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x10, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x32, R8
|
|
SHLQ $0x10, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x32, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeSnappyBlockAsm
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBlockAsm
|
|
|
|
emit_remainder_encodeSnappyBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 5(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBlockAsm
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBlockAsm
|
|
CMPL DX, $0x00010000
|
|
JLT three_bytes_emit_remainder_encodeSnappyBlockAsm
|
|
CMPL DX, $0x01000000
|
|
JLT four_bytes_emit_remainder_encodeSnappyBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL DX, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
four_bytes_emit_remainder_encodeSnappyBlockAsm:
|
|
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (AX)
|
|
MOVW DX, 1(AX)
|
|
MOVB BL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeSnappyBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeSnappyBlockAsm:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBlockAsm:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000200, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBlockAsm64K:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBlockAsm64K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBlockAsm64K:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm64K
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x10, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x32, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeSnappyBlockAsm64K
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
|
|
|
|
repeat_extend_back_loop_encodeSnappyBlockAsm64K:
|
|
CMPL DI, SI
|
|
JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
|
|
MOVB -1(DX)(R8*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
|
|
|
|
repeat_extend_back_end_encodeSnappyBlockAsm64K:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
|
|
MOVL DI, R8
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R9
|
|
SUBL SI, R8
|
|
LEAL -1(R8), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeSnappyBlockAsm64K
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
one_byte_repeat_emit_encodeSnappyBlockAsm64K:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R8, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
|
|
MOVQ (R9), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (R9), R10
|
|
MOVQ -8(R9)(R8*1), R9
|
|
MOVQ R10, (AX)
|
|
MOVQ R9, -8(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R9), X0
|
|
MOVOU -16(R9)(R8*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
|
|
|
|
memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVQ R8, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R9)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R9)(R12*1), X4
|
|
MOVOU -16(R9)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R8, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
TESTQ R10, R10
|
|
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
|
|
BSFQ R10, R10
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
|
|
TESTL R8, R8
|
|
JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
|
|
LEAL 1(R11), R11
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
|
|
|
|
repeat_extend_forward_end_encodeSnappyBlockAsm64K:
|
|
ADDL R11, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitCopy
|
|
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeSnappyBlockAsm64K:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeSnappyBlockAsm64K
|
|
|
|
no_repeat_found_encodeSnappyBlockAsm64K:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBlockAsm64K
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeSnappyBlockAsm64K
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeSnappyBlockAsm64K
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBlockAsm64K
|
|
|
|
candidate3_match_encodeSnappyBlockAsm64K:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeSnappyBlockAsm64K
|
|
|
|
candidate2_match_encodeSnappyBlockAsm64K:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBlockAsm64K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm64K
|
|
|
|
match_extend_back_loop_encodeSnappyBlockAsm64K:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBlockAsm64K
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBlockAsm64K
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm64K
|
|
JMP match_extend_back_loop_encodeSnappyBlockAsm64K
|
|
|
|
match_extend_back_end_encodeSnappyBlockAsm64K:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBlockAsm64K:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBlockAsm64K
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
two_bytes_match_emit_encodeSnappyBlockAsm64K:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeSnappyBlockAsm64K
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
one_byte_match_emit_encodeSnappyBlockAsm64K:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
|
|
|
|
memmove_long_match_emit_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
|
|
match_nolit_loop_encodeSnappyBlockAsm64K:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeSnappyBlockAsm64K
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeSnappyBlockAsm64K
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
|
|
|
|
match_nolit_end_encodeSnappyBlockAsm64K:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm64K
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBlockAsm64K:
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x10, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x32, R8
|
|
SHLQ $0x10, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x32, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeSnappyBlockAsm64K
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBlockAsm64K
|
|
|
|
emit_remainder_encodeSnappyBlockAsm64K:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBlockAsm64K:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBlockAsm64K
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
one_byte_emit_remainder_encodeSnappyBlockAsm64K:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
//
// Encodes src into dst as a sequence of literal and copy elements and stores
// the number of bytes written to dst in ret. Every "dst full" check below
// stores 0 in ret and returns instead.
// NOTE(review): tag byte values (0xf0, 0xf4, 0xee, low bits 01/10) look like
// the Snappy block format tags - confirm against the format description.
//
// Stack frame ($16408 bytes), offsets from SP:
//    0(SP)  8 bytes      dst limit pointer: dst_base + (src_len - 9 - src_len>>5)
//    8(SP)  4 bytes      search limit: src_len - 8
//   12(SP)  4 bytes      nextEmit: src index of the first byte not yet written out
//   16(SP)  4 bytes      offset of the most recent match (initialised to 1)
//   20(SP)  4 bytes      src index at which to resume when no candidate matches
//   24(SP)  16384 bytes  table of 4096 32-bit src indices, indexed by a 12-bit hash
//
// Register roles while searching: AX = dst write pointer, CX = current src
// index, DX = src base pointer. Other registers are scratch and are reused
// with different meanings in each phase.
TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
|
|
MOVQ dst_base+0(FP), AX
|
|
// CX = 128 iterations of 128 bytes each = 16384 bytes to clear at 24(SP).
MOVQ $0x00000080, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
// Zero the hash table, 8 x 16 bytes per iteration.
zero_loop_encodeSnappyBlockAsm12B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBlockAsm12B
|
|
// nextEmit = 0.
MOVL $0x00000000, 12(SP)
|
|
// Derive the search limit (src_len - 8) and the dst limit pointer
// (dst_base + src_len - 9 - src_len>>5) from src_len.
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
// Start searching at src index 1 with a last-match offset of 1.
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
// From here on DX holds the src base pointer.
MOVQ src_base+24(FP), DX
|
|
|
|
// Main search loop: look for a 4-byte match at src index CX, CX+1 or CX+2.
search_loop_encodeSnappyBlockAsm12B:
|
|
// SI = CX + 4 + ((CX - nextEmit) >> 5): the index to resume at if nothing
// matches here. The step grows with the length of the pending literal run.
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm12B
|
|
// DI = 8 bytes of src at CX; save the resume index in 20(SP).
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
// Hash: shift left by 24 (keeps only the low 5 bytes), multiply by the
// constant in R9, keep the top 12 bits. R10 = hash at CX, R11 = hash at CX+1.
MOVQ $0x000000cf1bbcdcbb, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x18, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x18, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x34, R11
|
|
// SI = candidate for CX, R8 = candidate for CX+1; then record CX and CX+1
// in their table slots.
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
// R10 = hash at CX+2 (table slot is read/written later, in no_repeat_found).
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x18, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located one
// last-match offset (16(SP)) earlier.
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeSnappyBlockAsm12B
|
|
// Repeat found at CX+1. DI = start of the match, SI = nextEmit,
// R8 = start of the earlier occurrence (DI - offset).
LEAL 1(CX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
|
|
|
|
// Extend the repeat match backwards one byte at a time while DI > nextEmit,
// R8 > 0 and the preceding bytes are equal.
repeat_extend_back_loop_encodeSnappyBlockAsm12B:
|
|
CMPL DI, SI
|
|
JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
|
|
MOVB -1(DX)(R8*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
|
|
|
|
// Emit src[nextEmit:DI] as a literal (skipped when empty).
// R8 = literal length, R9 = literal source pointer, SI = length - 1.
repeat_extend_back_end_encodeSnappyBlockAsm12B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
|
|
MOVL DI, R8
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R9
|
|
SUBL SI, R8
|
|
LEAL -1(R8), SI
|
|
// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256, else 3 bytes.
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
|
|
// 3-byte header: 0xf4 followed by length-1 as a 16-bit value.
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// 2-byte header: 0xf0 followed by length-1 as one byte. Lengths up to 64 use
// the short copy, longer ones the long copy.
two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeSnappyBlockAsm12B
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// 1-byte header: (length-1) << 2.
one_byte_repeat_emit_encodeSnappyBlockAsm12B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
// Short literal copy of R8 bytes from R9 to AX; SI = dst pointer after the copy.
memmove_repeat_emit_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveShort: pick a copy routine by length (<=8, <=16, <=32, otherwise up to 64).
|
|
CMPQ R8, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
|
|
|
|
// Length <= 8: a full 8-byte word is loaded and stored regardless of the
// exact length; AX is then set to the precomputed end pointer.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
|
|
MOVQ (R9), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 9..16: first 8 bytes plus the (overlapping) last 8 bytes.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (R9), R10
|
|
MOVQ -8(R9)(R8*1), R9
|
|
MOVQ R10, (AX)
|
|
MOVQ R9, -8(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 17..32: first 16 bytes plus the (overlapping) last 16 bytes.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (R9), X0
|
|
MOVOU -16(R9)(R8*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 33..64: first 32 bytes plus the (overlapping) last 32 bytes.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
|
|
// Advance the dst pointer past the copied literal.
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Long literal copy of R8 bytes from R9 to AX; SI = dst pointer after the copy.
memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveLong: keep the first and last 32 source bytes in X0-X3, copy the
// middle in 32-byte steps with aligned stores, then store head and tail.
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
// R11 = length / 32; R12 = 64 - (AX & 31), so AX + R12 - 32 is 32-byte aligned.
MOVQ R8, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
// NOTE(review): DECQ leaves CF unchanged, so JA here combines ZF from DECQ
// with CF left by the SUBQ above - confirm this is the intended condition.
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(R9)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
|
|
|
|
// Copy 32 bytes ending at offset R12 per iteration while length >= R12 (unsigned).
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R9)(R12*1), X4
|
|
MOVOU -16(R9)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R8, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
// Store the saved first and last 32 bytes with unaligned stores, then advance AX.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
MOVQ SI, AX
|
|
|
|
// Literal written. Skip the 4 bytes already known to match (CX+1 .. CX+4) and
// extend the repeat match forwards: R9 = src+CX, SI = src+CX-offset,
// R8 = bytes remaining in src.
emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen: R11 = number of equal bytes at R9 and SI, at most R8.
|
|
XORL R11, R11
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
|
|
|
|
// Compare 8 bytes at a time; on a difference the lowest set bit of the XOR
// (BSFQ), divided by 8, is the number of additional equal bytes.
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
TESTQ R10, R10
|
|
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
|
|
BSFQ R10, R10
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
|
|
|
|
// All 8 bytes matched: advance and continue while at least 8 bytes remain.
matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
|
|
|
|
// Fewer than 8 bytes left: compare byte by byte.
matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
|
|
TESTL R8, R8
|
|
JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
|
|
LEAL 1(R11), R11
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
|
|
|
|
// CX = end of the match; SI = total match length (end - start in DI);
// DI = match offset reloaded from 16(SP).
repeat_extend_forward_end_encodeSnappyBlockAsm12B:
|
|
ADDL R11, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitCopy: write copy elements for length SI at offset DI.
|
|
// While length > 64: emit 0xee + 16-bit offset and subtract 60 from the length.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
|
|
|
|
// Length <= 64. Use the 2-byte form only when length < 12 and offset < 2048;
// otherwise fall back to the 3-byte form.
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
|
|
// 2-byte copy: tag = 1 + (length-4)*4 + (offset>>8)<<5, then the low offset
// byte. Only BL is set; only the low byte of the LEAL result is stored.
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm12B
|
|
|
|
// 3-byte copy: tag = 2 + (length-1)*4, then the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
// nextEmit = CX (everything up to the end of the match is emitted); search on.
repeat_end_emit_encodeSnappyBlockAsm12B:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeSnappyBlockAsm12B
|
|
|
|
// No repeat. Test the table candidates in turn, comparing 4 bytes each:
// SI against the bytes at CX, R8 against CX+1, table[R10] against CX+2.
no_repeat_found_encodeSnappyBlockAsm12B:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBlockAsm12B
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeSnappyBlockAsm12B
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeSnappyBlockAsm12B
|
|
// Nothing matched: jump ahead to the resume index saved in 20(SP).
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBlockAsm12B
|
|
|
|
// Match at CX+2 with the candidate already in SI.
candidate3_match_encodeSnappyBlockAsm12B:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeSnappyBlockAsm12B
|
|
|
|
// Match at CX+1 with candidate R8; still record CX+2 in its table slot.
candidate2_match_encodeSnappyBlockAsm12B:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
// Match between src index CX and candidate index SI. DI = nextEmit.
candidate_match_encodeSnappyBlockAsm12B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm12B
|
|
|
|
// Extend the match backwards while CX > nextEmit, SI > 0 and the preceding
// bytes are equal.
match_extend_back_loop_encodeSnappyBlockAsm12B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBlockAsm12B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBlockAsm12B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm12B
|
|
JMP match_extend_back_loop_encodeSnappyBlockAsm12B
|
|
|
|
// dst space check: AX + 3 + (CX - nextEmit) must stay below the dst limit in
// (SP); otherwise store 0 in ret and return.
match_extend_back_end_encodeSnappyBlockAsm12B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Emit src[nextEmit:CX] as a literal (skipped when empty).
// R9 = literal length, DI = literal source pointer, R8 = length - 1.
match_dst_size_check_encodeSnappyBlockAsm12B:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256, else 3 bytes.
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBlockAsm12B
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// 2-byte header: 0xf0 followed by length-1 as one byte.
two_bytes_match_emit_encodeSnappyBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeSnappyBlockAsm12B
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// 1-byte header: (length-1) << 2.
one_byte_match_emit_encodeSnappyBlockAsm12B:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
// Short literal copy of R9 bytes from DI to AX; R8 = dst pointer after the copy.
memmove_match_emit_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort: pick a copy routine by length (<=8, <=16, <=32, otherwise up to 64).
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
|
|
|
|
// Length <= 8: a full 8-byte word is moved regardless of the exact length.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 9..16: first 8 bytes plus the (overlapping) last 8 bytes.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 17..32: first 16 bytes plus the (overlapping) last 16 bytes.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Length 33..64: first 32 bytes plus the (overlapping) last 32 bytes.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
// Advance the dst pointer past the copied literal.
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
|
|
|
|
// Long literal copy of R9 bytes from DI to AX; R8 = dst pointer after the copy.
memmove_long_match_emit_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong: keep the first and last 32 source bytes in X0-X3, copy the
// middle in 32-byte steps with aligned stores, then store head and tail.
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
// R11 = length / 32; R12 = 64 - (AX & 31), so AX + R12 - 32 is 32-byte aligned.
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
// NOTE(review): DECQ leaves CF unchanged, so JA here combines ZF from DECQ
// with CF left by the SUBQ above - confirm this is the intended condition.
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
|
|
|
|
// Copy 32 bytes ending at offset R12 per iteration while length >= R12 (unsigned).
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
// Store the saved first and last 32 bytes with unaligned stores, then advance AX.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
|
|
// Emit a copy for the match at CX against candidate SI, with no pending
// literal. Also re-entered directly when the position after a match matches
// again. 16(SP) = new match offset (CX - SI).
match_nolit_loop_encodeSnappyBlockAsm12B:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
// Skip the 4 bytes already known to match; DI = bytes remaining in src,
// R8 = src+CX, SI = src+candidate.
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen: R10 = number of equal bytes at R8 and SI, at most DI.
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B
|
|
|
|
// Compare 8 bytes at a time; on a difference the lowest set bit of the XOR
// (BSFQ), divided by 8, is the number of additional equal bytes.
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeSnappyBlockAsm12B
|
|
|
|
// All 8 bytes matched: advance and continue while at least 8 bytes remain.
matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
|
|
|
|
// Fewer than 8 bytes left: compare byte by byte.
matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeSnappyBlockAsm12B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeSnappyBlockAsm12B
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
|
|
|
|
// CX = end of the match; SI = match offset; R10 = total match length
// (extension + the initial 4 bytes); nextEmit = CX.
match_nolit_end_encodeSnappyBlockAsm12B:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy: write copy elements for length R10 at offset SI.
|
|
// While length > 64: emit 0xee + 16-bit offset and subtract 60 from the length.
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
|
|
|
|
// Length <= 64. Use the 2-byte form only when length < 12 and offset < 2048;
// otherwise fall back to the 3-byte form.
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
|
|
// 2-byte copy: tag = 1 + (length-4)*4 + (offset>>8)<<5, then the low offset
// byte. Only BL is set; only the low byte of the LEAL result is stored.
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
|
|
|
|
// 3-byte copy: tag = 2 + (length-1)*4, then the 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
// Copy written. Stop searching once CX reaches the search limit; otherwise
// load the 8 bytes at CX-2 and verify dst has not reached its limit.
match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm12B
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Record CX-2 in the table and look up the candidate for CX:
// R8 = hash at CX-2, SI = hash at CX (DI is shifted right by 2 bytes first).
match_nolit_dst_ok_encodeSnappyBlockAsm12B:
|
|
MOVQ $0x000000cf1bbcdcbb, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x18, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x34, R8
|
|
SHLQ $0x18, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x34, SI
|
|
// SI = previous table entry for CX's hash; then table[hash(CX-2)] = CX-2 and
// table[hash(CX)] = CX.
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
// If the candidate's 4 bytes equal the 4 bytes at CX, emit another copy
// right away; otherwise step one byte and resume the search.
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeSnappyBlockAsm12B
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBlockAsm12B
|
|
|
|
// End of search: emit src[nextEmit:src_len] as one final literal.
// dst space check: AX + 3 + (src_len - nextEmit) must stay below the limit in
// (SP); otherwise store 0 in ret and return.
emit_remainder_encodeSnappyBlockAsm12B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// SI = literal length, CX = literal source pointer, DX = length - 1
// (DX no longer holds the src base from here on).
emit_remainder_ok_encodeSnappyBlockAsm12B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256, else 3 bytes.
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// 2-byte header: 0xf0 followed by length-1 as one byte.
two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBlockAsm12B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// 1-byte header: (length-1) << 2.
one_byte_emit_remainder_encodeSnappyBlockAsm12B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
// Short literal copy of BX bytes from CX to AX; DX = dst pointer after the copy.
memmove_emit_remainder_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort: pick a copy routine by length (<=8, <=16, <=32, otherwise up to 64).
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
|
|
|
|
// Length <= 8: a full 8-byte word is loaded from CX and stored to AX.
// NOTE(review): when fewer than 8 bytes remain this load/store touches bytes
// past the end of the literal (here the tail of src) - confirm that callers
// guarantee the slack.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// Length 9..16: first 8 bytes plus the (overlapping) last 8 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// Length 17..32: first 16 bytes plus the (overlapping) last 16 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// Length 33..64: first 32 bytes plus the (overlapping) last 32 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
// Advance the dst pointer past the copied literal.
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
|
|
|
|
// Long literal copy of BX bytes from CX to AX; DX = dst pointer after the copy.
memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong: keep the first and last 32 source bytes in X0-X3, copy the
// middle in 32-byte steps with aligned stores, then store head and tail.
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
// DI = length / 32; R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned.
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
// NOTE(review): DECQ leaves CF unchanged, so JA here combines ZF from DECQ
// with CF left by the SUBQ above - confirm this is the intended condition.
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
|
|
|
|
// Copy 32 bytes ending at offset R8 per iteration while length >= R8 (unsigned).
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
|
|
// Store the saved first and last 32 bytes with unaligned stores, then advance AX.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000020, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBlockAsm10B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBlockAsm10B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBlockAsm10B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm10B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x36, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeSnappyBlockAsm10B
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
|
|
|
|
repeat_extend_back_loop_encodeSnappyBlockAsm10B:
|
|
CMPL DI, SI
|
|
JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
|
|
MOVB -1(DX)(R8*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
|
|
|
|
repeat_extend_back_end_encodeSnappyBlockAsm10B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
|
|
MOVL DI, R8
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R9
|
|
SUBL SI, R8
|
|
LEAL -1(R8), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeSnappyBlockAsm10B
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
one_byte_repeat_emit_encodeSnappyBlockAsm10B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R8, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
|
|
MOVQ (R9), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (R9), R10
|
|
MOVQ -8(R9)(R8*1), R9
|
|
MOVQ R10, (AX)
|
|
MOVQ R9, -8(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (R9), X0
|
|
MOVOU -16(R9)(R8*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
|
|
|
|
memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVQ R8, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(R9)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R9)(R12*1), X4
|
|
MOVOU -16(R9)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R8, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
TESTQ R10, R10
|
|
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
|
|
BSFQ R10, R10
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_single_repeat_extend_encodeSnappyBlockAsm10B:
|
|
TESTL R8, R8
|
|
JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
|
|
LEAL 1(R11), R11
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
|
|
|
|
repeat_extend_forward_end_encodeSnappyBlockAsm10B:
|
|
ADDL R11, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitCopy
|
|
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
|
|
CMPL DI, $0x00000800
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeSnappyBlockAsm10B:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeSnappyBlockAsm10B
|
|
|
|
no_repeat_found_encodeSnappyBlockAsm10B:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBlockAsm10B
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeSnappyBlockAsm10B
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeSnappyBlockAsm10B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBlockAsm10B
|
|
|
|
candidate3_match_encodeSnappyBlockAsm10B:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeSnappyBlockAsm10B
|
|
|
|
candidate2_match_encodeSnappyBlockAsm10B:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBlockAsm10B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm10B
|
|
|
|
match_extend_back_loop_encodeSnappyBlockAsm10B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBlockAsm10B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBlockAsm10B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm10B
|
|
JMP match_extend_back_loop_encodeSnappyBlockAsm10B
|
|
|
|
match_extend_back_end_encodeSnappyBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBlockAsm10B:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBlockAsm10B
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
two_bytes_match_emit_encodeSnappyBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeSnappyBlockAsm10B
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
one_byte_match_emit_encodeSnappyBlockAsm10B:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
|
|
|
|
memmove_long_match_emit_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
|
|
match_nolit_loop_encodeSnappyBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBlockAsm10B:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeSnappyBlockAsm10B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeSnappyBlockAsm10B
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
|
|
|
|
match_nolit_end_encodeSnappyBlockAsm10B:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
|
|
CMPL SI, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm10B
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBlockAsm10B:
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x20, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x36, R8
|
|
SHLQ $0x20, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x36, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeSnappyBlockAsm10B
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBlockAsm10B
|
|
|
|
emit_remainder_encodeSnappyBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBlockAsm10B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
one_byte_emit_remainder_encodeSnappyBlockAsm10B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000008, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBlockAsm8B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBlockAsm8B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL CX, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBlockAsm8B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x04, SI
|
|
LEAL 4(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm8B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHRQ $0x08, R11
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x38, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R9, R11
|
|
SHRQ $0x38, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 24(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
LEAL 1(CX), R10
|
|
MOVL R10, 24(SP)(R11*4)
|
|
MOVQ DI, R10
|
|
SHRQ $0x10, R10
|
|
SHLQ $0x20, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x38, R10
|
|
MOVL CX, R9
|
|
SUBL 16(SP), R9
|
|
MOVL 1(DX)(R9*1), R11
|
|
MOVQ DI, R9
|
|
SHRQ $0x08, R9
|
|
CMPL R9, R11
|
|
JNE no_repeat_found_encodeSnappyBlockAsm8B
|
|
LEAL 1(CX), DI
|
|
MOVL 12(SP), SI
|
|
MOVL DI, R8
|
|
SUBL 16(SP), R8
|
|
JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
|
|
|
|
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
|
|
CMPL DI, SI
|
|
JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
|
|
MOVB -1(DX)(R8*1), BL
|
|
MOVB -1(DX)(DI*1), R9
|
|
CMPB BL, R9
|
|
JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
|
|
LEAL -1(DI), DI
|
|
DECL R8
|
|
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
|
|
|
|
repeat_extend_back_end_encodeSnappyBlockAsm8B:
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
|
|
MOVL DI, R8
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R9
|
|
SUBL SI, R8
|
|
LEAL -1(R8), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_repeat_emit_encodeSnappyBlockAsm8B
|
|
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
one_byte_repeat_emit_encodeSnappyBlockAsm8B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_repeat_emit_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R8, $0x08
|
|
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
|
|
CMPQ R8, $0x10
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
|
|
CMPQ R8, $0x20
|
|
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
|
|
MOVQ (R9), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (R9), R10
|
|
MOVQ -8(R9)(R8*1), R9
|
|
MOVQ R10, (AX)
|
|
MOVQ R9, -8(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (R9), X0
|
|
MOVOU -16(R9)(R8*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R8*1)
|
|
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
|
|
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
|
|
|
|
memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(R8*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R9), X0
|
|
MOVOU 16(R9), X1
|
|
MOVOU -32(R9)(R8*1), X2
|
|
MOVOU -16(R9)(R8*1), X3
|
|
MOVQ R8, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(R9)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R9)(R12*1), X4
|
|
MOVOU -16(R9)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R8, R12
|
|
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R8*1)
|
|
MOVOU X3, -16(AX)(R8*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
|
|
ADDL $0x05, CX
|
|
MOVL CX, SI
|
|
SUBL 16(SP), SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R11, R11
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
|
|
MOVQ (R9)(R11*1), R10
|
|
XORQ (SI)(R11*1), R10
|
|
TESTQ R10, R10
|
|
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
|
|
BSFQ R10, R10
|
|
SARQ $0x03, R10
|
|
LEAL (R11)(R10*1), R11
|
|
JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R11), R11
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
|
|
TESTL R8, R8
|
|
JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
|
|
MOVB (R9)(R11*1), R10
|
|
CMPB (SI)(R11*1), R10
|
|
JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
|
|
LEAL 1(R11), R11
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
|
|
|
|
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
|
|
ADDL R11, CX
|
|
MOVL CX, SI
|
|
SUBL DI, SI
|
|
MOVL 16(SP), DI
|
|
|
|
// emitCopy
|
|
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
|
|
CMPL SI, $0x40
|
|
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
|
|
MOVB $0xee, (AX)
|
|
MOVW DI, 1(AX)
|
|
LEAL -60(SI), SI
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
|
|
|
|
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
|
|
CMPL SI, $0x0c
|
|
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(SI*4), SI
|
|
MOVB DI, 1(AX)
|
|
SHRL $0x08, DI
|
|
SHLL $0x05, DI
|
|
ORL DI, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP repeat_end_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(SI*4), SI
|
|
MOVB SI, (AX)
|
|
MOVW DI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
repeat_end_emit_encodeSnappyBlockAsm8B:
|
|
MOVL CX, 12(SP)
|
|
JMP search_loop_encodeSnappyBlockAsm8B
|
|
|
|
no_repeat_found_encodeSnappyBlockAsm8B:
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBlockAsm8B
|
|
SHRQ $0x08, DI
|
|
MOVL 24(SP)(R10*4), SI
|
|
LEAL 2(CX), R9
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidate2_match_encodeSnappyBlockAsm8B
|
|
MOVL R9, 24(SP)(R10*4)
|
|
SHRQ $0x08, DI
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate3_match_encodeSnappyBlockAsm8B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBlockAsm8B
|
|
|
|
candidate3_match_encodeSnappyBlockAsm8B:
|
|
ADDL $0x02, CX
|
|
JMP candidate_match_encodeSnappyBlockAsm8B
|
|
|
|
candidate2_match_encodeSnappyBlockAsm8B:
|
|
MOVL R9, 24(SP)(R10*4)
|
|
INCL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBlockAsm8B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm8B
|
|
|
|
match_extend_back_loop_encodeSnappyBlockAsm8B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBlockAsm8B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBlockAsm8B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBlockAsm8B
|
|
JMP match_extend_back_loop_encodeSnappyBlockAsm8B
|
|
|
|
match_extend_back_end_encodeSnappyBlockAsm8B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBlockAsm8B:
|
|
MOVL CX, DI
|
|
MOVL 12(SP), R8
|
|
CMPL R8, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(R8*1), DI
|
|
SUBL R8, R9
|
|
LEAL -1(R9), R8
|
|
CMPL R8, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBlockAsm8B
|
|
CMPL R8, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
two_bytes_match_emit_encodeSnappyBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB R8, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL R8, $0x40
|
|
JL memmove_match_emit_encodeSnappyBlockAsm8B
|
|
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
one_byte_match_emit_encodeSnappyBlockAsm8B:
|
|
SHLB $0x02, R8
|
|
MOVB R8, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
|
|
MOVQ (DI), R10
|
|
MOVQ R10, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (DI), R10
|
|
MOVQ -8(DI)(R9*1), DI
|
|
MOVQ R10, (AX)
|
|
MOVQ DI, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (DI), X0
|
|
MOVOU -16(DI)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
|
|
MOVQ R8, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
|
|
|
|
memmove_long_match_emit_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(R9*1), R8
|
|
|
|
// genMemMoveLong
|
|
MOVOU (DI), X0
|
|
MOVOU 16(DI), X1
|
|
MOVOU -32(DI)(R9*1), X2
|
|
MOVOU -16(DI)(R9*1), X3
|
|
MOVQ R9, R11
|
|
SHRQ $0x05, R11
|
|
MOVQ AX, R10
|
|
ANDL $0x0000001f, R10
|
|
MOVQ $0x00000040, R12
|
|
SUBQ R10, R12
|
|
DECQ R11
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(DI)(R12*1), R10
|
|
LEAQ -32(AX)(R12*1), R13
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (R10), X4
|
|
MOVOU 16(R10), X5
|
|
MOVOA X4, (R13)
|
|
MOVOA X5, 16(R13)
|
|
ADDQ $0x20, R13
|
|
ADDQ $0x20, R10
|
|
ADDQ $0x20, R12
|
|
DECQ R11
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(DI)(R12*1), X4
|
|
MOVOU -16(DI)(R12*1), X5
|
|
MOVOA X4, -32(AX)(R12*1)
|
|
MOVOA X5, -16(AX)(R12*1)
|
|
ADDQ $0x20, R12
|
|
CMPQ R9, R12
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ R8, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
|
|
match_nolit_loop_encodeSnappyBlockAsm8B:
|
|
MOVL CX, DI
|
|
SUBL SI, DI
|
|
MOVL DI, 16(SP)
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), DI
|
|
SUBL CX, DI
|
|
LEAQ (DX)(CX*1), R8
|
|
LEAQ (DX)(SI*1), SI
|
|
|
|
// matchLen
|
|
XORL R10, R10
|
|
CMPL DI, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
|
|
MOVQ (R8)(R10*1), R9
|
|
XORQ (SI)(R10*1), R9
|
|
TESTQ R9, R9
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
|
|
BSFQ R9, R9
|
|
SARQ $0x03, R9
|
|
LEAL (R10)(R9*1), R10
|
|
JMP match_nolit_end_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
|
|
LEAL -8(DI), DI
|
|
LEAL 8(R10), R10
|
|
CMPL DI, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
|
|
TESTL DI, DI
|
|
JZ match_nolit_end_encodeSnappyBlockAsm8B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
|
|
MOVB (R8)(R10*1), R9
|
|
CMPB (SI)(R10*1), R9
|
|
JNE match_nolit_end_encodeSnappyBlockAsm8B
|
|
LEAL 1(R10), R10
|
|
DECL DI
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
|
|
|
|
match_nolit_end_encodeSnappyBlockAsm8B:
|
|
ADDL R10, CX
|
|
MOVL 16(SP), SI
|
|
ADDL $0x04, R10
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
|
|
CMPL R10, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
|
|
MOVB $0xee, (AX)
|
|
MOVW SI, 1(AX)
|
|
LEAL -60(R10), R10
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
|
|
CMPL R10, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R10*4), R10
|
|
MOVB SI, 1(AX)
|
|
SHRL $0x08, SI
|
|
SHLL $0x05, SI
|
|
ORL SI, R10
|
|
MOVB R10, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R10*4), R10
|
|
MOVB R10, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBlockAsm8B
|
|
MOVQ -2(DX)(CX*1), DI
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBlockAsm8B:
|
|
MOVQ $0x9e3779b1, R9
|
|
MOVQ DI, R8
|
|
SHRQ $0x10, DI
|
|
MOVQ DI, SI
|
|
SHLQ $0x20, R8
|
|
IMULQ R9, R8
|
|
SHRQ $0x38, R8
|
|
SHLQ $0x20, SI
|
|
IMULQ R9, SI
|
|
SHRQ $0x38, SI
|
|
LEAL -2(CX), R9
|
|
LEAQ 24(SP)(SI*4), R10
|
|
MOVL (R10), SI
|
|
MOVL R9, 24(SP)(R8*4)
|
|
MOVL CX, (R10)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ match_nolit_loop_encodeSnappyBlockAsm8B
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBlockAsm8B
|
|
|
|
emit_remainder_encodeSnappyBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBlockAsm8B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
one_byte_emit_remainder_encodeSnappyBlockAsm8B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// when one of the output-space checks against 0(SP) fails.
//
// Register roles held for most of the function:
//   AX = dst write cursor (starts at dst_base)
//   DX = src base pointer (reused as scratch once emit_remainder starts copying)
//   CX = current position in src, as an index (starts at 1)
//
// Stack frame layout:
//   0(SP)      qword  dst limit = dst_base + (src_len - 9 - src_len>>5)
//   8(SP)      dword  src_len - 8; positions at or beyond it branch to emit_remainder
//   12(SP)     dword  index of the first src byte that has not been emitted yet
//   16(SP)     dword  offset of the most recent match (only written in this function)
//   20(SP)     dword  position to resume searching at
//   24(SP)     table A: dword entries indexed by a 16-bit hash of 7 input bytes
//   262168(SP) table B: dword entries indexed by a 14-bit hash of 4 input bytes
//              (262168 = 24 + 65536*4; tables A+B together span 327680 bytes)
TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
|
|
MOVQ dst_base+0(FP), AX
|
|
// 0xa00 iterations x 0x80 bytes per iteration = 327680 bytes = both tables.
MOVQ $0x00000a00, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
// Zero both hash tables, 128 bytes per iteration.
zero_loop_encodeSnappyBetterBlockAsm:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBetterBlockAsm
|
|
// Nothing emitted yet: first pending literal byte is src[0].
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
// 8(SP) = src_len - 8.
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
// 0(SP) = dst_base + (src_len - 9 - src_len>>5): the output limit pointer.
MOVQ DX, (SP)
|
|
// Searching starts at position 1.
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
// From here on DX is the src base pointer.
MOVQ src_base+24(FP), DX
|
|
|
|
// Main loop: compute the next position, hash the bytes at CX, probe both tables.
search_loop_encodeSnappyBetterBlockAsm:
|
|
// SI = (CX - pending-literal start) >> 7: the step grows with the unmatched run.
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
// Cap the step: if SI > 99 the next position is CX+100, else CX+1+SI.
CMPL SI, $0x63
|
|
JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
|
|
LEAL 100(CX), SI
|
|
JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
|
|
|
|
check_maxskip_ok_encodeSnappyBetterBlockAsm:
|
|
LEAL 1(CX)(SI*1), SI
|
|
|
|
check_maxskip_cont_encodeSnappyBetterBlockAsm:
|
|
// Stop searching once the next position reaches src_len - 8.
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm
|
|
// DI = 8 bytes of input at the current position.
MOVQ (DX)(CX*1), DI
|
|
// Remember the next position in 20(SP); SI is reused below.
MOVL SI, 20(SP)
|
|
// R9 = multiplier for the 7-byte hash, SI = multiplier for the 4-byte hash.
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
// R10 = table A index: drop the top byte (keep 7 bytes), multiply, keep top 16 bits.
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
// R11 = table B index: keep the low 4 bytes, multiply, keep top 14 bits.
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x32, R11
|
|
// SI = candidate from table A, R8 = candidate from table B.
MOVL 24(SP)(R10*4), SI
|
|
MOVL 262168(SP)(R11*4), R8
|
|
// Record the current position in both tables.
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 262168(SP)(R11*4)
|
|
// Accept a candidate when its first 4 bytes equal the low 4 bytes of DI.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeSnappyBetterBlockAsm
|
|
// No candidate matched: continue at the saved next position.
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm
|
|
|
|
// Only the table B candidate matched. Before using it, probe table A for the
// bytes starting one position later and prefer that candidate if it matches.
candidateS_match_encodeSnappyBetterBlockAsm:
|
|
// DI >>= 8 gives the input bytes starting at CX+1.
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm
|
|
// Table A probe at CX+1 failed: restore CX and use the table B candidate.
DECL CX
|
|
MOVL R8, SI
|
|
|
|
// A candidate (SI) matches at CX. Extend the match backwards first.
candidate_match_encodeSnappyBetterBlockAsm:
|
|
// DI = start of the pending literals; the match may not extend before it.
MOVL 12(SP), DI
|
|
// A candidate at index 0 cannot be extended backwards.
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
|
|
|
|
match_extend_back_loop_encodeSnappyBetterBlockAsm:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBetterBlockAsm
|
|
// Compare the byte just before the candidate with the byte just before CX.
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBetterBlockAsm
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
|
|
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
|
|
|
|
match_extend_back_end_encodeSnappyBetterBlockAsm:
|
|
// Output-space check: AX + 5 + (pending literal length) must be below the limit.
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 5(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBetterBlockAsm
|
|
// Not enough room in dst: return 0.
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBetterBlockAsm:
|
|
// DI = match start. Skip the 4 bytes already known to be equal on both sides.
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
// R8 = bytes left in src after CX; R9/R10 = pointers to the two byte runs.
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
// R12 counts how many further bytes at R9 and R10 are equal (at most R8).
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
// Compare 8 bytes at a time.
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
|
|
// Mismatch inside this qword: lowest set bit index / 8 = number of equal bytes.
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeSnappyBetterBlockAsm
|
|
|
|
// All 8 bytes were equal: advance and repeat while at least 8 bytes remain.
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
// Fewer than 8 bytes remain: compare them one byte at a time.
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeSnappyBetterBlockAsm
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeSnappyBetterBlockAsm
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
match_nolit_end_encodeSnappyBetterBlockAsm:
|
|
// R8 = match offset (distance between the current position and the candidate).
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
// A match with at most 1 extra byte (R12 <= 1) and an offset above 0xffff is
// rejected: searching resumes at the saved next position + 1.
|
|
CMPL R12, $0x01
|
|
JG match_length_ok_encodeSnappyBetterBlockAsm
|
|
CMPL R8, $0x0000ffff
|
|
JLE match_length_ok_encodeSnappyBetterBlockAsm
|
|
MOVL 20(SP), CX
|
|
INCL CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm
|
|
|
|
// Match accepted: emit the pending literals src[12(SP):DI], then the copy.
match_length_ok_encodeSnappyBetterBlockAsm:
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
// No pending literals when the match starts where the last emit ended.
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
// R10 = pointer to the literal bytes, R9 = literal length, SI = length - 1.
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
// Pick the literal header size from length-1:
//   < 60: one byte, < 1<<8: 0xf0 + 1 byte, < 1<<16: 0xf4 + 2 bytes,
//   < 1<<24: 0xf8 + 3 bytes, otherwise 0xfc + 4 bytes.
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
|
|
CMPL SI, $0x00010000
|
|
JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
|
|
CMPL SI, $0x01000000
|
|
JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL SI, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
four_bytes_match_emit_encodeSnappyBetterBlockAsm:
|
|
// 0xf8 followed by the low 24 bits of length-1 (16-bit store plus one byte).
MOVL SI, R11
|
|
SHRL $0x10, R11
|
|
MOVB $0xf8, (AX)
|
|
MOVW SI, 1(AX)
|
|
MOVB R11, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
three_bytes_match_emit_encodeSnappyBetterBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
two_bytes_match_emit_encodeSnappyBetterBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
// length-1 < 64 still fits the short copy routine; longer runs use the long one.
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeSnappyBetterBlockAsm
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
one_byte_match_emit_encodeSnappyBetterBlockAsm:
|
|
// Single header byte: (length-1) << 2.
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBetterBlockAsm:
|
|
// SI = dst cursor after the literal bytes.
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
// Copies R9 bytes from R10 to AX using overlapping head/tail loads and stores.
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
|
|
|
|
// Length <= 8: always moves a full 8 bytes, so it may read and write past R9 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
// Length 9..16: first 8 bytes plus last 8 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
// Length 17..32: first 16 bytes plus last 16 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
// Length 33..64: first 32 bytes plus last 32 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
|
|
// Advance the dst cursor past the copied literals.
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
|
|
|
|
memmove_long_match_emit_encodeSnappyBetterBlockAsm:
|
|
// SI = dst cursor after the literal bytes.
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
// Copies R9 bytes from R10 to AX. The first and last 32 bytes are loaded up
// front and stored last; the middle is moved in 32-byte steps whose stores use
// MOVOA at dst offsets derived from AX&31.
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
// R13 = length / 32.
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
// R14 = 64 - (AX & 31): running offset, so AX+R14-32 is a 32-byte boundary.
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
// NOTE(review): DECQ does not write CF, so JA here sees CF from the SUBQ above
// and ZF from the DECQ; it looks like it is taken whenever R13 is still
// non-zero after the decrement. Confirm against the generator.
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
|
|
|
|
// Move 32 bytes per iteration while the running offset R14 is <= the length R9.
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
|
|
// Finally store the saved first and last 32 bytes with unaligned stores.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
|
|
// CX = end of the match; R12 = total match length (adds the 4 bytes skipped
// before matchLen); the next pending literal starts at the end of the match.
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
// Emits copy operations for length R12 at offset R8. Offsets below 0x10000
// use the 2-byte-offset forms; larger offsets use the 4-byte-offset forms.
|
|
CMPL R8, $0x00010000
|
|
JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
// 4-byte offset: while more than 64 bytes remain, emit tag 0xff + offset and
// subtract 64 from the remaining length.
four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
|
|
CMPL R12, $0x40
|
|
JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
|
|
MOVB $0xff, (AX)
|
|
MOVL R8, 1(AX)
|
|
LEAL -64(R12), R12
|
|
ADDQ $0x05, AX
|
|
CMPL R12, $0x04
|
|
JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
|
|
JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
|
|
TESTL R12, R12
|
|
JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
|
|
// Tag = 3 + (length-1)*4; only the low byte of the LEAL result is stored.
MOVB $0x03, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVL R8, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
|
|
|
|
// 2-byte offset: while more than 64 bytes remain, emit tag 0xee + 16-bit offset
// and subtract 60 from the remaining length.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
|
|
// The 2-byte encoding is used only when length < 12 and offset < 2048.
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
|
|
// Tag = 1 + (length-4)*4 + (offset>>8)<<5, followed by the low offset byte.
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
|
|
// Tag = 2 + (length-1)*4, followed by the 16-bit offset.
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
|
|
// Past the search limit: flush whatever is left as literals.
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm
|
|
// The dst cursor must still be below the output limit; otherwise return 0.
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Index positions near the start and the end of the match that was just
// emitted into both tables, then resume searching at CX.
match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
|
|
// SI = 7-byte hash multiplier, R8 = 4-byte hash multiplier.
MOVQ $0x00cf1bbcdcbfa563, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
// DI = match start + 1; R9 = 8 input bytes at that position.
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
// R11/R13 = bytes starting at DI+1, R12 = bytes starting at DI+2.
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
// R9 is reloaded with the 8 input bytes at CX-2 for the second group below.
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x32, R12
|
|
// Table A gets positions DI and DI+1; table B gets positions DI+1 and DI+2.
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 262168(SP)(R11*4)
|
|
MOVL R15, 262168(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
// R9 = CX-2 and DI = CX-1 are the positions stored below.
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
// Table A gets positions CX-2 and CX-1; table B gets position CX-1.
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 262168(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeSnappyBetterBlockAsm
|
|
|
|
// Emit everything from 12(SP) to the end of src as one literal run.
emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// Output-space check: AX + 5 + (remaining length) must be below the limit.
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 5(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBetterBlockAsm
|
|
// Not enough room in dst: return 0.
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBetterBlockAsm:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
// Nothing pending when the last emit already ended at src_len.
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
// CX = pointer to the pending bytes, SI = their count, DX = count - 1.
// DX (src base) is overwritten here; the base pointer is not used again.
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
// Same literal header selection as in the match path, keyed on count-1.
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
|
|
CMPL DX, $0x00010000
|
|
JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
|
|
CMPL DX, $0x01000000
|
|
JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
|
|
MOVB $0xfc, (AX)
|
|
MOVL DX, 1(AX)
|
|
ADDQ $0x05, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// 0xf8 followed by the low 24 bits of count-1.
MOVL DX, BX
|
|
SHRL $0x10, BX
|
|
MOVB $0xf8, (AX)
|
|
MOVW DX, 1(AX)
|
|
MOVB BL, 3(AX)
|
|
ADDQ $0x04, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
// count-1 < 64 still fits the short copy routine; longer runs use the long one.
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBetterBlockAsm
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// Single header byte: (count-1) << 2.
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// DX = dst cursor after the copy, BX = byte count.
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
// Copies BX bytes from CX to AX using overlapping head/tail loads and stores.
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
|
|
|
|
// Count <= 8: always moves a full 8 bytes, so it may read and write past BX bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
// Count 9..16: first 8 bytes plus last 8 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
// Count 17..32: first 16 bytes plus last 16 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
// Count 33..64: first 32 bytes plus last 32 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// Advance the dst cursor past the copied bytes.
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// DX = dst cursor after the copy, BX = byte count.
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
// Copies BX bytes from CX to AX. The first and last 32 bytes are loaded up
// front and stored last; the middle is moved in 32-byte steps whose stores use
// MOVOA at dst offsets derived from AX&31.
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
// DI = count / 32.
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
// R8 = 64 - (AX & 31): running offset, so AX+R8-32 is a 32-byte boundary.
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
// NOTE(review): DECQ does not write CF, so JA here sees CF from the SUBQ above
// and ZF from the DECQ; it looks like it is taken whenever DI is still
// non-zero after the decrement. Confirm against the generator.
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
|
|
|
|
// Move 32 bytes per iteration while the running offset R8 is <= the count BX.
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
|
|
// Finally store the saved first and last 32 bytes with unaligned stores.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
|
|
// Return value = dst cursor - dst_base = number of bytes written.
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x00000a00, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBetterBlockAsm64K:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBetterBlockAsm64K
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBetterBlockAsm64K:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x07, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x00cf1bbcdcbfa563, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x32, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 262168(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 262168(SP)(R11*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm64K
|
|
|
|
candidateS_match_encodeSnappyBetterBlockAsm64K:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x08, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x30, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBetterBlockAsm64K:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
|
|
|
|
match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
|
|
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
|
|
|
|
match_extend_back_end_encodeSnappyBetterBlockAsm64K:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBetterBlockAsm64K:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
|
|
match_nolit_end_encodeSnappyBetterBlockAsm64K:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
|
|
|
|
memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ $0x00cf1bbcdcbfa563, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x32, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 262168(SP)(R11*4)
|
|
MOVL R15, 262168(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x08, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x30, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x32, R11
|
|
SHLQ $0x08, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x30, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 262168(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
|
|
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and stores
// the number of bytes written in ret. Stores 0 in ret and returns early when
// the output would reach the dst limit computed in the prologue
// (dst + len(src) - 9 - len(src)/32).
//
// Two candidate tables are kept on the stack, each entry a 4-byte src index:
//   long table  at 24(SP):    index = 14-bit hash of the low 6 bytes of the
//                             8 bytes loaded (16384 entries, 65536 bytes)
//   short table at 65560(SP): index = 12-bit hash of the low 4 bytes
//                             (4096 entries, 16384 bytes)
//
// Stack frame ($81944):
//   0(SP)   8 bytes  dst write limit
//   8(SP)   4 bytes  len(src)-8, upper bound for search positions
//   12(SP)  4 bytes  src index of the first byte not yet emitted
//   16(SP)  4 bytes  offset of the latest match (written, never read here)
//   20(SP)  4 bytes  next search position
//   24(SP)  81920 bytes  long table followed by short table
//
// Main register roles: AX = dst write pointer, CX = current src index,
// DX = src base pointer (reused as scratch in the final remainder path).
//
// NOTE(review): the prologue computes len(src)-9 without a length check;
// presumably the Go caller guarantees a minimum input size - confirm.
|
|
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
|
|
MOVQ dst_base+0(FP), AX
|
|
// Zero both tables: 0x280 iterations x 128 bytes = 81920 bytes from 24(SP).
MOVQ $0x00000280, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBetterBlockAsm12B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBetterBlockAsm12B
|
|
// Nothing has been emitted yet: pending literals start at src index 0.
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
// 8(SP) = len(src)-8.
MOVL SI, 8(SP)
|
|
// (SP) = dst + (len(src) - 9 - len(src)>>5): the dst write limit.
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
// Searching starts at src index 1.
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBetterBlockAsm12B:
|
|
// SI = CX + 1 + ((CX - 12(SP)) >> 6): the next position to try; the step
// grows with the distance from the last emitted byte.
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x06, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
// DI = 8 bytes of src at the current position.
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
// R10 = long-table index: drop the top 2 bytes, multiply, keep the top 14 bits.
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
// R11 = short-table index: drop the top 4 bytes, multiply, keep the top 12 bits.
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x34, R11
|
|
// Load both candidates (SI = long, R8 = short), then record the current
// position in both tables.
MOVL 24(SP)(R10*4), SI
|
|
MOVL 65560(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 65560(SP)(R11*4)
|
|
// A candidate is accepted when its first 4 bytes equal the low 4 bytes of DI.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
|
|
// No match: continue at the position saved in 20(SP).
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm12B
|
|
|
|
candidateS_match_encodeSnappyBetterBlockAsm12B:
|
|
// The short candidate matched. Before using it, look up the long table for
// position CX+1 (DI shifted right by one byte) and prefer that entry if its
// first 4 bytes match. The table entry is updated to CX+1 either way.
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x32, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
|
|
// Long lookup at CX+1 failed: restore CX and use the short candidate.
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBetterBlockAsm12B:
|
|
// Extend the match backwards while the preceding bytes are equal, without
// going below 12(SP) (DI) or below candidate index 0.
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
|
|
|
|
match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
|
|
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
|
|
|
|
match_extend_back_end_encodeSnappyBetterBlockAsm12B:
|
|
// Bail out with ret = 0 unless dst + pending literal length + 3 is below
// the dst limit.
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBetterBlockAsm12B:
|
|
// DI = src index where the match starts. Skip the 4 bytes already known to
// match on both sides; R8 = bytes left in src, R9/R10 = pointers to compare.
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
// R12 = number of additional matching bytes at R9 / R10, at most R8.
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
// Compare 8 bytes at a time; on a difference the index of the lowest set
// bit of the XOR, divided by 8, is the count of equal leading bytes.
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
// Fewer than 8 bytes left: compare byte by byte.
TESTL R8, R8
|
|
JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
|
|
match_nolit_end_encodeSnappyBetterBlockAsm12B:
|
|
// R8 = match offset (current position minus candidate position).
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
// No comparison against the previous offset is made in this function; the
// offset is only recorded at 16(SP).
|
|
MOVL R8, 16(SP)
|
|
// Emit pending literals src[12(SP):DI], if any.
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
// R10 = literal source pointer, R9 = literal length, SI = length-1.
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256, else 3 bytes.
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
|
|
// 3-byte header: tag 0xf4 (61<<2) followed by 16-bit length-1.
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
// 2-byte header: tag 0xf0 (60<<2) followed by 8-bit length-1.
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
// length-1 < 64 still fits the short copy routine (at most 64 bytes).
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeSnappyBetterBlockAsm12B
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
// 1-byte header: (length-1)<<2.
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
// SI = dst pointer after the literal bytes.
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
// Copies R9 bytes (at most 64) from (R10) to (AX) using overlapping head and
// tail moves selected by size class.
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
|
|
// Always moves a full 8 bytes, even when R9 is smaller; AX is set to the
// exact end (SI) afterwards.
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
|
|
// First 8 and last 8 bytes.
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
|
|
// First 16 and last 16 bytes.
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
|
|
// First 32 and last 32 bytes.
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
|
|
|
|
memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
// SI = dst pointer after the literal bytes.
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
// Copies R9 bytes from (R10) to (AX). The first and last 32 bytes are loaded
// up front into X0-X3 and stored last, so the middle can be written in
// 32-byte chunks to aligned dst addresses.
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
// R13 = number of whole 32-byte chunks in the length.
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
// R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned; this is what
// makes the MOVOA stores below legal.
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
// NOTE(review): DECQ leaves CF unchanged, so JA/JNA below appear to depend
// on ZF from DECQ plus CF from the preceding SUBQ/ADDQ - confirm against
// the generator before touching this sequence.
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
|
|
// Copy 32 bytes ending at offset R14, advancing while R14 <= length.
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
// Store the saved head and tail (unaligned) to cover the edges.
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
|
|
// Advance CX past the match; R12 becomes the full match length (the 4
// bytes verified earlier plus the extension). 12(SP) = end of the match.
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
// Emits copy elements for length R12 at offset R8.
|
|
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
// While more than 64 bytes remain, emit a 3-byte copy of length 60:
// tag 0xee ((60-1)<<2 | 2) followed by the 16-bit offset.
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
// The 2-byte form is used only when length < 12 and offset < 2048.
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
|
|
// Tag byte = (length-4)<<2 | 1 | (offset>>8)<<5, then the low offset byte.
// Only the low byte of R12 is stored, so the upper bits of BX do not matter.
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
|
|
// 3-byte form: tag byte = (length-1)<<2 | 2, then the 16-bit offset.
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
|
|
// Stop searching once CX reaches 8(SP); bail out with ret = 0 if dst has
// reached its limit.
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
|
|
// Index positions around the match in both tables. DI still holds the match
// start; after INCL it is start+1, R14 = start+2, R15 = start+3.
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
// R10 = bytes at start+1, R11 and R13 = bytes at start+2, R12 = bytes at start+3.
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
// R9 = 8 bytes at CX-2, hashed further below.
MOVQ -2(DX)(CX*1), R9
|
|
// Long-table indexes for start+1 (R10) and start+2 (R13).
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x32, R13
|
|
// Short-table indexes for start+2 (R11) and start+3 (R12).
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x34, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x34, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 65560(SP)(R11*4)
|
|
MOVL R15, 65560(SP)(R12*4)
|
|
// Same for the end of the match: long[CX-2], short[CX-1], long[CX-1].
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x32, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x34, R11
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x32, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 65560(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
// Bail out with ret = 0 unless dst + remaining literal length + 3 is below
// the dst limit.
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
|
|
// Emit src[12(SP):len(src)] as one final literal, if non-empty.
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
// CX = literal source pointer, SI = literal length, DX = length-1
// (DX no longer holds the src base from here on).
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
// Literal header selection, same encoding as in the match path.
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
// DX = dst pointer after the literal bytes, BX = literal length.
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
// Copies BX bytes (at most 64) from (CX) to (AX) using overlapping head and
// tail moves selected by size class.
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
|
|
// Always moves a full 8 bytes, even when BX is smaller.
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
// DX = dst pointer after the literal bytes, BX = literal length.
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
// Same scheme as the long copy in the match path: head and tail are saved
// in X0-X3, the middle is written in 32-byte chunks to aligned dst
// addresses, then head and tail are stored unaligned.
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
// DI = number of whole 32-byte chunks; R8 = 64 - (AX & 31).
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
|
|
// ret = AX - dst_base: number of bytes written to dst.
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
|
|
MOVQ dst_base+0(FP), AX
|
|
MOVQ $0x000000a0, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBetterBlockAsm10B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBetterBlockAsm10B
|
|
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
search_loop_encodeSnappyBetterBlockAsm10B:
|
|
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x05, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x36, R11
|
|
MOVL 24(SP)(R10*4), SI
|
|
MOVL 16408(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 16408(SP)(R11*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm10B
|
|
|
|
candidateS_match_encodeSnappyBetterBlockAsm10B:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x34, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
|
|
DECL CX
|
|
MOVL R8, SI
|
|
|
|
candidate_match_encodeSnappyBetterBlockAsm10B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
|
|
|
|
match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
|
|
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
|
|
|
|
match_extend_back_end_encodeSnappyBetterBlockAsm10B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBetterBlockAsm10B:
|
|
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
|
|
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
|
|
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
|
|
match_nolit_end_encodeSnappyBetterBlockAsm10B:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
|
|
MOVL R8, 16(SP)
|
|
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeSnappyBetterBlockAsm10B
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
|
|
|
|
memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy
|
|
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
CMPL R8, $0x00000800
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
|
|
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x34, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x36, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x36, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 16408(SP)(R11*4)
|
|
MOVL R15, 16408(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x34, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x36, R11
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x34, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 16408(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
|
|
// Requires: SSE2
|
|
//
// Compresses src into dst using two hash tables kept in the stack frame and
// returns the number of bytes written, or 0 when the output would not fit
// under the destination limit computed below.
//
// Register roles (as used throughout this function):
//   AX = current write pointer into dst
//   DX = src base pointer (after setup)
//   CX = current position (index) in src
//
// Frame layout ($5144 bytes):
//   0(SP)    8 bytes  dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)    4 bytes  src_len - 8; the search/match loop stops at this index
//   12(SP)   4 bytes  index of the first src byte not yet emitted
//   16(SP)   4 bytes  last match offset (written here, never read in this function)
//   20(SP)   4 bytes  next search position saved across the candidate checks
//   24(SP)   4096 B   table indexed by a 10-bit hash of 6 input bytes
//   4120(SP) 1024 B   table indexed by an 8-bit hash of 4 input bytes
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
|
|
MOVQ dst_base+0(FP), AX
|
|
// 0x28 = 40 iterations x 128 bytes = 5120 bytes: both tables are zeroed.
MOVQ $0x00000028, CX
|
|
LEAQ 24(SP), DX
|
|
PXOR X0, X0
|
|
|
|
zero_loop_encodeSnappyBetterBlockAsm8B:
|
|
MOVOU X0, (DX)
|
|
MOVOU X0, 16(DX)
|
|
MOVOU X0, 32(DX)
|
|
MOVOU X0, 48(DX)
|
|
MOVOU X0, 64(DX)
|
|
MOVOU X0, 80(DX)
|
|
MOVOU X0, 96(DX)
|
|
MOVOU X0, 112(DX)
|
|
ADDQ $0x80, DX
|
|
DECQ CX
|
|
JNZ zero_loop_encodeSnappyBetterBlockAsm8B
|
|
// Nothing emitted yet.
MOVL $0x00000000, 12(SP)
|
|
MOVQ src_len+32(FP), CX
|
|
LEAQ -9(CX), DX
|
|
LEAQ -8(CX), SI
|
|
MOVL SI, 8(SP)
|
|
SHRQ $0x05, CX
|
|
SUBL CX, DX
|
|
LEAQ (AX)(DX*1), DX
|
|
MOVQ DX, (SP)
|
|
// Searching starts at index 1.
MOVL $0x00000001, CX
|
|
MOVL $0x00000000, 16(SP)
|
|
MOVQ src_base+24(FP), DX
|
|
|
|
// Main loop: hash the 8 bytes at CX, look up both tables, store CX in both.
search_loop_encodeSnappyBetterBlockAsm8B:
|
|
// SI = CX + 1 + (CX - 12(SP)) >> 4: the step grows with the distance from
// the last emitted byte.
MOVL CX, SI
|
|
SUBL 12(SP), SI
|
|
SHRL $0x04, SI
|
|
LEAL 1(CX)(SI*1), SI
|
|
CMPL SI, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
MOVQ (DX)(CX*1), DI
|
|
MOVL SI, 20(SP)
|
|
MOVQ $0x0000cf1bbcdcbf9b, R9
|
|
MOVQ $0x9e3779b1, SI
|
|
MOVQ DI, R10
|
|
MOVQ DI, R11
|
|
// R10 = top 10 bits of (low 6 bytes of DI) * R9.
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
// R11 = top 8 bits of (low 4 bytes of DI) * SI.
SHLQ $0x20, R11
|
|
IMULQ SI, R11
|
|
SHRQ $0x38, R11
|
|
// SI / R8 = previous positions stored for these hashes (candidates).
MOVL 24(SP)(R10*4), SI
|
|
MOVL 4120(SP)(R11*4), R8
|
|
MOVL CX, 24(SP)(R10*4)
|
|
MOVL CX, 4120(SP)(R11*4)
|
|
// A candidate is accepted when its first 4 bytes equal the 4 bytes at CX.
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
|
|
CMPL (DX)(R8*1), DI
|
|
JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
|
|
MOVL 20(SP), CX
|
|
JMP search_loop_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Only the 4-byte-hash candidate matched. Before using it, probe the 6-byte
// table for position CX+1 and prefer that candidate if it matches there.
candidateS_match_encodeSnappyBetterBlockAsm8B:
|
|
SHRQ $0x08, DI
|
|
MOVQ DI, R10
|
|
SHLQ $0x10, R10
|
|
IMULQ R9, R10
|
|
SHRQ $0x36, R10
|
|
MOVL 24(SP)(R10*4), SI
|
|
INCL CX
|
|
MOVL CX, 24(SP)(R10*4)
|
|
CMPL (DX)(SI*1), DI
|
|
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
|
|
// Probe failed: restore CX and fall back to the 4-byte-hash candidate.
DECL CX
|
|
MOVL R8, SI
|
|
|
|
// CX = match position, SI = candidate position.
candidate_match_encodeSnappyBetterBlockAsm8B:
|
|
MOVL 12(SP), DI
|
|
TESTL SI, SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Extend the match backwards while the preceding bytes are equal, stopping
// at the first unemitted byte (DI) or when the candidate reaches index 0.
match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
|
|
CMPL CX, DI
|
|
JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B
|
|
MOVB -1(DX)(SI*1), BL
|
|
MOVB -1(DX)(CX*1), R8
|
|
CMPB BL, R8
|
|
JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
|
|
LEAL -1(CX), CX
|
|
DECL SI
|
|
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
|
|
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Bail out with 0 if the pending literals plus 3 bytes would reach the
// destination limit stored at (SP).
match_extend_back_end_encodeSnappyBetterBlockAsm8B:
|
|
MOVL CX, DI
|
|
SUBL 12(SP), DI
|
|
LEAQ 3(AX)(DI*1), DI
|
|
CMPQ DI, (SP)
|
|
JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
match_dst_size_check_encodeSnappyBetterBlockAsm8B:
|
|
// DI = start of the match; the first 4 bytes are already known to be equal,
// so both positions are advanced by 4 before measuring the rest.
MOVL CX, DI
|
|
ADDL $0x04, CX
|
|
ADDL $0x04, SI
|
|
// R8 = bytes remaining in src after CX; R9 / R10 = pointers to compare.
MOVQ src_len+32(FP), R8
|
|
SUBL CX, R8
|
|
LEAQ (DX)(CX*1), R9
|
|
LEAQ (DX)(SI*1), R10
|
|
|
|
// matchLen: R12 = number of equal bytes at R9 and R10, at most R8.
|
|
XORL R12, R12
|
|
CMPL R8, $0x08
|
|
JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Compare 8 bytes at a time; on a difference the index of the lowest set
// bit of the XOR, divided by 8, is the number of equal leading bytes.
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ (R9)(R12*1), R11
|
|
XORQ (R10)(R12*1), R11
|
|
TESTQ R11, R11
|
|
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
BSFQ R11, R11
|
|
SARQ $0x03, R11
|
|
LEAL (R12)(R11*1), R12
|
|
JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
|
|
|
|
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
LEAL -8(R8), R8
|
|
LEAL 8(R12), R12
|
|
CMPL R8, $0x08
|
|
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Fewer than 8 bytes left: compare byte by byte.
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
TESTL R8, R8
|
|
JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
|
|
|
|
matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
MOVB (R9)(R12*1), R11
|
|
CMPB (R10)(R12*1), R11
|
|
JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
|
|
LEAL 1(R12), R12
|
|
DECL R8
|
|
JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
|
|
// R8 = match offset (distance between match position and candidate).
match_nolit_end_encodeSnappyBetterBlockAsm8B:
|
|
MOVL CX, R8
|
|
SUBL SI, R8
|
|
|
|
// Check if repeat
// (this variant only records the offset in 16(SP); no comparison is made here)
|
|
MOVL R8, 16(SP)
|
|
// Emit src[12(SP):DI] as a literal run, if it is non-empty.
MOVL 12(SP), SI
|
|
CMPL SI, DI
|
|
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
|
|
MOVL DI, R9
|
|
MOVL DI, 12(SP)
|
|
LEAQ (DX)(SI*1), R10
|
|
// R9 = literal length, R10 = literal source pointer, SI = length - 1.
SUBL SI, R9
|
|
LEAL -1(R9), SI
|
|
CMPL SI, $0x3c
|
|
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
|
|
CMPL SI, $0x00000100
|
|
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
|
|
// Header: tag 0xf4 followed by a 16-bit (length - 1).
MOVB $0xf4, (AX)
|
|
MOVW SI, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Header: tag 0xf0 followed by an 8-bit (length - 1).
two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB SI, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL SI, $0x40
|
|
JL memmove_match_emit_encodeSnappyBetterBlockAsm8B
|
|
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Header: single byte (length - 1) << 2.
one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
SHLB $0x02, SI
|
|
MOVB SI, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
// SI = dst pointer after the literal bytes.
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveShort: copy R9 (<= 64) bytes from R10 to AX using overlapping
// head/tail loads and stores.
|
|
CMPQ R9, $0x08
|
|
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
|
|
CMPQ R9, $0x10
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
|
|
CMPQ R9, $0x20
|
|
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
|
|
|
|
// Always moves a full 8 bytes, even when R9 < 8.
// NOTE(review): presumably relies on 8 readable source bytes and dst slack
// being guaranteed by the limits at 8(SP) and (SP) - confirm.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
|
|
MOVQ (R10), R11
|
|
MOVQ R11, (AX)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (R10), R11
|
|
MOVQ -8(R10)(R9*1), R10
|
|
MOVQ R11, (AX)
|
|
MOVQ R10, -8(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (R10), X0
|
|
MOVOU -16(R10)(R9*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(R9*1)
|
|
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
|
|
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ SI, AX
|
|
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
|
|
|
|
memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
LEAQ (AX)(R9*1), SI
|
|
|
|
// genMemMoveLong: the first and last 32 bytes are loaded into X0-X3 up
// front and stored last; the middle is copied in 32-byte steps with aligned
// stores, starting at R14 = 64 - (AX & 31).
|
|
MOVOU (R10), X0
|
|
MOVOU 16(R10), X1
|
|
MOVOU -32(R10)(R9*1), X2
|
|
MOVOU -16(R10)(R9*1), X3
|
|
MOVQ R9, R13
|
|
SHRQ $0x05, R13
|
|
MOVQ AX, R11
|
|
ANDL $0x0000001f, R11
|
|
MOVQ $0x00000040, R14
|
|
SUBQ R11, R14
|
|
// DECQ leaves CF untouched, so JA/JNA below also depend on the carry left
// by the preceding SUBQ/ADDQ.
DECQ R13
|
|
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(R10)(R14*1), R11
|
|
LEAQ -32(AX)(R14*1), R15
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (R11), X4
|
|
MOVOU 16(R11), X5
|
|
MOVOA X4, (R15)
|
|
MOVOA X5, 16(R15)
|
|
ADDQ $0x20, R15
|
|
ADDQ $0x20, R11
|
|
ADDQ $0x20, R14
|
|
DECQ R13
|
|
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(R10)(R14*1), X4
|
|
MOVOU -16(R10)(R14*1), X5
|
|
MOVOA X4, -32(AX)(R14*1)
|
|
MOVOA X5, -16(AX)(R14*1)
|
|
ADDQ $0x20, R14
|
|
CMPQ R9, R14
|
|
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(R9*1)
|
|
MOVOU X3, -16(AX)(R9*1)
|
|
MOVQ SI, AX
|
|
|
|
// Literals are out. Advance CX past the match, restore the full match
// length in R12 (the 4 prefix bytes were excluded), and record the new
// emit position.
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
|
|
ADDL R12, CX
|
|
ADDL $0x04, R12
|
|
MOVL CX, 12(SP)
|
|
|
|
// emitCopy: encode (offset R8, length R12). While the length exceeds 64, a
// 3-byte copy of length 60 (tag 0xee) is written and the loop repeats.
|
|
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
CMPL R12, $0x40
|
|
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
MOVB $0xee, (AX)
|
|
MOVW R8, 1(AX)
|
|
LEAL -60(R12), R12
|
|
ADDQ $0x03, AX
|
|
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
|
|
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
CMPL R12, $0x0c
|
|
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
|
|
// 2-byte form: tag = (offset >> 8) << 5 | (length - 4) << 2 | 1, then the
// low offset byte. No offset range check is made on this path.
// NOTE(review): presumably offsets are always small for this variant - confirm.
MOVB $0x01, BL
|
|
LEAL -16(BX)(R12*4), R12
|
|
MOVB R8, 1(AX)
|
|
SHRL $0x08, R8
|
|
SHLL $0x05, R8
|
|
ORL R8, R12
|
|
MOVB R12, (AX)
|
|
ADDQ $0x02, AX
|
|
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
|
|
|
|
// 3-byte form: tag = (length - 1) << 2 | 2, then the 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
|
|
MOVB $0x02, BL
|
|
LEAL -4(BX)(R12*4), R12
|
|
MOVB R12, (AX)
|
|
MOVW R8, 1(AX)
|
|
ADDQ $0x03, AX
|
|
|
|
// Stop searching near the end of src; fail with 0 once AX reaches the
// destination limit.
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
|
|
CMPL CX, 8(SP)
|
|
JGE emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
CMPQ AX, (SP)
|
|
JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
// Re-index positions around the match before resuming the search:
// DI+1 and DI+2 (DI = match start) and CX-2 and CX-1 (CX = match end) are
// hashed and stored into the two tables.
match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ $0x0000cf1bbcdcbf9b, SI
|
|
MOVQ $0x9e3779b1, R8
|
|
INCL DI
|
|
MOVQ (DX)(DI*1), R9
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
MOVQ R9, R12
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
SHRQ $0x10, R12
|
|
LEAL 1(DI), R14
|
|
LEAL 2(DI), R15
|
|
MOVQ -2(DX)(CX*1), R9
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x36, R10
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x36, R13
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x38, R11
|
|
SHLQ $0x20, R12
|
|
IMULQ R8, R12
|
|
SHRQ $0x38, R12
|
|
MOVL DI, 24(SP)(R10*4)
|
|
MOVL R14, 24(SP)(R13*4)
|
|
MOVL R14, 4120(SP)(R11*4)
|
|
MOVL R15, 4120(SP)(R12*4)
|
|
MOVQ R9, R10
|
|
MOVQ R9, R11
|
|
SHRQ $0x08, R11
|
|
MOVQ R11, R13
|
|
LEAL -2(CX), R9
|
|
LEAL -1(CX), DI
|
|
SHLQ $0x10, R10
|
|
IMULQ SI, R10
|
|
SHRQ $0x36, R10
|
|
SHLQ $0x20, R11
|
|
IMULQ R8, R11
|
|
SHRQ $0x38, R11
|
|
SHLQ $0x10, R13
|
|
IMULQ SI, R13
|
|
SHRQ $0x36, R13
|
|
MOVL R9, 24(SP)(R10*4)
|
|
MOVL DI, 4120(SP)(R11*4)
|
|
MOVL DI, 24(SP)(R13*4)
|
|
JMP search_loop_encodeSnappyBetterBlockAsm8B
|
|
|
|
// Flush src[12(SP):src_len] as one final literal run. Returns 0 if the
// remaining bytes plus 3 would reach the destination limit.
emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
SUBL 12(SP), CX
|
|
LEAQ 3(AX)(CX*1), CX
|
|
CMPQ CX, (SP)
|
|
JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B
|
|
MOVQ $0x00000000, ret+48(FP)
|
|
RET
|
|
|
|
emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ src_len+32(FP), CX
|
|
MOVL 12(SP), BX
|
|
CMPL BX, CX
|
|
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
// From here: CX = literal source pointer, SI = literal length,
// DX = length - 1 (the src base in DX is no longer needed).
MOVL CX, SI
|
|
MOVL CX, 12(SP)
|
|
LEAQ (DX)(BX*1), CX
|
|
SUBL BX, SI
|
|
LEAL -1(SI), DX
|
|
CMPL DX, $0x3c
|
|
JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
CMPL DX, $0x00000100
|
|
JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
MOVB $0xf4, (AX)
|
|
MOVW DX, 1(AX)
|
|
ADDQ $0x03, AX
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
MOVB $0xf0, (AX)
|
|
MOVB DL, 1(AX)
|
|
ADDQ $0x02, AX
|
|
CMPL DX, $0x40
|
|
JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
SHLB $0x02, DL
|
|
MOVB DL, (AX)
|
|
ADDQ $0x01, AX
|
|
|
|
memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
// DX = dst pointer after the literal bytes, BX = byte count.
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveShort: copy BX (<= 64) bytes from CX to AX.
|
|
CMPQ BX, $0x08
|
|
JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
|
|
CMPQ BX, $0x10
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
|
|
CMPQ BX, $0x20
|
|
JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
|
|
JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
|
|
|
|
// Always moves a full 8 bytes, even when BX < 8.
// NOTE(review): this is the final run, so the 8-byte load can extend past
// src_len when fewer than 8 bytes remain - confirm the caller guarantees
// readable slack.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
|
|
MOVQ (CX), SI
|
|
MOVQ SI, (AX)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
|
|
MOVQ (CX), SI
|
|
MOVQ -8(CX)(BX*1), CX
|
|
MOVQ SI, (AX)
|
|
MOVQ CX, -8(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
|
|
MOVOU (CX), X0
|
|
MOVOU -16(CX)(BX*1), X1
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, -16(AX)(BX*1)
|
|
JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
|
|
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ DX, AX
|
|
JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
|
|
|
|
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
LEAQ (AX)(SI*1), DX
|
|
MOVL SI, BX
|
|
|
|
// genMemMoveLong: same scheme as the match-literal copy above, with
// CX = source, AX = destination, BX = byte count.
|
|
MOVOU (CX), X0
|
|
MOVOU 16(CX), X1
|
|
MOVOU -32(CX)(BX*1), X2
|
|
MOVOU -16(CX)(BX*1), X3
|
|
MOVQ BX, DI
|
|
SHRQ $0x05, DI
|
|
MOVQ AX, SI
|
|
ANDL $0x0000001f, SI
|
|
MOVQ $0x00000040, R8
|
|
SUBQ SI, R8
|
|
DECQ DI
|
|
JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
|
|
LEAQ -32(CX)(R8*1), SI
|
|
LEAQ -32(AX)(R8*1), R9
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
|
|
MOVOU (SI), X4
|
|
MOVOU 16(SI), X5
|
|
MOVOA X4, (R9)
|
|
MOVOA X5, 16(R9)
|
|
ADDQ $0x20, R9
|
|
ADDQ $0x20, SI
|
|
ADDQ $0x20, R8
|
|
DECQ DI
|
|
JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
|
|
|
|
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
|
|
MOVOU -32(CX)(R8*1), X4
|
|
MOVOU -16(CX)(R8*1), X5
|
|
MOVOA X4, -32(AX)(R8*1)
|
|
MOVOA X5, -16(AX)(R8*1)
|
|
ADDQ $0x20, R8
|
|
CMPQ BX, R8
|
|
JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
|
|
MOVOU X0, (AX)
|
|
MOVOU X1, 16(AX)
|
|
MOVOU X2, -32(AX)(BX*1)
|
|
MOVOU X3, -16(AX)(BX*1)
|
|
MOVQ DX, AX
|
|
|
|
// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
|
|
MOVQ dst_base+0(FP), CX
|
|
SUBQ CX, AX
|
|
MOVQ AX, ret+48(FP)
|
|
RET
|
|
|
|
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// emitLiteral writes a literal chunk header for lit followed by the bytes of
// lit into dst and returns the total number of bytes written (header plus
// len(lit)). An empty lit writes nothing and returns 0.
//
// Header forms, chosen by n = len(lit) - 1:
//   n < 60        1 byte:  n << 2
//   n < 1 << 8    2 bytes: 0xf0, n
//   n < 1 << 16   3 bytes: 0xf4, n as 16 bits
//   n < 1 << 24   4 bytes: 0xf8, n as 24 bits
//   otherwise     5 bytes: 0xfc, n as 32 bits
//
// Registers: AX = dst write pointer, CX = lit pointer, DX = len(lit),
// BX = return value, SI = n (later reused as scratch).
//
// No bounds checks are performed here.
// NOTE(review): presumably the caller guarantees dst has room for header and
// payload - confirm.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JLT   one_byte_standalone
	CMPL  SI, $0x00000100
	JLT   two_bytes_standalone
	CMPL  SI, $0x00010000
	JLT   three_bytes_standalone
	CMPL  SI, $0x01000000
	JLT   four_bytes_standalone

	// 5-byte header.
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  memmove_long_standalone

four_bytes_standalone:
	// 4-byte header: low 16 bits via MOVW, bits 16-23 via DI.
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

	// Lengths up to 64 still fit the short copy below.
	CMPL SI, $0x40
	JL   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort: copy DX (1..64) bytes from CX to AX. Each size class
	// loads everything before storing, using overlapping head/tail moves.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong: the first and last 32 bytes are loaded into X0-X3 up
	// front and stored last; the middle is copied in 32-byte steps with
	// aligned stores, starting at offset R8 = 64 - (AX & 31).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8

	// DECQ leaves CF untouched, so JA/JNA also depend on the carry left by
	// the preceding SUBQ/ADDQ.
	DECQ DI
	JA   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)

	// Skip the empty-input path, which would zero the return value.
	JMP emit_literal_end_standalone

emit_literal_end_standalone_skip:
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET
|
|
|
|
// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes a repeat chunk for the given length into dst and returns
// the number of bytes written. offset is only encoded in the short
// "repeat with offset" form (length 9..11 and offset < 2048).
//
// Registers: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = length - 4, R8 = scratch.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX
	XORL BX, BX

repeat_encode_next:
	// R8 keeps the length as given; DX becomes length - 4.
	MOVL DX, R8
	LEAL -4(DX), DX
	CMPL R8, $0x08
	JLE  repeat_encode_len_in_tag
	CMPL R8, $0x0c
	JGE  repeat_encode_ext_len
	CMPL CX, $0x00000800
	JLT  repeat_encode_with_offset

repeat_encode_ext_len:
	CMPL DX, $0x00000104
	JLT  repeat_encode_len_1byte
	CMPL DX, $0x00010100
	JLT  repeat_encode_len_2byte
	CMPL DX, $0x0100ffff
	JLT  repeat_encode_len_3byte

	// Too long for one chunk: write a 5-byte repeat with length field
	// 0xfffffb, subtract what it covers and encode the rest again.
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	LEAL -16842747(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  repeat_encode_next

repeat_encode_len_3byte:
	// Tag 0x1d, 0x00, then (length - 4 - 65536) as 24 bits.
	LEAL -65536(DX), DX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	MOVL DX, R8
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  repeat_encode_done

repeat_encode_len_2byte:
	// Tag 0x19, 0x00, then (length - 4 - 256) as 16 bits.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, AX
	ADDQ $0x04, BX
	JMP  repeat_encode_done

repeat_encode_len_1byte:
	// Tag 0x15, 0x00, then (length - 8) as 8 bits.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  repeat_encode_done

repeat_encode_len_in_tag:
	// Two bytes: (length - 4) << 2 | 1, followed by 0x00.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP  repeat_encode_done

repeat_encode_with_offset:
	// Two bytes: (offset >> 8) << 5 | (length - 4) << 2 | 1, then the low
	// offset byte.
	XORL R8, R8
	LEAL 1(R8)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX

repeat_encode_done:
	MOVQ BX, ret+40(FP)
	RET
|
|
|
|
// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy writes a copy chunk for (offset, length) into dst and returns the
// number of bytes written. When the length is too long for a single copy
// chunk, a maximal copy is written first and the rest is encoded as a repeat
// (inlined emitRepeat logic).
//
// Registers: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = remaining length, SI = scratch.
//
// No bounds checks are performed here.
// NOTE(review): presumably the caller guarantees dst is large enough - confirm.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JL   two_byte_offset_standalone

	// Offset needs 4 bytes. For length > 64 write a 5-byte copy of length
	// 64 (tag 0xff) and continue with the remainder.
	CMPL DX, $0x40
	JLE  four_bytes_remain_standalone
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX

	// A remainder shorter than 4 is written as another 4-byte-offset copy.
	CMPL DX, $0x04
	JL   four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy

	// Too long for one repeat chunk: write a maximal one and go around.
	LEAL -16842747(DX), DX
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy:
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

four_bytes_remain_standalone:
	// 5-byte copy: tag = (length - 1) << 2 | 3, then the 32-bit offset.
	TESTL DX, DX
	JZ    gen_emit_copy_end
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end

two_byte_offset_standalone:
	// Offset fits in 2 bytes. For length > 64 write a 3-byte copy of length
	// 60 (tag 0xee) and encode the remainder as a repeat.
	CMPL DX, $0x40
	JLE  two_byte_offset_short_standalone
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy_short
	LEAL -16842747(DX), DX
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

two_byte_offset_short_standalone:
	// The 2-byte form needs length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JGE  emit_copy_three_standalone
	CMPL CX, $0x00000800
	JGE  emit_copy_three_standalone

	// tag = (offset >> 8) << 5 | (length - 4) << 2 | 1, then low offset byte.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

emit_copy_three_standalone:
	// tag = (length - 1) << 2 | 2, then the 16-bit offset.
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET
|
|
|
|
// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes one or more copy chunks for (offset, length) into
// dst without ever using repeat chunks, and returns the number of bytes
// written.
//
// Registers: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = remaining length, SI = scratch for the tag byte.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX
	XORL BX, BX

	// Offsets below 65536 use the 1- or 2-byte offset encodings.
	CMPL CX, $0x00010000
	JL   copy_nr_offset16

copy_nr_offset32_chunk:
	// While more than 64 bytes remain, write a 5-byte copy of length 64
	// (tag 0xff + 32-bit offset).
	CMPL DX, $0x40
	JLE  copy_nr_offset32_tail
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, AX
	ADDQ $0x05, BX

	// Keep chunking while at least 4 bytes remain; otherwise finish below.
	CMPL DX, $0x04
	JGE  copy_nr_offset32_chunk

copy_nr_offset32_tail:
	// Final 5-byte copy: tag = (length - 1) << 2 | 3; nothing if length == 0.
	TESTL DX, DX
	JZ    copy_nr_done
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, AX
	ADDQ  $0x05, BX
	JMP   copy_nr_done

copy_nr_offset16:
	// While more than 64 bytes remain, write a 3-byte copy of length 60
	// (tag 0xee + 16-bit offset).
	CMPL DX, $0x40
	JLE  copy_nr_offset16_tail
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  copy_nr_offset16

copy_nr_offset16_tail:
	// The 2-byte form needs length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JGE  copy_nr_three_bytes
	CMPL CX, $0x00000800
	JGE  copy_nr_three_bytes

	// tag = (offset >> 8) << 5 | (length - 4) << 2 | 1, then low offset byte.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP  copy_nr_done

copy_nr_three_bytes:
	// tag = (length - 1) << 2 | 2, then the 16-bit offset.
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX

copy_nr_done:
	MOVQ BX, ret+40(FP)
	RET
|
|
|
|
// func matchLen(a []byte, b []byte) int
//
// matchLen returns the number of leading bytes that are equal in a and b,
// examining at most len(a) bytes. len(b) is never read.
// NOTE(review): presumably callers guarantee len(b) >= len(a) - confirm.
//
// Registers: AX = a, CX = b, DX = bytes left to compare, SI = result,
// BX = scratch.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ a_len+8(FP), DX
	MOVQ b_base+24(FP), CX
	XORL SI, SI
	CMPL DX, $0x08
	JL   match_len_tail

match_len_qword:
	// Compare 8 bytes at once; a non-zero XOR means they differ.
	MOVQ  (AX)(SI*1), BX
	XORQ  (CX)(SI*1), BX
	TESTQ BX, BX
	JNZ   match_len_qword_diff
	LEAL  -8(DX), DX
	LEAL  8(SI), SI
	CMPL  DX, $0x08
	JGE   match_len_qword

match_len_tail:
	// Fewer than 8 bytes left: compare one byte at a time.
	TESTL DX, DX
	JZ    match_len_done

match_len_byte:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  match_len_done
	LEAL 1(SI), SI
	DECL DX
	JNZ  match_len_byte
	JMP  match_len_done

match_len_qword_diff:
	// Index of the lowest differing bit, divided by 8, is the number of
	// equal bytes inside this 8-byte word.
	BSFQ BX, BX
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI

match_len_done:
	MOVQ SI, ret+48(FP)
	RET
|