// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm #include "textflag.h" // func _dummy_() TEXT ·_dummy_(SB), $0 #ifdef GOAMD64_v4 #ifndef GOAMD64_v3 #define GOAMD64_v3 #endif #endif RET // func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000200, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x32, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm LEAL 1(DX), DI MOVL 12(SP), R8 MOVL DI, SI SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm repeat_extend_back_loop_encodeBlockAsm: CMPL DI, R8 JBE repeat_extend_back_end_encodeBlockAsm MOVB -1(BX)(SI*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm LEAL -1(DI), DI DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm repeat_extend_back_end_encodeBlockAsm: 
MOVL DI, SI SUBL 12(SP), SI LEAQ 5(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBlockAsm: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeBlockAsm CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBlockAsm CMPL SI, $0x00010000 JB three_bytes_repeat_emit_encodeBlockAsm CMPL SI, $0x01000000 JB four_bytes_repeat_emit_encodeBlockAsm MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP memmove_long_repeat_emit_encodeBlockAsm four_bytes_repeat_emit_encodeBlockAsm: MOVL SI, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX JMP memmove_long_repeat_emit_encodeBlockAsm three_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeBlockAsm two_bytes_repeat_emit_encodeBlockAsm: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeBlockAsm JMP memmove_long_repeat_emit_encodeBlockAsm one_byte_repeat_emit_encodeBlockAsm: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBlockAsm memmove_long_repeat_emit_encodeBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ R11, R13 DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R13*1), R11 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R13*1), X4 MOVOU -16(R10)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBlockAsm: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R9 SUBL DX, R9 LEAQ (BX)(DX*1), R10 LEAQ (BX)(SI*1), SI // matchLen XORL R12, R12 matchlen_loopback_16_repeat_extend_encodeBlockAsm: CMPL R9, $0x10 JB matchlen_match8_repeat_extend_encodeBlockAsm MOVQ (R10)(R12*1), R11 MOVQ 8(R10)(R12*1), R13 XORQ (SI)(R12*1), R11 JNZ 
matchlen_bsf_8_repeat_extend_encodeBlockAsm XORQ 8(SI)(R12*1), R13 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm LEAL -16(R9), R9 LEAL 16(R12), R12 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm matchlen_bsf_16repeat_extend_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm matchlen_match8_repeat_extend_encodeBlockAsm: CMPL R9, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm LEAL -8(R9), R9 LEAL 8(R12), R12 JMP matchlen_match4_repeat_extend_encodeBlockAsm matchlen_bsf_8_repeat_extend_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm matchlen_match4_repeat_extend_encodeBlockAsm: CMPL R9, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm MOVL (R10)(R12*1), R11 CMPL (SI)(R12*1), R11 JNE matchlen_match2_repeat_extend_encodeBlockAsm LEAL -4(R9), R9 LEAL 4(R12), R12 matchlen_match2_repeat_extend_encodeBlockAsm: CMPL R9, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm JB repeat_extend_forward_end_encodeBlockAsm MOVW (R10)(R12*1), R11 CMPW (SI)(R12*1), R11 JNE matchlen_match1_repeat_extend_encodeBlockAsm LEAL 2(R12), R12 SUBL $0x02, R9 JZ repeat_extend_forward_end_encodeBlockAsm matchlen_match1_repeat_extend_encodeBlockAsm: MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm LEAL 1(R12), R12 repeat_extend_forward_end_encodeBlockAsm: ADDL R12, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm // emitRepeat emit_repeat_again_match_repeat_encodeBlockAsm: MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_match_repeat_encodeBlockAsm CMPL R8, $0x0c JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm CMPL DI, $0x00000800 JB 
repeat_two_offset_match_repeat_encodeBlockAsm cant_repeat_two_offset_match_repeat_encodeBlockAsm: CMPL SI, $0x00000104 JB repeat_three_match_repeat_encodeBlockAsm CMPL SI, $0x00010100 JB repeat_four_match_repeat_encodeBlockAsm CMPL SI, $0x0100ffff JB repeat_five_match_repeat_encodeBlockAsm LEAL -16842747(SI), SI MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_repeat_encodeBlockAsm repeat_five_match_repeat_encodeBlockAsm: LEAL -65536(SI), SI MOVL SI, DI MOVW $0x001d, (CX) MOVW SI, 2(CX) SARL $0x10, DI MOVB DI, 4(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeBlockAsm repeat_four_match_repeat_encodeBlockAsm: LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm repeat_three_match_repeat_encodeBlockAsm: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_match_repeat_encodeBlockAsm: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_match_repeat_encodeBlockAsm: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm repeat_as_copy_encodeBlockAsm: // emitCopy CMPL DI, $0x00010000 JB two_byte_offset_repeat_as_copy_encodeBlockAsm CMPL SI, $0x40 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm MOVB $0xff, (CX) MOVL DI, 1(CX) LEAL -64(SI), SI ADDQ $0x05, CX CMPL SI, $0x04 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: CMPL SI, $0x00000104 JB 
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy CMPL SI, $0x00010100 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy CMPL SI, $0x0100ffff JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy LEAL -16842747(SI), SI MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: LEAL -65536(SI), SI MOVL SI, DI MOVW $0x001d, (CX) MOVW SI, 2(CX) SARL $0x10, DI MOVB DI, 4(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm four_bytes_remain_repeat_as_copy_encodeBlockAsm: TESTL SI, SI JZ repeat_end_emit_encodeBlockAsm XORL R8, R8 LEAL -1(R8)(SI*4), SI MOVB SI, (CX) MOVL DI, 1(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeBlockAsm two_byte_offset_repeat_as_copy_encodeBlockAsm: CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm CMPL DI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB DI, 1(CX) MOVL DI, R9 SHRL $0x08, R9 SHLL $0x05, R9 ORL R9, R8 MOVB R8, (CX) ADDQ $0x02, CX SUBL $0x08, SI // emitRepeat LEAL -4(SI), SI JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE 
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b CMPL SI, $0x00010100 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b CMPL SI, $0x0100ffff JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b LEAL -16842747(SI), SI MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: LEAL -65536(SI), SI MOVL SI, DI MOVW $0x001d, (CX) MOVW SI, 2(CX) SARL $0x10, DI MOVB DI, 4(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm long_offset_short_repeat_as_copy_encodeBlockAsm: MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short CMPL R8, $0x0c JAE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short CMPL SI, $0x00010100 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short CMPL SI, $0x0100ffff JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short LEAL -16842747(SI), SI MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAL -65536(SI), SI MOVL SI, DI MOVW $0x001d, (CX) MOVW SI, 2(CX) SARL $0x10, DI MOVB DI, 4(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeBlockAsm CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm emit_copy_three_repeat_as_copy_encodeBlockAsm: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX 
repeat_end_emit_encodeBlockAsm: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm no_repeat_found_encodeBlockAsm: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm MOVL 20(SP), DX JMP search_loop_encodeBlockAsm candidate3_match_encodeBlockAsm: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm candidate2_match_encodeBlockAsm: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm match_extend_back_loop_encodeBlockAsm: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm JMP match_extend_back_loop_encodeBlockAsm match_extend_back_end_encodeBlockAsm: MOVL DX, DI SUBL 12(SP), DI LEAQ 5(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBlockAsm: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeBlockAsm CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm CMPL R8, $0x00010000 JB three_bytes_match_emit_encodeBlockAsm CMPL R8, $0x01000000 JB four_bytes_match_emit_encodeBlockAsm MOVB $0xfc, (CX) MOVL R8, 1(CX) ADDQ $0x05, CX JMP memmove_long_match_emit_encodeBlockAsm four_bytes_match_emit_encodeBlockAsm: MOVL R8, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R8, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX JMP memmove_long_match_emit_encodeBlockAsm three_bytes_match_emit_encodeBlockAsm: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBlockAsm 
two_bytes_match_emit_encodeBlockAsm: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeBlockAsm JMP memmove_long_match_emit_encodeBlockAsm one_byte_match_emit_encodeBlockAsm: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeBlockAsm memmove_long_match_emit_encodeBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), 
X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeBlockAsm: match_nolit_loop_encodeBlockAsm: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeBlockAsm: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeBlockAsm MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm matchlen_bsf_16match_nolit_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeBlockAsm matchlen_match8_match_nolit_encodeBlockAsm: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeBlockAsm matchlen_bsf_8_match_nolit_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm matchlen_match4_match_nolit_encodeBlockAsm: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeBlockAsm LEAL -4(DI), DI 
LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeBlockAsm: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm JB match_nolit_end_encodeBlockAsm MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeBlockAsm LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm matchlen_match1_match_nolit_encodeBlockAsm: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm LEAL 1(R10), R10 match_nolit_end_encodeBlockAsm: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL SI, $0x00010000 JB two_byte_offset_match_nolit_encodeBlockAsm CMPL R10, $0x40 JBE four_bytes_remain_match_nolit_encodeBlockAsm MOVB $0xff, (CX) MOVL SI, 1(CX) LEAL -64(R10), R10 ADDQ $0x05, CX CMPL R10, $0x04 JB four_bytes_remain_match_nolit_encodeBlockAsm // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy CMPL DI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy CMPL R10, $0x00010100 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy CMPL R10, $0x0100ffff JB repeat_five_match_nolit_encodeBlockAsm_emit_copy LEAL -16842747(R10), R10 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy repeat_five_match_nolit_encodeBlockAsm_emit_copy: LEAL -65536(R10), R10 MOVL R10, SI MOVW $0x001d, (CX) MOVW R10, 2(CX) SARL $0x10, SI MOVB SI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy: LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy: LEAL 
-4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm four_bytes_remain_match_nolit_encodeBlockAsm: TESTL R10, R10 JZ match_nolit_emitcopy_end_encodeBlockAsm XORL DI, DI LEAL -1(DI)(R10*4), R10 MOVB R10, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_match_nolit_encodeBlockAsm: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeBlockAsm CMPL SI, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(CX) MOVL SI, R8 SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, DI MOVB DI, (CX) ADDQ $0x02, CX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b CMPL DI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b CMPL R10, $0x00010100 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b CMPL R10, $0x0100ffff JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b LEAL -16842747(R10), R10 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b 
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: LEAL -65536(R10), R10 MOVL R10, SI MOVW $0x001d, (CX) MOVW R10, 2(CX) SARL $0x10, SI MOVB SI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm long_offset_short_match_nolit_encodeBlockAsm: MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short CMPL DI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short CMPL R10, $0x00010100 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short CMPL R10, $0x0100ffff JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short LEAL -16842747(R10), R10 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: LEAL -65536(R10), R10 MOVL R10, SI MOVW $0x001d, (CX) MOVW R10, 2(CX) 
SARL $0x10, SI MOVB SI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_short_match_nolit_encodeBlockAsm: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeBlockAsm CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeBlockAsm LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm emit_copy_three_match_nolit_encodeBlockAsm: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm: MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x10, R8 IMULQ R9, R8 SHRQ $0x32, R8 SHLQ $0x10, SI IMULQ R9, SI SHRQ $0x32, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm INCL DX JMP search_loop_encodeBlockAsm emit_remainder_encodeBlockAsm: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 5(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBlockAsm MOVQ 
$0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBlockAsm: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBlockAsm MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBlockAsm CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm CMPL DX, $0x00010000 JB three_bytes_emit_remainder_encodeBlockAsm CMPL DX, $0x01000000 JB four_bytes_emit_remainder_encodeBlockAsm MOVB $0xfc, (CX) MOVL DX, 1(CX) ADDQ $0x05, CX JMP memmove_long_emit_remainder_encodeBlockAsm four_bytes_emit_remainder_encodeBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX JMP memmove_long_emit_remainder_encodeBlockAsm three_bytes_emit_remainder_encodeBlockAsm: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBlockAsm two_bytes_emit_remainder_encodeBlockAsm: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBlockAsm JMP memmove_long_emit_remainder_encodeBlockAsm one_byte_emit_remainder_encodeBlockAsm: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBlockAsm memmove_long_emit_remainder_encodeBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE 
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBlockAsm: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET
// func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
//
// encodeBlockAsm4MB encodes src into dst, using tmp as a hash table of
// 4-byte source positions, and returns the number of bytes written to dst.
// It returns 0 whenever the output would reach the limit kept in (SP),
// which is dst_base + src_len - 9 - src_len>>5.
//
// Registers while searching:
//   AX = tmp (hash table base)    BX = src_base
//   CX = dst write pointer        DX = current source position (s)
// Stack slots:
//   (SP)   output limit pointer
//   8(SP)  src_len - 8; searching stops once the next position reaches it
//   12(SP) source position of the first literal byte not yet emitted (nextEmit)
//   16(SP) current repeat offset (starts at 1)
//   20(SP) next position to try when no candidate matches
TEXT ·encodeBlockAsm4MB(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Clear tmp: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBlockAsm4MB
	// Initialise the stack slots described in the header: nextEmit = 0,
	// search limit = src_len-8, output limit, s = 1, repeat offset = 1.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeBlockAsm4MB:
	// Next position to try = s + 4 + (s - nextEmit)>>6, so the step grows
	// with the length of the pending literal run. Finish with the remainder
	// once that position reaches the search limit.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x06, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBlockAsm4MB
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	// Hash the low 6 bytes of src[s:] (R10) and src[s+1:] (R11): shift out
	// the top 2 bytes, multiply by the constant in R9, keep the top 14 bits.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10
	SHLQ $0x10, R11
	IMULQ R9, R11
	SHRQ $0x32, R11
	// SI, R8 = previous table entries (candidates); store s and s+1.
	MOVL (AX)(R10*4), SI
	MOVL (AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	LEAL 1(DX), R10
	MOVL R10, (AX)(R11*4)
	// R10 = hash of src[s+2:], consumed in no_repeat_found.
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10
	// Repeat check: compare the 4 bytes at s+1 with those at s+1-repeat.
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeBlockAsm4MB
	// Repeat found at DI = s+1. R8 = nextEmit, SI = matching earlier position.
	LEAL 1(DX), DI
	MOVL 12(SP), R8
	MOVL DI, SI
	SUBL 16(SP), SI
	JZ repeat_extend_back_end_encodeBlockAsm4MB

	// Extend the repeat backwards while the preceding bytes are equal,
	// stopping at nextEmit or when the earlier position reaches 0.
repeat_extend_back_loop_encodeBlockAsm4MB:
	CMPL DI, R8
	JBE repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DI), DI
	DECL SI
	JNZ repeat_extend_back_loop_encodeBlockAsm4MB

repeat_extend_back_end_encodeBlockAsm4MB:
	// Return 0 unless CX + 4 + pending literal length is below the output limit.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 4(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBlockAsm4MB:
	// Emit src[nextEmit:DI] as literals (skipped when empty) and set
	// nextEmit = DI. R10 = source pointer, R9 = length, SI = length-1.
	// Header by length-1: <60 one byte ((length-1)<<2); <256 tag 0xf0 plus
	// 1 byte; <65536 tag 0xf4 plus 2 bytes; otherwise tag 0xf8 plus 3 bytes.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00010000
	JB three_bytes_repeat_emit_encodeBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

three_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

two_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// length-1 < 64 still fits the short copy; longer runs use the long copy.
	CMPL SI, $0x40
	JB memmove_repeat_emit_encodeBlockAsm4MB
	JMP memmove_long_repeat_emit_encodeBlockAsm4MB

one_byte_repeat_emit_encodeBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeBlockAsm4MB:
	// SI = CX + length: the write pointer after the copy.
	LEAQ (CX)(R9*1), SI
	// genMemMoveShort
	// Lengths up to 8 are copied with a single 8-byte move; CX is afterwards
	// advanced by the real length only (via SI).
	CMPQ R9, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB

memmove_long_repeat_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), SI
	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0-X3) and stored
	// unaligned at the end; the loops in between store with MOVOA at
	// offsets derived from CX & 31.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R12
	SHRQ $0x05, R12
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(R10)(R13*1), R11
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R9, R13
	JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	// DX = s + 5, the first byte after the 4 bytes compared at s+1.
	// R10 = src+DX, SI = src+DX-repeat, R9 = source bytes left after DX.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI
	// matchLen
	// R12 accumulates the number of equal bytes at R10 and SI.
	XORL R12, R12

matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x10
	JB matchlen_match8_repeat_extend_encodeBlockAsm4MB
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	XORQ (SI)(R12*1), R11
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	XORQ 8(SI)(R12*1), R13
	JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB

matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
	// Lowest differing bit index / 8 = equal bytes within the second word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13
#else
	BSFQ R13, R13
#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match8_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x08
	JB matchlen_match4_repeat_extend_encodeBlockAsm4MB
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB

matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
	// Lowest differing bit index / 8 = equal bytes within this word.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match4_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x04
	JB matchlen_match2_repeat_extend_encodeBlockAsm4MB
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
	LEAL -4(R9), R9
	LEAL 4(R12), R12

matchlen_match2_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x01
	JE matchlen_match1_repeat_extend_encodeBlockAsm4MB
	JB repeat_extend_forward_end_encodeBlockAsm4MB
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match1_repeat_extend_encodeBlockAsm4MB:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R12), R12

repeat_extend_forward_end_encodeBlockAsm4MB:
	// DX = end of the match, SI = match length (DX - match start DI),
	// DI = repeat offset. R8 still holds the nextEmit value read before the
	// backward extension; when it is zero the match is written with
	// emitCopy instead of emitRepeat.
	ADDL R12, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI
	TESTL R8, R8
	JZ repeat_as_copy_encodeBlockAsm4MB
	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JB repeat_two_offset_match_repeat_encodeBlockAsm4MB

cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL SI, $0x00000104
	JB repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00010100
	JB repeat_four_match_repeat_encodeBlockAsm4MB
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	// DI = offset, SI = length. Offsets of 65536 and above use a 5-byte
	// form (tag byte plus 32-bit offset); smaller offsets branch to the
	// shorter forms below.
	CMPL DI, $0x00010000
	JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
	CMPL SI, $0x40
	JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0xff, (CX)
	MOVL DI, 1(CX)
	LEAL -64(SI), SI
	ADDQ $0x05, CX
	CMPL SI, $0x04
	JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL SI, $0x00000104
	JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00010100
	JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	// Nothing left to encode when the remaining length is zero.
	TESTL SI, SI
	JZ repeat_end_emit_encodeBlockAsm4MB
	XORL R8, R8
	LEAL -1(R8)(SI*4), SI
	MOVB SI, (CX)
	MOVL DI, 1(CX)
	ADDQ $0x05, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI
	// emitRepeat
	LEAL -4(SI), SI
	JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP and carry no label, so they are never executed.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL SI, $0x00000104
	JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x00010100
	JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	// 3-byte form (tag 0xee plus 16-bit offset) takes 60 bytes off the
	// length; the rest is written by the emitRepeat sequence below.
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL SI, $0x00000104
	JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00010100
	JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, SI
	ORL $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	// Length <= 64 and offset < 65536: use the 2-byte form when length < 12
	// and offset < 2048, otherwise the 3-byte form.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeBlockAsm4MB

emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeBlockAsm4MB:
	// nextEmit = end of the repeat; resume searching from there.
	MOVL DX, 12(SP)
	JMP search_loop_encodeBlockAsm4MB

no_repeat_found_encodeBlockAsm4MB:
	// Try the three candidates in turn: the table entry for hash(s) against
	// the 4 bytes at s, the entry for hash(s+1) against s+1, and the entry
	// for hash(s+2) against s+2. The table slot for hash(s+2) is set to s+2.
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ candidate2_match_encodeBlockAsm4MB
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ candidate3_match_encodeBlockAsm4MB
	// No match: continue at the next position computed in search_loop.
	MOVL 20(SP), DX
	JMP search_loop_encodeBlockAsm4MB

candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, DX
	JMP candidate_match_encodeBlockAsm4MB

candidate2_match_encodeBlockAsm4MB:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeBlockAsm4MB:
	// DX = match position, SI = candidate position, DI = nextEmit.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBlockAsm4MB

	// Extend the match backwards while the preceding bytes are equal,
	// stopping at nextEmit or when the candidate reaches position 0.
match_extend_back_loop_encodeBlockAsm4MB:
	CMPL DX, DI
	JBE match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeBlockAsm4MB
	JMP match_extend_back_loop_encodeBlockAsm4MB

match_extend_back_end_encodeBlockAsm4MB:
	// Return 0 unless CX + 4 + pending literal length is below the output limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 4(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBlockAsm4MB:
	// Emit src[nextEmit:DX] as literals (skipped when empty) and set
	// nextEmit = DX. DI = source pointer, R9 = length, R8 = length-1; the
	// header selection matches the literal emission in the repeat path.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JB one_byte_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00000100
	JB two_bytes_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00010000
	JB three_bytes_match_emit_encodeBlockAsm4MB
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R8, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_match_emit_encodeBlockAsm4MB

three_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeBlockAsm4MB

two_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB memmove_match_emit_encodeBlockAsm4MB
	JMP memmove_long_match_emit_encodeBlockAsm4MB

one_byte_match_emit_encodeBlockAsm4MB:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBlockAsm4MB:
	// R8 = CX + length: the write pointer after the copy.
	LEAQ (CX)(R9*1), R8
	// genMemMoveShort
	CMPQ R9, $0x08
	JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ R8, CX
	JMP emit_literal_done_match_emit_encodeBlockAsm4MB

memmove_long_match_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), R8
	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ R9, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(DI)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R9, R12
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ R8, CX

emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	// Store the new offset (s - candidate) in 16(SP), skip the 4 bytes
	// already compared, and measure how far the match continues.
	// R8 = src+DX, SI = src+candidate, DI = source bytes left after DX.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI
	// matchLen
	// R10 accumulates the number of equal bytes at R8 and SI.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x10
	JB matchlen_match8_match_nolit_encodeBlockAsm4MB
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	XORQ 8(SI)(R10*1), R11
	JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB

matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP match_nolit_end_encodeBlockAsm4MB

matchlen_match8_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeBlockAsm4MB
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_match_nolit_encodeBlockAsm4MB

matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP match_nolit_end_encodeBlockAsm4MB

matchlen_match4_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeBlockAsm4MB
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeBlockAsm4MB
	JB match_nolit_end_encodeBlockAsm4MB
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ match_nolit_end_encodeBlockAsm4MB

matchlen_match1_match_nolit_encodeBlockAsm4MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R10), R10

match_nolit_end_encodeBlockAsm4MB:
	// DX = end of the match, SI = offset, R10 = total match length
	// (extension + 4), nextEmit = DX.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)
	// emitCopy
	CMPL SI, $0x00010000
	JB two_byte_offset_match_nolit_encodeBlockAsm4MB
	CMPL R10, $0x40
	JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB
	MOVB $0xff, (CX)
	MOVL SI, 1(CX)
	LEAL -64(R10), R10
	ADDQ $0x05, CX
	CMPL R10, $0x04
	JB four_bytes_remain_match_nolit_encodeBlockAsm4MB
	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R10, $0x00000104
	JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R10, $0x00010100
	JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	// Nothing left to encode when the remaining length is zero.
	TESTL R10, R10
	JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
	XORL DI, DI
	LEAL -1(DI)(R10*4), R10
	MOVB R10, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE long_offset_short_match_nolit_encodeBlockAsm4MB
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10
	// emitRepeat
	LEAL -4(R10), R10
	JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP and carry no label, so they are never executed.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

long_offset_short_match_nolit_encodeBlockAsm4MB:
	// 3-byte form (tag 0xee plus 16-bit offset) takes 60 bytes off the
	// length; the rest is written by the emitRepeat sequence below.
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R10, $0x00000104
	JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R10, $0x00010100
	JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	// Length <= 64 and offset < 65536: use the 2-byte form when length < 12
	// and offset < 2048, otherwise the 3-byte form.
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBlockAsm4MB

emit_copy_three_match_nolit_encodeBlockAsm4MB:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeBlockAsm4MB:
	// Finish with the remainder once s reaches the search limit; otherwise
	// load 8 bytes at s-2 and return 0 if CX has reached the output limit.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeBlockAsm4MB
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm4MB:
	// Store s-2 under hash(src[s-2:]); look up and replace the entry for
	// hash(src[s:]). If the 4 bytes at that candidate equal the 4 bytes at
	// s, encode another match right away with no literals in between.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, SI
	SHLQ $0x10, R8
	IMULQ R9, R8
	SHRQ $0x32, R8
	SHLQ $0x10, SI
	IMULQ R9, SI
	SHRQ $0x32, SI
	LEAL -2(DX), R9
	LEAQ (AX)(SI*4), R10
	MOVL (R10), SI
	MOVL R9, (AX)(R8*4)
	MOVL DX, (R10)
	CMPL (BX)(SI*1), DI
	JEQ match_nolit_loop_encodeBlockAsm4MB
	INCL DX
	JMP search_loop_encodeBlockAsm4MB

emit_remainder_encodeBlockAsm4MB:
	// Final stage: emit src[nextEmit:src_len] as literals. Return 0 unless
	// CX + 4 + remaining length is below the output limit. From here on AX,
	// BX and DX are reused as scratch registers.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBlockAsm4MB:
	// AX = source pointer, SI = length, DX = length-1 (selects header size).
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JB three_bytes_emit_remainder_encodeBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

three_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

two_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeBlockAsm4MB
	JMP memmove_long_emit_remainder_encodeBlockAsm4MB

one_byte_emit_remainder_encodeBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBlockAsm4MB:
	// DX = CX + length: the write pointer after the copy. BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX
	// genMemMoveShort
	// Unlike the copies in the search loop, this one has separate cases for
	// 1-2, 3 and 4-7 bytes, so lengths below 8 are copied exactly.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB

memmove_long_emit_remainder_encodeBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX
	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm12B(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000080, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm12B: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm12B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm12B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x000000cf1bbcdcbb, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 SHLQ $0x18, R11 IMULQ R9, R11 SHRQ $0x34, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm12B LEAL 1(DX), DI MOVL 12(SP), R8 MOVL DI, SI SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm12B repeat_extend_back_loop_encodeBlockAsm12B: CMPL DI, R8 JBE repeat_extend_back_end_encodeBlockAsm12B MOVB -1(BX)(SI*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm12B LEAL -1(DI), DI DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm12B repeat_extend_back_end_encodeBlockAsm12B: MOVL DI, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB
repeat_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBlockAsm12B: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeBlockAsm12B CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBlockAsm12B JB three_bytes_repeat_emit_encodeBlockAsm12B three_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeBlockAsm12B two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeBlockAsm12B JMP memmove_long_repeat_emit_encodeBlockAsm12B one_byte_repeat_emit_encodeBlockAsm12B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 
16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm12B: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBlockAsm12B memmove_long_repeat_emit_encodeBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ R11, R13 DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R10)(R13*1), R11 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R10)(R13*1), X4 MOVOU -16(R10)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R9 SUBL DX, R9 LEAQ (BX)(DX*1), R10 LEAQ (BX)(SI*1), SI // matchLen XORL R12, R12 matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: CMPL R9, $0x10 JB matchlen_match8_repeat_extend_encodeBlockAsm12B MOVQ (R10)(R12*1), R11 MOVQ 8(R10)(R12*1), R13 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B XORQ 8(SI)(R12*1), R13 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B LEAL -16(R9), R9 LEAL 16(R12), R12 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B matchlen_bsf_16repeat_extend_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP 
repeat_extend_forward_end_encodeBlockAsm12B matchlen_match8_repeat_extend_encodeBlockAsm12B: CMPL R9, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm12B MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B LEAL -8(R9), R9 LEAL 8(R12), R12 JMP matchlen_match4_repeat_extend_encodeBlockAsm12B matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_match4_repeat_extend_encodeBlockAsm12B: CMPL R9, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm12B MOVL (R10)(R12*1), R11 CMPL (SI)(R12*1), R11 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B LEAL -4(R9), R9 LEAL 4(R12), R12 matchlen_match2_repeat_extend_encodeBlockAsm12B: CMPL R9, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm12B JB repeat_extend_forward_end_encodeBlockAsm12B MOVW (R10)(R12*1), R11 CMPW (SI)(R12*1), R11 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B LEAL 2(R12), R12 SUBL $0x02, R9 JZ repeat_extend_forward_end_encodeBlockAsm12B matchlen_match1_repeat_extend_encodeBlockAsm12B: MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R12), R12 repeat_extend_forward_end_encodeBlockAsm12B: ADDL R12, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm12B // emitRepeat MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_match_repeat_encodeBlockAsm12B CMPL R8, $0x0c JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B CMPL DI, $0x00000800 JB repeat_two_offset_match_repeat_encodeBlockAsm12B cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: CMPL SI, $0x00000104 JB repeat_three_match_repeat_encodeBlockAsm12B LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_match_repeat_encodeBlockAsm12B: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) 
ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_match_repeat_encodeBlockAsm12B: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_match_repeat_encodeBlockAsm12B: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_as_copy_encodeBlockAsm12B: // emitCopy CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B CMPL DI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX SUBL $0x08, SI // emitRepeat LEAL -4(SI), SI JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B 
long_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX // emitRepeat MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm12B emit_copy_three_repeat_as_copy_encodeBlockAsm12B: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeBlockAsm12B: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm12B no_repeat_found_encodeBlockAsm12B: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm12B SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm12B MOVL 
R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), DX JMP search_loop_encodeBlockAsm12B candidate3_match_encodeBlockAsm12B: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm12B candidate2_match_encodeBlockAsm12B: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm12B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm12B match_extend_back_loop_encodeBlockAsm12B: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm12B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm12B JMP match_extend_back_loop_encodeBlockAsm12B match_extend_back_end_encodeBlockAsm12B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBlockAsm12B: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm12B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeBlockAsm12B CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm12B JB three_bytes_match_emit_encodeBlockAsm12B three_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBlockAsm12B two_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeBlockAsm12B JMP memmove_long_match_emit_encodeBlockAsm12B one_byte_match_emit_encodeBlockAsm12B: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBlockAsm12B: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R9, $0x20 JBE 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm12B: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeBlockAsm12B memmove_long_match_emit_encodeBlockAsm12B: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) 
MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeBlockAsm12B: match_nolit_loop_encodeBlockAsm12B: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeBlockAsm12B: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeBlockAsm12B MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B matchlen_bsf_16match_nolit_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeBlockAsm12B matchlen_match8_match_nolit_encodeBlockAsm12B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm12B MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeBlockAsm12B matchlen_bsf_8_match_nolit_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm12B matchlen_match4_match_nolit_encodeBlockAsm12B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm12B MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeBlockAsm12B LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeBlockAsm12B: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm12B JB match_nolit_end_encodeBlockAsm12B MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeBlockAsm12B LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm12B matchlen_match1_match_nolit_encodeBlockAsm12B: MOVB (R8)(R10*1), R9 CMPB 
(SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm12B LEAL 1(R10), R10 match_nolit_end_encodeBlockAsm12B: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B CMPL SI, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm12B MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b CMPL DI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B long_offset_short_match_nolit_encodeBlockAsm12B: MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX // emitRepeat MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL DI, $0x0c JAE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeBlockAsm12B CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeBlockAsm12B LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm12B emit_copy_three_match_nolit_encodeBlockAsm12B: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm12B: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm12B MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm12B: MOVQ $0x000000cf1bbcdcbb, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x18, R8 IMULQ R9, R8 SHRQ $0x34, R8 SHLQ $0x18, SI IMULQ R9, SI SHRQ $0x34, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ 
match_nolit_loop_encodeBlockAsm12B INCL DX JMP search_loop_encodeBlockAsm12B emit_remainder_encodeBlockAsm12B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBlockAsm12B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBlockAsm12B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm12B JB three_bytes_emit_remainder_encodeBlockAsm12B three_bytes_emit_remainder_encodeBlockAsm12B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBlockAsm12B two_bytes_emit_remainder_encodeBlockAsm12B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBlockAsm12B JMP memmove_long_emit_remainder_encodeBlockAsm12B one_byte_emit_remainder_encodeBlockAsm12B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 
2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm12B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBlockAsm12B memmove_long_emit_remainder_encodeBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, 
(CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBlockAsm12B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm10B(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000020, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm10B: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm10B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm10B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x9e3779b1, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 SHLQ $0x20, R11 IMULQ R9, R11 SHRQ $0x36, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm10B LEAL 1(DX), DI MOVL 12(SP), R8 MOVL DI, SI SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm10B repeat_extend_back_loop_encodeBlockAsm10B: CMPL DI, R8 JBE repeat_extend_back_end_encodeBlockAsm10B MOVB -1(BX)(SI*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm10B LEAL -1(DI), DI DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm10B repeat_extend_back_end_encodeBlockAsm10B: MOVL DI, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB 
repeat_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBlockAsm10B: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeBlockAsm10B CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBlockAsm10B JB three_bytes_repeat_emit_encodeBlockAsm10B three_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeBlockAsm10B two_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeBlockAsm10B JMP memmove_long_repeat_emit_encodeBlockAsm10B one_byte_repeat_emit_encodeBlockAsm10B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 
16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm10B: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBlockAsm10B memmove_long_repeat_emit_encodeBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ R11, R13 DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R10)(R13*1), R11 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R10)(R13*1), X4 MOVOU -16(R10)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R9 SUBL DX, R9 LEAQ (BX)(DX*1), R10 LEAQ (BX)(SI*1), SI // matchLen XORL R12, R12 matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: CMPL R9, $0x10 JB matchlen_match8_repeat_extend_encodeBlockAsm10B MOVQ (R10)(R12*1), R11 MOVQ 8(R10)(R12*1), R13 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B XORQ 8(SI)(R12*1), R13 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B LEAL -16(R9), R9 LEAL 16(R12), R12 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B matchlen_bsf_16repeat_extend_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP 
repeat_extend_forward_end_encodeBlockAsm10B matchlen_match8_repeat_extend_encodeBlockAsm10B: CMPL R9, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm10B MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B LEAL -8(R9), R9 LEAL 8(R12), R12 JMP matchlen_match4_repeat_extend_encodeBlockAsm10B matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_match4_repeat_extend_encodeBlockAsm10B: CMPL R9, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm10B MOVL (R10)(R12*1), R11 CMPL (SI)(R12*1), R11 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B LEAL -4(R9), R9 LEAL 4(R12), R12 matchlen_match2_repeat_extend_encodeBlockAsm10B: CMPL R9, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm10B JB repeat_extend_forward_end_encodeBlockAsm10B MOVW (R10)(R12*1), R11 CMPW (SI)(R12*1), R11 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B LEAL 2(R12), R12 SUBL $0x02, R9 JZ repeat_extend_forward_end_encodeBlockAsm10B matchlen_match1_repeat_extend_encodeBlockAsm10B: MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R12), R12 repeat_extend_forward_end_encodeBlockAsm10B: ADDL R12, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm10B // emitRepeat MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_match_repeat_encodeBlockAsm10B CMPL R8, $0x0c JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B CMPL DI, $0x00000800 JB repeat_two_offset_match_repeat_encodeBlockAsm10B cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: CMPL SI, $0x00000104 JB repeat_three_match_repeat_encodeBlockAsm10B LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_match_repeat_encodeBlockAsm10B: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) 
ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_match_repeat_encodeBlockAsm10B: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_match_repeat_encodeBlockAsm10B: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_as_copy_encodeBlockAsm10B: // emitCopy CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B CMPL DI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX SUBL $0x08, SI // emitRepeat LEAL -4(SI), SI JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B 
long_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX // emitRepeat MOVL SI, R8 LEAL -4(SI), SI CMPL R8, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short CMPL R8, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short CMPL DI, $0x00000800 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm10B emit_copy_three_repeat_as_copy_encodeBlockAsm10B: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeBlockAsm10B: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm10B no_repeat_found_encodeBlockAsm10B: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm10B SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm10B MOVL 
R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), DX JMP search_loop_encodeBlockAsm10B candidate3_match_encodeBlockAsm10B: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm10B candidate2_match_encodeBlockAsm10B: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm10B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm10B match_extend_back_loop_encodeBlockAsm10B: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm10B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm10B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBlockAsm10B JMP match_extend_back_loop_encodeBlockAsm10B match_extend_back_end_encodeBlockAsm10B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBlockAsm10B: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm10B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeBlockAsm10B CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm10B JB three_bytes_match_emit_encodeBlockAsm10B three_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBlockAsm10B two_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeBlockAsm10B JMP memmove_long_match_emit_encodeBlockAsm10B one_byte_match_emit_encodeBlockAsm10B: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBlockAsm10B: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R9, $0x20 JBE 
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm10B: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeBlockAsm10B memmove_long_match_emit_encodeBlockAsm10B: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) 
MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeBlockAsm10B: match_nolit_loop_encodeBlockAsm10B: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeBlockAsm10B: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeBlockAsm10B MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B matchlen_bsf_16match_nolit_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeBlockAsm10B matchlen_match8_match_nolit_encodeBlockAsm10B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm10B MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeBlockAsm10B matchlen_bsf_8_match_nolit_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm10B matchlen_match4_match_nolit_encodeBlockAsm10B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm10B MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeBlockAsm10B LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeBlockAsm10B: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm10B JB match_nolit_end_encodeBlockAsm10B MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeBlockAsm10B LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm10B matchlen_match1_match_nolit_encodeBlockAsm10B: MOVB (R8)(R10*1), R9 CMPB 
(SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm10B LEAL 1(R10), R10 match_nolit_end_encodeBlockAsm10B: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B CMPL SI, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm10B MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b CMPL DI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B long_offset_short_match_nolit_encodeBlockAsm10B: MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX // emitRepeat MOVL R10, DI LEAL -4(R10), R10 CMPL DI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short CMPL DI, $0x0c JAE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short CMPL SI, $0x00000800 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeBlockAsm10B CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeBlockAsm10B LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm10B emit_copy_three_match_nolit_encodeBlockAsm10B: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm10B: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm10B MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm10B: MOVQ $0x9e3779b1, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x20, R8 IMULQ R9, R8 SHRQ $0x36, R8 SHLQ $0x20, SI IMULQ R9, SI SHRQ $0x36, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ 
match_nolit_loop_encodeBlockAsm10B INCL DX JMP search_loop_encodeBlockAsm10B emit_remainder_encodeBlockAsm10B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBlockAsm10B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBlockAsm10B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm10B JB three_bytes_emit_remainder_encodeBlockAsm10B three_bytes_emit_remainder_encodeBlockAsm10B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBlockAsm10B two_bytes_emit_remainder_encodeBlockAsm10B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBlockAsm10B JMP memmove_long_emit_remainder_encodeBlockAsm10B one_byte_emit_remainder_encodeBlockAsm10B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 
2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm10B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBlockAsm10B memmove_long_emit_remainder_encodeBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, 
(CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBlockAsm10B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm8B(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000008, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBlockAsm8B: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm8B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x04, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBlockAsm8B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x9e3779b1, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x38, R10 SHLQ $0x20, R11 IMULQ R9, R11 SHRQ $0x38, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x38, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeBlockAsm8B LEAL 1(DX), DI MOVL 12(SP), R8 MOVL DI, SI SUBL 16(SP), SI JZ repeat_extend_back_end_encodeBlockAsm8B repeat_extend_back_loop_encodeBlockAsm8B: CMPL DI, R8 JBE repeat_extend_back_end_encodeBlockAsm8B MOVB -1(BX)(SI*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeBlockAsm8B LEAL -1(DI), DI DECL SI JNZ repeat_extend_back_loop_encodeBlockAsm8B repeat_extend_back_end_encodeBlockAsm8B: MOVL DI, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB 
repeat_dst_size_check_encodeBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBlockAsm8B: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeBlockAsm8B CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBlockAsm8B JB three_bytes_repeat_emit_encodeBlockAsm8B three_bytes_repeat_emit_encodeBlockAsm8B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeBlockAsm8B two_bytes_repeat_emit_encodeBlockAsm8B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeBlockAsm8B JMP memmove_long_repeat_emit_encodeBlockAsm8B one_byte_repeat_emit_encodeBlockAsm8B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeBlockAsm8B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, 
-32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_repeat_emit_encodeBlockAsm8B: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBlockAsm8B memmove_long_repeat_emit_encodeBlockAsm8B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R12 SHRQ $0x05, R12 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ R11, R13 DECQ R12 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R10)(R13*1), R11 LEAQ -32(CX)(R13*1), R14 emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R10)(R13*1), X4 MOVOU -16(R10)(R13*1), X5 MOVOA X4, -32(CX)(R13*1) MOVOA X5, -16(CX)(R13*1) ADDQ $0x20, R13 CMPQ R9, R13 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeBlockAsm8B: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R9 SUBL DX, R9 LEAQ (BX)(DX*1), R10 LEAQ (BX)(SI*1), SI // matchLen XORL R12, R12 matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: CMPL R9, $0x10 JB matchlen_match8_repeat_extend_encodeBlockAsm8B MOVQ (R10)(R12*1), R11 MOVQ 8(R10)(R12*1), R13 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B XORQ 8(SI)(R12*1), R13 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B LEAL -16(R9), R9 LEAL 16(R12), R12 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B matchlen_bsf_16repeat_extend_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm8B 
matchlen_match8_repeat_extend_encodeBlockAsm8B: CMPL R9, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm8B MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B LEAL -8(R9), R9 LEAL 8(R12), R12 JMP matchlen_match4_repeat_extend_encodeBlockAsm8B matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_match4_repeat_extend_encodeBlockAsm8B: CMPL R9, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm8B MOVL (R10)(R12*1), R11 CMPL (SI)(R12*1), R11 JNE matchlen_match2_repeat_extend_encodeBlockAsm8B LEAL -4(R9), R9 LEAL 4(R12), R12 matchlen_match2_repeat_extend_encodeBlockAsm8B: CMPL R9, $0x01 JE matchlen_match1_repeat_extend_encodeBlockAsm8B JB repeat_extend_forward_end_encodeBlockAsm8B MOVW (R10)(R12*1), R11 CMPW (SI)(R12*1), R11 JNE matchlen_match1_repeat_extend_encodeBlockAsm8B LEAL 2(R12), R12 SUBL $0x02, R9 JZ repeat_extend_forward_end_encodeBlockAsm8B matchlen_match1_repeat_extend_encodeBlockAsm8B: MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm8B LEAL 1(R12), R12 repeat_extend_forward_end_encodeBlockAsm8B: ADDL R12, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI TESTL R8, R8 JZ repeat_as_copy_encodeBlockAsm8B // emitRepeat MOVL SI, DI LEAL -4(SI), SI CMPL DI, $0x08 JBE repeat_two_match_repeat_encodeBlockAsm8B CMPL DI, $0x0c JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: CMPL SI, $0x00000104 JB repeat_three_match_repeat_encodeBlockAsm8B LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_match_repeat_encodeBlockAsm8B: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_match_repeat_encodeBlockAsm8B: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ 
$0x02, CX JMP repeat_end_emit_encodeBlockAsm8B XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_as_copy_encodeBlockAsm8B: // emitCopy CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B CMPL DI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX SUBL $0x08, SI // emitRepeat LEAL -4(SI), SI JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b MOVL SI, DI LEAL -4(SI), SI CMPL DI, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b CMPL DI, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: CMPL SI, $0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B long_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX // emitRepeat MOVL SI, DI LEAL -4(SI), SI CMPL DI, $0x08 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short CMPL DI, $0x0c JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: CMPL SI, 
$0x00000104 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short LEAL -256(SI), SI MOVW $0x0019, (CX) MOVW SI, 2(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: LEAL -4(SI), SI MOVW $0x0015, (CX) MOVB SI, 2(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: SHLL $0x02, SI ORL $0x01, SI MOVW SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B XORQ R8, R8 LEAL 1(R8)(SI*4), SI MOVB DI, 1(CX) SARL $0x08, DI SHLL $0x05, DI ORL DI, SI MOVB SI, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBlockAsm8B emit_copy_three_repeat_as_copy_encodeBlockAsm8B: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeBlockAsm8B: MOVL DX, 12(SP) JMP search_loop_encodeBlockAsm8B no_repeat_found_encodeBlockAsm8B: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm8B SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeBlockAsm8B MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeBlockAsm8B MOVL 20(SP), DX JMP search_loop_encodeBlockAsm8B candidate3_match_encodeBlockAsm8B: ADDL $0x02, DX JMP candidate_match_encodeBlockAsm8B candidate2_match_encodeBlockAsm8B: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeBlockAsm8B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBlockAsm8B match_extend_back_loop_encodeBlockAsm8B: CMPL DX, DI JBE match_extend_back_end_encodeBlockAsm8B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBlockAsm8B LEAL -1(DX), DX DECL SI JZ 
match_extend_back_end_encodeBlockAsm8B JMP match_extend_back_loop_encodeBlockAsm8B match_extend_back_end_encodeBlockAsm8B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBlockAsm8B: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeBlockAsm8B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeBlockAsm8B CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeBlockAsm8B JB three_bytes_match_emit_encodeBlockAsm8B three_bytes_match_emit_encodeBlockAsm8B: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBlockAsm8B two_bytes_match_emit_encodeBlockAsm8B: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeBlockAsm8B JMP memmove_long_match_emit_encodeBlockAsm8B one_byte_match_emit_encodeBlockAsm8B: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBlockAsm8B: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B 
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBlockAsm8B: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeBlockAsm8B memmove_long_match_emit_encodeBlockAsm8B: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeBlockAsm8B: match_nolit_loop_encodeBlockAsm8B: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeBlockAsm8B: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeBlockAsm8B MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B LEAL -16(DI), DI LEAL 16(R10), R10 JMP 
matchlen_loopback_16_match_nolit_encodeBlockAsm8B matchlen_bsf_16match_nolit_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeBlockAsm8B matchlen_match8_match_nolit_encodeBlockAsm8B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm8B MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeBlockAsm8B matchlen_bsf_8_match_nolit_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeBlockAsm8B matchlen_match4_match_nolit_encodeBlockAsm8B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm8B MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeBlockAsm8B LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeBlockAsm8B: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBlockAsm8B JB match_nolit_end_encodeBlockAsm8B MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeBlockAsm8B LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeBlockAsm8B matchlen_match1_match_nolit_encodeBlockAsm8B: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm8B LEAL 1(R10), R10 match_nolit_end_encodeBlockAsm8B: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B CMPL SI, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm8B MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b MOVL R10, SI LEAL -4(R10), R10 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b CMPL SI, $0x0c JAE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B long_offset_short_match_nolit_encodeBlockAsm8B: MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX // emitRepeat MOVL R10, SI LEAL -4(R10), R10 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short LEAL -256(R10), R10 MOVW $0x0019, (CX) MOVW R10, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (CX) MOVB R10, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B XORQ DI, DI LEAL 1(DI)(R10*4), R10 MOVB SI, 1(CX) SARL $0x08, SI SHLL $0x05, SI ORL SI, R10 MOVB R10, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B 
two_byte_offset_short_match_nolit_encodeBlockAsm8B: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeBlockAsm8B LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBlockAsm8B emit_copy_three_match_nolit_encodeBlockAsm8B: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeBlockAsm8B: CMPL DX, 8(SP) JAE emit_remainder_encodeBlockAsm8B MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBlockAsm8B: MOVQ $0x9e3779b1, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x20, R8 IMULQ R9, R8 SHRQ $0x38, R8 SHLQ $0x20, SI IMULQ R9, SI SHRQ $0x38, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_encodeBlockAsm8B INCL DX JMP search_loop_encodeBlockAsm8B emit_remainder_encodeBlockAsm8B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBlockAsm8B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBlockAsm8B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBlockAsm8B JB three_bytes_emit_remainder_encodeBlockAsm8B three_bytes_emit_remainder_encodeBlockAsm8B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBlockAsm8B two_bytes_emit_remainder_encodeBlockAsm8B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBlockAsm8B JMP memmove_long_emit_remainder_encodeBlockAsm8B one_byte_emit_remainder_encodeBlockAsm8B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX 
memmove_emit_remainder_encodeBlockAsm8B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm8B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBlockAsm8B memmove_long_emit_remainder_encodeBlockAsm8B: LEAQ (CX)(SI*1), DX MOVL SI, BX 
// genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBlockAsm8B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int // Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00001200, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBetterBlockAsm: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBetterBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -6(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBetterBlockAsm: MOVL DX, SI SUBL 12(SP), SI SHRL $0x07, SI CMPL SI, $0x63 JBE check_maxskip_ok_encodeBetterBlockAsm LEAL 100(DX), SI JMP check_maxskip_cont_encodeBetterBlockAsm check_maxskip_ok_encodeBetterBlockAsm: 
LEAL 1(DX)(SI*1), SI check_maxskip_cont_encodeBetterBlockAsm: CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL (AX)(R10*4), SI MOVL 524288(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 524288(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm CMPQ R11, DI JNE no_short_found_encodeBetterBlockAsm MOVL R8, SI JMP candidate_match_encodeBetterBlockAsm no_short_found_encodeBetterBlockAsm: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm MOVL 20(SP), DX JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm DECL DX MOVL R8, SI candidate_match_encodeBetterBlockAsm: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: CMPL DX, DI JBE match_extend_back_end_encodeBetterBlockAsm MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: MOVL DX, DI SUBL 12(SP), DI LEAQ 5(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm: MOVL DX, DI ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeBetterBlockAsm MOVQ 
(R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm matchlen_bsf_16match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match8_match_nolit_encodeBetterBlockAsm: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match4_match_nolit_encodeBetterBlockAsm: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeBetterBlockAsm: CMPL R8, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm JB match_nolit_end_encodeBetterBlockAsm MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeBetterBlockAsm matchlen_match1_match_nolit_encodeBetterBlockAsm: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm LEAL 1(R12), R12 match_nolit_end_encodeBetterBlockAsm: MOVL DX, R8 SUBL SI, R8 // Check if repeat CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm CMPL R12, $0x01 JA match_length_ok_encodeBetterBlockAsm CMPL R8, $0x0000ffff JBE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), DX INCL DX JMP 
search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeBetterBlockAsm CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm CMPL SI, $0x00010000 JB three_bytes_match_emit_encodeBetterBlockAsm CMPL SI, $0x01000000 JB four_bytes_match_emit_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP memmove_long_match_emit_encodeBetterBlockAsm four_bytes_match_emit_encodeBetterBlockAsm: MOVL SI, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX JMP memmove_long_match_emit_encodeBetterBlockAsm three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL R11, 
(CX) MOVL R10, -4(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_encodeBetterBlockAsm: ADDL R12, DX ADDL $0x04, 
R12 MOVL DX, 12(SP) // emitCopy CMPL R8, $0x00010000 JB two_byte_offset_match_nolit_encodeBetterBlockAsm CMPL R12, $0x40 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm MOVB $0xff, (CX) MOVL R8, 1(CX) LEAL -64(R12), R12 ADDQ $0x05, CX CMPL R12, $0x04 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy CMPL R12, $0x00010100 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy CMPL R12, $0x0100ffff JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy LEAL -16842747(R12), R12 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -65536(R12), R12 MOVL R12, R8 MOVW $0x001d, (CX) MOVW R12, 2(CX) SARL $0x10, R8 MOVB R8, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL 
R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm four_bytes_remain_match_nolit_encodeBetterBlockAsm: TESTL R12, R12 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm XORL SI, SI LEAL -1(SI)(R12*4), R12 MOVB R12, (CX) MOVL R8, 1(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm CMPL R8, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm MOVL $0x00000001, SI LEAL 16(SI), SI MOVB R8, 1(CX) MOVL R8, R9 SHRL $0x08, R9 SHLL $0x05, R9 ORL R9, SI MOVB SI, (CX) ADDQ $0x02, CX SUBL $0x08, R12 // emitRepeat LEAL -4(R12), R12 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL R12, $0x00010100 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL R12, $0x0100ffff JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b LEAL -16842747(R12), R12 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -65536(R12), R12 MOVL R12, R8 MOVW $0x001d, (CX) MOVW R12, 2(CX) SARL $0x10, R8 MOVB R8, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -256(R12), R12 MOVW $0x0019, 
(CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm long_offset_short_match_nolit_encodeBetterBlockAsm: MOVB $0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL R12, $0x00010100 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL R12, $0x0100ffff JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short LEAL -16842747(R12), R12 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -65536(R12), R12 MOVL R12, R8 MOVW $0x001d, (CX) MOVW R12, 2(CX) SARL $0x10, R8 MOVB R8, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW 
R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_short_match_nolit_encodeBetterBlockAsm: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeBetterBlockAsm CMPL R8, $0x00000800 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy_three_match_nolit_encodeBetterBlockAsm: LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_is_repeat_encodeBetterBlockAsm: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_repeat_encodeBetterBlockAsm CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm CMPL SI, $0x00010000 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm CMPL SI, $0x01000000 JB four_bytes_match_emit_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm four_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVL SI, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX JMP 
memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_repeat_encodeBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL R11, (CX) MOVL R10, -4(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 
MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitRepeat emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: CMPL R12, $0x00000104 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm CMPL R12, $0x00010100 JB 
repeat_four_match_nolit_repeat_encodeBetterBlockAsm CMPL R12, $0x0100ffff JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm LEAL -16842747(R12), R12 MOVL $0xfffb001d, (CX) MOVB $0xff, 4(CX) ADDQ $0x05, CX JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm repeat_five_match_nolit_repeat_encodeBetterBlockAsm: LEAL -65536(R12), R12 MOVL R12, R8 MOVW $0x001d, (CX) MOVW R12, 2(CX) SARL $0x10, R8 MOVB R8, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_repeat_encodeBetterBlockAsm: LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_repeat_encodeBetterBlockAsm: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX match_nolit_emitcopy_end_encodeBetterBlockAsm: CMPL DX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R12 IMULQ SI, R12 SHRQ $0x2f, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x32, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 524288(AX)(R11*4) MOVL R14, 524288(AX)(R13*4) LEAQ 1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeBetterBlockAsm: CMPQ R8, 
R9 JAE search_loop_encodeBetterBlockAsm MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 SHLQ $0x08, R11 IMULQ SI, R11 SHRQ $0x2f, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 5(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00010000 JB three_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x01000000 JB four_bytes_emit_remainder_encodeBetterBlockAsm MOVB $0xfc, (CX) MOVL DX, 1(CX) ADDQ $0x05, CX JMP memmove_long_emit_remainder_encodeBetterBlockAsm four_bytes_emit_remainder_encodeBetterBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX JMP memmove_long_emit_remainder_encodeBetterBlockAsm three_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBetterBlockAsm two_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBetterBlockAsm JMP memmove_long_emit_remainder_encodeBetterBlockAsm one_byte_emit_remainder_encodeBetterBlockAsm: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm memmove_long_emit_remainder_encodeBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 
	// The instructions from here to the first RET are the end of the routine
	// that starts before this point (rest of its genMemMoveLong copy, then its
	// return). They are reproduced unchanged.
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal runs and copy/repeat
// operations, using tmp as scratch space for two position tables, and returns
// the number of bytes written to dst (final write pointer minus dst_base).
// Returns 0 from any of the three places where the projected output would not
// stay below the limit stored at 0(SP).
//
// Registers while searching:
//   AX = tmp base, BX = src base, CX = dst write pointer, DX = src position.
// Stack slots:
//   0(SP)  = dst limit: dst_base + (src_len - 6 - src_len>>5)
//   8(SP)  = search limit: src_len - 8
//   12(SP) = src position up to which input has already been emitted
//   16(SP) = offset of the last accepted match (starts at 0)
//   20(SP) = src position at which the search resumes after a miss
// tmp layout:
//   first table  at tmp+0,      indexed by a 17-bit hash (SHRQ $0x2f), 4-byte entries
//   second table at tmp+524288, indexed by a 14-bit hash (SHRQ $0x32), 4-byte entries
TEXT ·encodeBetterBlockAsm4MB(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Clear tmp: 0x1200 iterations of 128 bytes = 589824 bytes.
	MOVQ $0x00001200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm4MB:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBetterBlockAsm4MB
	// Set up the stack slots described in the header; the search starts at
	// src position 1.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -6(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeBetterBlockAsm4MB:
	// Next position = DX + 1 + ((DX - 12(SP)) >> 7), except that when the
	// shifted distance exceeds 99 the step is fixed at 100.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JBE check_maxskip_ok_encodeBetterBlockAsm4MB
	LEAL 100(DX), SI
	JMP check_maxskip_cont_encodeBetterBlockAsm4MB

check_maxskip_ok_encodeBetterBlockAsm4MB:
	LEAL 1(DX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm4MB:
	// Stop searching once the next position reaches the search limit.
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm4MB
	// DI = the 8 bytes at the current position.
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	// R10 = hash of the low 7 bytes (top byte shifted out) -> first table.
	// R11 = hash of the low 4 bytes -> second table.
	MOVQ $0x00cf1bbcdcbfa563, R9
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R10
	MOVQ DI, R11
	SHLQ $0x08, R10
	IMULQ R9, R10
	SHRQ $0x2f, R10
	SHLQ $0x20, R11
	IMULQ SI, R11
	SHRQ $0x32, R11
	// Load both candidates (SI from the first table, R8 from the second) and
	// store the current position in both slots.
	MOVL (AX)(R10*4), SI
	MOVL 524288(AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	MOVL DX, 524288(AX)(R11*4)
	// Prefer a candidate whose full 8 bytes equal the current 8 bytes.
	MOVQ (BX)(SI*1), R10
	MOVQ (BX)(R8*1), R11
	CMPQ R10, DI
	JEQ candidate_match_encodeBetterBlockAsm4MB
	CMPQ R11, DI
	JNE no_short_found_encodeBetterBlockAsm4MB
	MOVL R8, SI
	JMP candidate_match_encodeBetterBlockAsm4MB

no_short_found_encodeBetterBlockAsm4MB:
	// Otherwise accept a candidate whose low 4 bytes match.
	CMPL R10, DI
	JEQ candidate_match_encodeBetterBlockAsm4MB
	CMPL R11, DI
	JEQ candidateS_match_encodeBetterBlockAsm4MB
	// No match: resume at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP search_loop_encodeBetterBlockAsm4MB

candidateS_match_encodeBetterBlockAsm4MB:
	// Only the second-table candidate matched on 4 bytes. Look up the first
	// table for position DX+1 (bytes shifted down by one), record DX+1 there,
	// and use that candidate if its 4 bytes match; otherwise restore DX and
	// fall back to the second-table candidate in R8.
	SHRQ $0x08, DI
	MOVQ DI, R10
	SHLQ $0x08, R10
	IMULQ R9, R10
	SHRQ $0x2f, R10
	MOVL (AX)(R10*4), SI
	INCL DX
	MOVL DX, (AX)(R10*4)
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeBetterBlockAsm4MB
	DECL DX
	MOVL R8, SI

candidate_match_encodeBetterBlockAsm4MB:
	// Extend the match backwards while the preceding bytes are equal, DX is
	// still above 12(SP), and the candidate position SI has not reached 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBetterBlockAsm4MB

match_extend_back_loop_encodeBetterBlockAsm4MB:
	CMPL DX, DI
	JBE match_extend_back_end_encodeBetterBlockAsm4MB
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeBetterBlockAsm4MB
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeBetterBlockAsm4MB
	JMP match_extend_back_loop_encodeBetterBlockAsm4MB

match_extend_back_end_encodeBetterBlockAsm4MB:
	// Require CX + (DX - 12(SP)) + 4 to be below the dst limit; otherwise
	// return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 4(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm4MB:
	// DI = match start. Advance both positions by 4 and count how many further
	// bytes agree: R8 = bytes left in src, R9/R10 = the two compare pointers,
	// R12 = running count.
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
	// Compare 16 bytes per iteration as two 8-byte XORs.
	CMPL R8, $0x10
	JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	XORQ 8(R10)(R12*1), R13
	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
	// Index of the lowest set bit of the XOR, divided by 8, is the number of
	// equal bytes in the second 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13
#else
	BSFQ R13, R13
#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x08
	JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x04
	JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x01
	JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	JB match_nolit_end_encodeBetterBlockAsm4MB
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE match_nolit_end_encodeBetterBlockAsm4MB
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm4MB:
	// R8 = offset between the current position and the candidate.
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ match_is_repeat_encodeBetterBlockAsm4MB
	// Reject the match when the extra length in R12 is at most 1 and the
	// offset is above 0xffff; the search then resumes at 20(SP)+1.
	CMPL R12, $0x01
	JA match_length_ok_encodeBetterBlockAsm4MB
	CMPL R8, $0x0000ffff
	JBE match_length_ok_encodeBetterBlockAsm4MB
	MOVL 20(SP), DX
	INCL DX
	JMP search_loop_encodeBetterBlockAsm4MB

match_length_ok_encodeBetterBlockAsm4MB:
	// Remember the offset, then emit src[12(SP):DI] as a literal run if it is
	// non-empty. R9 = run length, R10 = source pointer, SI = length - 1.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: below 60 one byte, below 256 two
	// bytes (0xf0), below 65536 three bytes (0xf4), otherwise four (0xf8).
	CMPL SI, $0x3c
	JB one_byte_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JB three_bytes_match_emit_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB

three_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB

two_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// Runs with length-1 below 64 use the short copy, the rest the long copy.
	CMPL SI, $0x40
	JB memmove_match_emit_encodeBetterBlockAsm4MB
	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB

one_byte_match_emit_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm4MB:
	// SI = dst pointer after the run; the copy variant is chosen by R9.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB

memmove_long_match_emit_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front (X0..X3) and stored
	// last; the middle is copied in 32-byte steps with aligned stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R14
	SUBQ R11, R14
	DECQ R13
	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(R10)(R14*1), R11
	LEAQ -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R11
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
	// DX = end of the match, R12 = total match length (counted bytes + 4);
	// everything up to DX is now accounted for.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// Offsets below 65536 take the two-byte-offset path further down.
	CMPL R8, $0x00010000
	JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
	CMPL R12, $0x40
	JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
	// Emit a 5-byte copy (tag 0xff + 32-bit offset) covering 64 bytes; any
	// remainder of at least 4 goes out as a repeat.
	MOVB $0xff, (CX)
	MOVL R8, 1(CX)
	LEAL -64(R12), R12
	ADDQ $0x05, CX
	CMPL R12, $0x04
	JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R8, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R12, $0x00010100
	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
	// Emit whatever length is left (if any) as one 5-byte copy with the tag
	// byte R12*4 - 1 followed by the 32-bit offset.
	TESTL R12, R12
	JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	XORL SI, SI
	LEAL -1(SI)(R12*4), R12
	MOVB R12, (CX)
	MOVL R8, 1(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R12, $0x40
	JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
	// Length above 64 with offset below 2048: emit a 2-byte copy for 8 bytes
	// and send the rest out as a repeat.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	LEAL -4(R12), R12
	JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP with no label in between, so they are not reachable
	// by fall-through. Left as is because the file is generated.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL R8, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL R12, $0x00010100
	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	// Length above 64 with offset 2048..65535: emit a 3-byte copy (tag 0xee +
	// 16-bit offset) for 60 bytes, then the rest as a repeat.
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R8, $0x00000800
	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R12, $0x00010100
	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	// Length at most 64: a 2-byte copy when length is below 12 and offset is
	// below 2048, otherwise a 3-byte copy.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

match_is_repeat_encodeBetterBlockAsm4MB:
	// The offset equals the one in 16(SP): emit pending literals exactly as in
	// the non-repeat path above, then encode the match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB

memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R14
	SUBQ R11, R14
	DECQ R13
	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(R10)(R14*1), R11
	LEAQ -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R11
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x00010100
	JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	// Finish with the remainder once DX reaches the search limit; return 0 if
	// the write pointer has reached the dst limit.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm4MB
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	// Record positions inside the match in the tables. DI = match start + 1,
	// R9 = DX - 2. Positions DI and R9 go into the first table, DI+1 and R9+1
	// into the second.
	MOVQ $0x00cf1bbcdcbfa563, SI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(DI), DI
	LEAQ -2(DX), R9
	MOVQ (BX)(DI*1), R10
	MOVQ 1(BX)(DI*1), R11
	MOVQ (BX)(R9*1), R12
	MOVQ 1(BX)(R9*1), R13
	SHLQ $0x08, R10
	IMULQ SI, R10
	SHRQ $0x2f, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x32, R11
	SHLQ $0x08, R12
	IMULQ SI, R12
	SHRQ $0x2f, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x32, R13
	LEAQ 1(DI), R8
	LEAQ 1(R9), R14
	MOVL DI, (AX)(R10*4)
	MOVL R9, (AX)(R12*4)
	MOVL R8, 524288(AX)(R11*4)
	MOVL R14, 524288(AX)(R13*4)
	// R8 = (DI + R9 + 1) / 2. The loop below adds first-table entries for two
	// cursors, DI and R8, each advancing by 2, until R8 reaches R9.
	LEAQ 1(R9)(DI*1), R8
	SHRQ $0x01, R8
	ADDQ $0x01, DI
	SUBQ $0x01, R9

index_loop_encodeBetterBlockAsm4MB:
	CMPQ R8, R9
	JAE search_loop_encodeBetterBlockAsm4MB
	MOVQ (BX)(DI*1), R10
	MOVQ (BX)(R8*1), R11
	SHLQ $0x08, R10
	IMULQ SI, R10
	SHRQ $0x2f, R10
	SHLQ $0x08, R11
	IMULQ SI, R11
	SHRQ $0x2f, R11
	MOVL DI, (AX)(R10*4)
	MOVL R8, (AX)(R11*4)
	ADDQ $0x02, DI
	ADDQ $0x02, R8
	JMP index_loop_encodeBetterBlockAsm4MB

emit_remainder_encodeBetterBlockAsm4MB:
	// Require CX + (src_len - 12(SP)) + 4 to be below the dst limit;
	// otherwise return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm4MB:
	// Emit src[12(SP):src_len] as one final literal run, if non-empty.
	// From here AX = source pointer, SI = run length, DX = length - 1, and BX
	// is reused as scratch.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00010000
	JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB

three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB

two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeBetterBlockAsm4MB
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB

one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// This variant handles lengths 1..3 with byte/word moves.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB

memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000280, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
DECQ DX JNZ zero_loop_encodeBetterBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -6(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBetterBlockAsm12B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 1(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm12B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x34, R11 MOVL (AX)(R10*4), SI MOVL 65536(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 65536(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm12B CMPQ R11, DI JNE no_short_found_encodeBetterBlockAsm12B MOVL R8, SI JMP candidate_match_encodeBetterBlockAsm12B no_short_found_encodeBetterBlockAsm12B: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm12B CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm12B MOVL 20(SP), DX JMP search_loop_encodeBetterBlockAsm12B candidateS_match_encodeBetterBlockAsm12B: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm12B DECL DX MOVL R8, SI candidate_match_encodeBetterBlockAsm12B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm12B match_extend_back_loop_encodeBetterBlockAsm12B: CMPL DX, DI JBE match_extend_back_end_encodeBetterBlockAsm12B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBetterBlockAsm12B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm12B JMP match_extend_back_loop_encodeBetterBlockAsm12B match_extend_back_end_encodeBetterBlockAsm12B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB 
match_dst_size_check_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm12B: MOVL DX, DI ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B MOVQ (R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_match8_match_nolit_encodeBetterBlockAsm12B: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_match4_match_nolit_encodeBetterBlockAsm12B: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeBetterBlockAsm12B: CMPL R8, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B JB match_nolit_end_encodeBetterBlockAsm12B MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeBetterBlockAsm12B 
matchlen_match1_match_nolit_encodeBetterBlockAsm12B: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm12B LEAL 1(R12), R12 match_nolit_end_encodeBetterBlockAsm12B: MOVL DX, R8 SUBL SI, R8 // Check if repeat CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm12B MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeBetterBlockAsm12B CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm12B JB three_bytes_match_emit_encodeBetterBlockAsm12B three_bytes_match_emit_encodeBetterBlockAsm12B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBetterBlockAsm12B two_bytes_match_emit_encodeBetterBlockAsm12B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeBetterBlockAsm12B JMP memmove_long_match_emit_encodeBetterBlockAsm12B one_byte_match_emit_encodeBetterBlockAsm12B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL R11, (CX) MOVL R10, -4(CX)(R9*1) JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm12B: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B memmove_long_match_emit_encodeBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_encodeBetterBlockAsm12B: ADDL R12, DX 
ADDL $0x04, R12 MOVL DX, 12(SP) // emitCopy CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B CMPL R8, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B MOVL $0x00000001, SI LEAL 16(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX SUBL $0x08, R12 // emitRepeat LEAL -4(R12), R12 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B long_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVB $0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX // emitRepeat MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short CMPL SI, $0x0c JAE 
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B CMPL R8, $0x00000800 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B emit_copy_three_match_nolit_encodeBetterBlockAsm12B: LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B match_is_repeat_encodeBetterBlockAsm12B: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B JB 
three_bytes_match_emit_repeat_encodeBetterBlockAsm12B three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_repeat_encodeBetterBlockAsm12B JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B one_byte_match_emit_repeat_encodeBetterBlockAsm12B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_repeat_encodeBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL R11, (CX) MOVL R10, -4(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitRepeat MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: CMPL R12, $0x00000104 
JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX match_nolit_emitcopy_end_encodeBetterBlockAsm12B: CMPL DX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm12B CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 SHLQ $0x10, R12 IMULQ SI, R12 SHRQ $0x32, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x34, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 65536(AX)(R11*4) MOVL R14, 65536(AX)(R13*4) LEAQ 1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeBetterBlockAsm12B: CMPQ R8, R9 JAE search_loop_encodeBetterBlockAsm12B MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 SHLQ $0x10, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm12B MOVQ 
$0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBetterBlockAsm12B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm12B JB three_bytes_emit_remainder_encodeBetterBlockAsm12B three_bytes_emit_remainder_encodeBetterBlockAsm12B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B two_bytes_emit_remainder_encodeBetterBlockAsm12B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBetterBlockAsm12B JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B one_byte_emit_remainder_encodeBetterBlockAsm12B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBetterBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B memmove_long_emit_remainder_encodeBetterBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int // Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm10B(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x000000a0, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeBetterBlockAsm10B: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeBetterBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -6(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeBetterBlockAsm10B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 1(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeBetterBlockAsm10B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x34, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x36, R11 MOVL (AX)(R10*4), SI MOVL 16384(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 16384(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm10B CMPQ R11, DI JNE no_short_found_encodeBetterBlockAsm10B MOVL R8, SI JMP candidate_match_encodeBetterBlockAsm10B no_short_found_encodeBetterBlockAsm10B: CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm10B CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm10B MOVL 20(SP), DX JMP search_loop_encodeBetterBlockAsm10B candidateS_match_encodeBetterBlockAsm10B: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x34, R10 MOVL (AX)(R10*4), 
SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm10B DECL DX MOVL R8, SI candidate_match_encodeBetterBlockAsm10B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm10B match_extend_back_loop_encodeBetterBlockAsm10B: CMPL DX, DI JBE match_extend_back_end_encodeBetterBlockAsm10B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeBetterBlockAsm10B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeBetterBlockAsm10B JMP match_extend_back_loop_encodeBetterBlockAsm10B match_extend_back_end_encodeBetterBlockAsm10B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm10B: MOVL DX, DI ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B MOVQ (R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_match8_match_nolit_encodeBetterBlockAsm10B: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 
#endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_match4_match_nolit_encodeBetterBlockAsm10B: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeBetterBlockAsm10B: CMPL R8, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B JB match_nolit_end_encodeBetterBlockAsm10B MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeBetterBlockAsm10B matchlen_match1_match_nolit_encodeBetterBlockAsm10B: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm10B LEAL 1(R12), R12 match_nolit_end_encodeBetterBlockAsm10B: MOVL DX, R8 SUBL SI, R8 // Check if repeat CMPL 16(SP), R8 JEQ match_is_repeat_encodeBetterBlockAsm10B MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeBetterBlockAsm10B CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm10B JB three_bytes_match_emit_encodeBetterBlockAsm10B three_bytes_match_emit_encodeBetterBlockAsm10B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeBetterBlockAsm10B two_bytes_match_emit_encodeBetterBlockAsm10B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeBetterBlockAsm10B JMP memmove_long_match_emit_encodeBetterBlockAsm10B one_byte_match_emit_encodeBetterBlockAsm10B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeBetterBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 CMPQ R9, $0x08 JB 
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL R11, (CX) MOVL R10, -4(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeBetterBlockAsm10B: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B memmove_long_match_emit_encodeBetterBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU 
(R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_encodeBetterBlockAsm10B: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitCopy CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B CMPL R8, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B MOVL $0x00000001, SI LEAL 16(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX SUBL $0x08, R12 // emitRepeat LEAL -4(R12), R12 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: SHLL $0x02, R12 ORL $0x01, 
R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B long_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVB $0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX // emitRepeat MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: CMPL R12, $0x00000104 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B CMPL R8, $0x00000800 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ 
$0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B emit_copy_three_match_nolit_encodeBetterBlockAsm10B: LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B match_is_repeat_encodeBetterBlockAsm10B: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_repeat_encodeBetterBlockAsm10B JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B one_byte_match_emit_repeat_encodeBetterBlockAsm10B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_repeat_encodeBetterBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x04 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 CMPQ R9, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: MOVL (R10), R11 MOVL R11, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: MOVL (R10), R11 MOVL -4(R10)(R9*1), R10 MOVL 
R11, (CX) MOVL R10, -4(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 
16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitRepeat MOVL R12, SI LEAL -4(R12), R12 CMPL SI, $0x08 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B CMPL SI, $0x0c JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B CMPL R8, $0x00000800 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: CMPL R12, $0x00000104 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B LEAL -256(R12), R12 MOVW $0x0019, (CX) MOVW R12, 2(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: LEAL -4(R12), R12 MOVW $0x0015, (CX) MOVB R12, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: SHLL $0x02, R12 ORL $0x01, R12 MOVW R12, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: XORQ SI, SI LEAL 1(SI)(R12*4), R12 MOVB R8, 1(CX) SARL $0x08, R8 SHLL $0x05, R8 ORL R8, R12 MOVB R12, (CX) ADDQ $0x02, CX match_nolit_emitcopy_end_encodeBetterBlockAsm10B: CMPL DX, 8(SP) JAE emit_remainder_encodeBetterBlockAsm10B CMPQ CX, (SP) JB match_nolit_dst_ok_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 SHLQ $0x10, R12 IMULQ SI, R12 SHRQ $0x34, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x36, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 16384(AX)(R11*4) MOVL R14, 16384(AX)(R13*4) LEAQ 
1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeBetterBlockAsm10B: CMPQ R8, R9 JAE search_loop_encodeBetterBlockAsm10B MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 SHLQ $0x10, R11 IMULQ SI, R11 SHRQ $0x34, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm10B emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeBetterBlockAsm10B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeBetterBlockAsm10B JB three_bytes_emit_remainder_encodeBetterBlockAsm10B three_bytes_emit_remainder_encodeBetterBlockAsm10B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B two_bytes_emit_remainder_encodeBetterBlockAsm10B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeBetterBlockAsm10B JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B one_byte_emit_remainder_encodeBetterBlockAsm10B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeBetterBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B memmove_long_emit_remainder_encodeBetterBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// encodeBetterBlockAsm8B encodes src into dst and returns the number of bytes
// written to dst (final write pointer minus dst_base). It returns 0 when one
// of the three output-space checks finds that the write pointer would not stay
// below the limit dst_base + len(src) - 6 - len(src)/32.
//
// tmp holds two tables of 32-bit src offsets, both zeroed on entry:
//   - offset 0:    1024 entries, indexed by a 10-bit hash of 6 input bytes
//   - offset 4096:  256 entries, indexed by an 8-bit hash of 4 input bytes
//
// Registers that are stable through the main loop:
//   AX = tmp, BX = src base, CX = dst write pointer, DX = current src offset.
//
// Stack slots:
//   0(SP)  dst limit pointer
//   8(SP)  len(src) - 8, the src offset at which searching stops
//   12(SP) src offset of the first byte that has not been emitted yet
//   16(SP) offset (distance) of the previous match, used to detect repeats
//   20(SP) next src offset to try when the current one yields no match
//
// func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Zero tmp: 0x28 (40) iterations of 128 bytes = 5120 bytes.
	MOVQ $0x00000028, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeBetterBlockAsm8B
	// 12(SP) = 0: nothing has been emitted yet.
	MOVL $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8; (SP) = dst_base + len(src) - 6 - len(src)/32.
	MOVQ src_len+32(FP), DX
	LEAQ -6(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	// Start at src offset 1 with no previous match offset.
	MOVL $0x00000001, DX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeBetterBlockAsm8B:
	// Next offset to try = DX + 1 + (DX - 12(SP))/16; stop searching once it
	// reaches 8(SP).
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x04, SI
	LEAL 1(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm8B
	// DI = 8 bytes at src[DX]; remember the next offset in 20(SP).
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	// R10 = ((DI << 16) * 0xcf1bbcdcbf9b) >> 54: 10-bit hash of the low 6 bytes.
	// R11 = ((DI << 32) * 0x9e3779b1) >> 56:      8-bit hash of the low 4 bytes.
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R10
	MOVQ DI, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x36, R10
	SHLQ $0x20, R11
	IMULQ SI, R11
	SHRQ $0x38, R11
	// Load the offsets previously stored under both hashes (SI from the
	// 10-bit table, R8 from the 8-bit table), then record DX in both tables.
	MOVL (AX)(R10*4), SI
	MOVL 4096(AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	MOVL DX, 4096(AX)(R11*4)
	// Accept a candidate whose 8 bytes equal DI; SI is tried before R8.
	MOVQ (BX)(SI*1), R10
	MOVQ (BX)(R8*1), R11
	CMPQ R10, DI
	JEQ candidate_match_encodeBetterBlockAsm8B
	CMPQ R11, DI
	JNE no_short_found_encodeBetterBlockAsm8B
	MOVL R8, SI
	JMP candidate_match_encodeBetterBlockAsm8B

no_short_found_encodeBetterBlockAsm8B:
	// No 8-byte match: compare only the low 4 bytes of each candidate.
	CMPL R10, DI
	JEQ candidate_match_encodeBetterBlockAsm8B
	CMPL R11, DI
	JEQ candidateS_match_encodeBetterBlockAsm8B
	// Neither candidate matches: continue at the offset saved in 20(SP).
	MOVL 20(SP), DX
	JMP search_loop_encodeBetterBlockAsm8B

candidateS_match_encodeBetterBlockAsm8B:
	// The 8-bit-table candidate (R8) matched on 4 bytes. First probe the
	// 10-bit table with the hash of the bytes starting at DX+1 (DI >> 8),
	// recording DX+1 there. If the 4 bytes at that entry equal the low 4 bytes
	// of the shifted DI, use it with DX advanced by one; otherwise restore DX
	// and use R8.
	SHRQ $0x08, DI
	MOVQ DI, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x36, R10
	MOVL (AX)(R10*4), SI
	INCL DX
	MOVL DX, (AX)(R10*4)
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeBetterBlockAsm8B
	DECL DX
	MOVL R8, SI

candidate_match_encodeBetterBlockAsm8B:
	// Extend the match backwards while the preceding bytes are equal. Stop when
	// DX reaches 12(SP) or the candidate offset SI reaches 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeBetterBlockAsm8B

match_extend_back_loop_encodeBetterBlockAsm8B:
	CMPL DX, DI
	JBE match_extend_back_end_encodeBetterBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeBetterBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeBetterBlockAsm8B
	JMP match_extend_back_loop_encodeBetterBlockAsm8B

match_extend_back_end_encodeBetterBlockAsm8B:
	// Output-space check: CX + (DX - 12(SP)) + 3 must be below the limit in
	// (SP); otherwise return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm8B:
	// DI = match start. Advance both positions by 4, then measure how many
	// further bytes match: R8 = bytes left in src after DX, R9 = &src[DX],
	// R10 = &src[SI].
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 accumulates the number of equal bytes, comparing 16 at a time, then
	// 8, 4, 2 and 1. When an 8-byte XOR is non-zero, its trailing-zero count
	// divided by 8 is the number of equal bytes in that word.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x10
	JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	XORQ 8(R10)(R12*1), R13
	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13
#else
	BSFQ R13, R13
#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP match_nolit_end_encodeBetterBlockAsm8B

matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x08
	JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP match_nolit_end_encodeBetterBlockAsm8B

matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x04
	JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
	// One byte left -> single-byte compare; none left -> done.
	CMPL R8, $0x01
	JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	JB match_nolit_end_encodeBetterBlockAsm8B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ match_nolit_end_encodeBetterBlockAsm8B

matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE match_nolit_end_encodeBetterBlockAsm8B
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm8B:
	// R8 = DX - SI, the match offset (distance back to the candidate).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// Same offset as the previous match -> repeat path; otherwise remember it.
	CMPL 16(SP), R8
	JEQ match_is_repeat_encodeBetterBlockAsm8B
	MOVL R8, 16(SP)
	// Emit the pending literals src[12(SP):DI], if any. R10 = source pointer,
	// R9 = length, SI = length - 1; 12(SP) is advanced to DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header selection by length-1: below 60 -> one byte; below 256 -> 0xf0
	// plus one length byte; otherwise 0xf4 plus two length bytes.
	CMPL SI, $0x3c
	JB one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_encodeBetterBlockAsm8B
	// NOTE(review): this JB tests the same flags as the JB above, so it is never
	// taken; execution reaches the label below by fall-through.
	JB three_bytes_match_emit_encodeBetterBlockAsm8B

three_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// length-1 below 64 uses the short copy, anything larger the long copy.
	CMPL SI, $0x40
	JB memmove_match_emit_encodeBetterBlockAsm8B
	JMP memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm8B:
	// SI = dst pointer after the R9 literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from (R10) to (CX). Each size class loads everything before
	// storing, using a head and a tail access that may overlap.
	CMPQ R9, $0x04
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R9, $0x08
	JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
	// R9 <= 4: a single 4-byte load/store (4 bytes are written even if R9 < 4;
	// CX is then set to SI = CX + R9).
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B

memmove_long_match_emit_encodeBetterBlockAsm8B:
	// SI = dst pointer after the R9 literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// X0-X3 hold the first and last 32 source bytes and are stored last. The
	// middle is copied in 32-byte chunks using aligned stores (MOVOA):
	// R14 starts at 64 - (CX & 31), so CX + R14 - 32 is 32-byte aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R14
	SUBQ R11, R14
	DECQ R13
	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R10)(R14*1), R11
	LEAQ -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R11
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	// DX = end of the match, R12 = full match length (counted bytes + 4);
	// everything up to DX now counts as emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// Offset in R8, length in R12:
	//   length <= 64   -> two_byte_offset_short
	//   offset >= 2048 -> long_offset_short
	//   otherwise write two bytes (17 | (offset>>8)<<5, then the low offset
	//   byte), take 8 off the length and encode the rest with the repeat code.
	CMPL R12, $0x40
	JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
	CMPL R8, $0x00000800
	JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	LEAL -4(R12), R12
	JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP and carry no label, so they cannot be reached.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// R12 < 260 -> 3-byte form; otherwise 4 bytes: word 0x0019, then the
	// 16-bit value R12 - 256.
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// 3 bytes: word 0x0015, then the byte R12 - 4.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// 2 bytes: the 16-bit value (R12 << 2) | 1.
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	// NOTE(review): unreachable; follows an unconditional JMP without a label.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

long_offset_short_match_nolit_encodeBetterBlockAsm8B:
	// 3 bytes: 0xee followed by the 16-bit offset; 60 is taken off the length
	// and the rest is encoded with the repeat code below.
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	// NOTE(review): the JAE target is the very next label, so both outcomes
	// continue at the same instruction.
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	// NOTE(review): unreachable; follows an unconditional JMP without a label.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	// Length <= 64. SI = length << 2. Lengths of 12 or more use the 3-byte
	// form; shorter ones write two bytes: (SI - 15) | (offset>>8)<<5, then the
	// low offset byte.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	// 3 bytes: (length << 2) - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

match_is_repeat_encodeBetterBlockAsm8B:
	// Repeat path: emit the pending literals src[12(SP):DI] exactly as above
	// (here R9 = source pointer, R8 = length, SI = length - 1), then encode the
	// match with the repeat code only.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	// NOTE(review): never taken (same flags as the JB above); falls through.
	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B

three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	// SI = dst pointer after the R8 literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	// Copy R8 bytes from (R9) to (CX), by size class.
	CMPQ R8, $0x04
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R8, $0x08
	JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (CX)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (CX)
	MOVL R9, -4(CX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B

memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	// SI = dst pointer after the R8 literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	// Same scheme as the long copy on the non-repeat path: X0-X3 keep the first
	// and last 32 bytes, the middle uses aligned 32-byte stores.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R11
	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	// DX = end of the match, R12 = full match length; mark everything up to DX
	// as emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
	// NOTE(review): the JAE target is the very next label, so both outcomes
	// continue at the same instruction.
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
	CMPL R12, $0x00000104
	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	SHLL $0x02, R12
	ORL $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	// NOTE(review): unreachable; follows an unconditional JMP without a label.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	// Past the search limit -> flush the remainder. Otherwise return 0 unless
	// CX is still below the dst limit.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeBetterBlockAsm8B
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm8B:
	// Record positions around the match in the tables. With DI = match start + 1
	// and R9 = DX - 2: DI and R9 go into the 10-bit table (6-byte hash), DI + 1
	// and R9 + 1 into the 8-bit table (4-byte hash).
	MOVQ $0x0000cf1bbcdcbf9b, SI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(DI), DI
	LEAQ -2(DX), R9
	MOVQ (BX)(DI*1), R10
	MOVQ 1(BX)(DI*1), R11
	MOVQ (BX)(R9*1), R12
	MOVQ 1(BX)(R9*1), R13
	SHLQ $0x10, R10
	IMULQ SI, R10
	SHRQ $0x36, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x38, R11
	SHLQ $0x10, R12
	IMULQ SI, R12
	SHRQ $0x36, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x38, R13
	LEAQ 1(DI), R8
	LEAQ 1(R9), R14
	MOVL DI, (AX)(R10*4)
	MOVL R9, (AX)(R12*4)
	MOVL R8, 4096(AX)(R11*4)
	MOVL R14, 4096(AX)(R13*4)
	// R8 = (DI + R9 + 1) / 2; then step DI forward and R9 back by one.
	LEAQ 1(R9)(DI*1), R8
	SHRQ $0x01, R8
	ADDQ $0x01, DI
	SUBQ $0x01, R9

index_loop_encodeBetterBlockAsm8B:
	// While R8 < R9: store DI and R8 in the 10-bit table under the hash of the
	// bytes at each position, then advance both by 2. Afterwards resume the
	// search at DX.
	CMPQ R8, R9
	JAE search_loop_encodeBetterBlockAsm8B
	MOVQ (BX)(DI*1), R10
	MOVQ (BX)(R8*1), R11
	SHLQ $0x10, R10
	IMULQ SI, R10
	SHRQ $0x36, R10
	SHLQ $0x10, R11
	IMULQ SI, R11
	SHRQ $0x36, R11
	MOVL DI, (AX)(R10*4)
	MOVL R8, (AX)(R11*4)
	ADDQ $0x02, DI
	ADDQ $0x02, R8
	JMP index_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
	// Output-space check for the trailing literals:
	// CX + (len(src) - 12(SP)) + 3 must be below the limit, else return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm8B:
	// Emit src[12(SP):len(src)] as literals, if any. From here on AX is the
	// source pointer (the tmp pointer is overwritten), SI = length and
	// DX = length - 1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeBetterBlockAsm8B
	// NOTE(review): never taken (same flags as the JB above); falls through.
	JB three_bytes_emit_remainder_encodeBetterBlockAsm8B

three_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm8B:
	// DX = dst pointer after the copy; BX = length (the src base in BX is
	// overwritten here).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX bytes from (AX) to (CX). Unlike the copies inside the main loop,
	// this one has exact classes for 1-2 and 3 bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	// DX = dst pointer after the copy; BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0-X3 keep the first and last 32 bytes; the middle uses aligned 32-byte
	// stores starting at offset R8 = 64 - (CX & 31).
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBlockAsm
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm:
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x06, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10
	SHLQ $0x10, R11
	IMULQ R9, R11
	SHRQ $0x32, R11
	MOVL (AX)(R10*4), SI
	MOVL (AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	LEAL 1(DX), R10
	MOVL R10, (AX)(R11*4)
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x32, R10
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeSnappyBlockAsm
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeSnappyBlockAsm

repeat_extend_back_loop_encodeSnappyBlockAsm:
	CMPL DI, SI
JBE repeat_extend_back_end_encodeSnappyBlockAsm MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeSnappyBlockAsm LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm repeat_extend_back_end_encodeSnappyBlockAsm: MOVL DI, SI SUBL 12(SP), SI LEAQ 5(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeSnappyBlockAsm: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm MOVL DI, R8 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R9 SUBL SI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeSnappyBlockAsm CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeSnappyBlockAsm CMPL SI, $0x00010000 JB three_bytes_repeat_emit_encodeSnappyBlockAsm CMPL SI, $0x01000000 JB four_bytes_repeat_emit_encodeSnappyBlockAsm MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm four_bytes_repeat_emit_encodeSnappyBlockAsm: MOVL SI, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm three_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm two_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeSnappyBlockAsm JMP memmove_long_repeat_emit_encodeSnappyBlockAsm one_byte_repeat_emit_encodeSnappyBlockAsm: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeSnappyBlockAsm: LEAQ (CX)(R8*1), SI // genMemMoveShort CMPQ R8, $0x08 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (CX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (CX) MOVQ R9, -8(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm memmove_long_repeat_emit_encodeSnappyBlockAsm: LEAQ (CX)(R8*1), SI // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 
16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeSnappyBlockAsm: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: CMPL R8, $0x10 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm LEAL -16(R8), R8 LEAL 16(R11), R11 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_match8_repeat_extend_encodeSnappyBlockAsm: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_match4_repeat_extend_encodeSnappyBlockAsm: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeSnappyBlockAsm: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm JB repeat_extend_forward_end_encodeSnappyBlockAsm MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm 
matchlen_match1_repeat_extend_encodeSnappyBlockAsm: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm LEAL 1(R11), R11 repeat_extend_forward_end_encodeSnappyBlockAsm: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitCopy CMPL DI, $0x00010000 JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: CMPL SI, $0x40 JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xff, (CX) MOVL DI, 1(CX) LEAL -64(SI), SI ADDQ $0x05, CX CMPL SI, $0x04 JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: TESTL SI, SI JZ repeat_end_emit_encodeSnappyBlockAsm XORL R8, R8 LEAL -1(R8)(SI*4), SI MOVB SI, (CX) MOVL DI, 1(CX) ADDQ $0x05, CX JMP repeat_end_emit_encodeSnappyBlockAsm two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeSnappyBlockAsm emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeSnappyBlockAsm: MOVL DX, 12(SP) JMP search_loop_encodeSnappyBlockAsm no_repeat_found_encodeSnappyBlockAsm: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL 
(BX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm MOVL 20(SP), DX JMP search_loop_encodeSnappyBlockAsm candidate3_match_encodeSnappyBlockAsm: ADDL $0x02, DX JMP candidate_match_encodeSnappyBlockAsm candidate2_match_encodeSnappyBlockAsm: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeSnappyBlockAsm: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm match_extend_back_loop_encodeSnappyBlockAsm: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBlockAsm MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBlockAsm LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm JMP match_extend_back_loop_encodeSnappyBlockAsm match_extend_back_end_encodeSnappyBlockAsm: MOVL DX, DI SUBL 12(SP), DI LEAQ 5(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBlockAsm: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeSnappyBlockAsm CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeSnappyBlockAsm CMPL R8, $0x00010000 JB three_bytes_match_emit_encodeSnappyBlockAsm CMPL R8, $0x01000000 JB four_bytes_match_emit_encodeSnappyBlockAsm MOVB $0xfc, (CX) MOVL R8, 1(CX) ADDQ $0x05, CX JMP memmove_long_match_emit_encodeSnappyBlockAsm four_bytes_match_emit_encodeSnappyBlockAsm: MOVL R8, R10 SHRL $0x10, R10 MOVB $0xf8, (CX) MOVW R8, 1(CX) MOVB R10, 3(CX) ADDQ $0x04, CX JMP memmove_long_match_emit_encodeSnappyBlockAsm three_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBlockAsm two_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeSnappyBlockAsm JMP memmove_long_match_emit_encodeSnappyBlockAsm 
one_byte_match_emit_encodeSnappyBlockAsm: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm memmove_long_match_emit_encodeSnappyBlockAsm: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 
ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeSnappyBlockAsm: match_nolit_loop_encodeSnappyBlockAsm: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm matchlen_match8_match_nolit_encodeSnappyBlockAsm: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm matchlen_match4_match_nolit_encodeSnappyBlockAsm: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE 
matchlen_match2_match_nolit_encodeSnappyBlockAsm LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeSnappyBlockAsm: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm JB match_nolit_end_encodeSnappyBlockAsm MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeSnappyBlockAsm matchlen_match1_match_nolit_encodeSnappyBlockAsm: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm LEAL 1(R10), R10 match_nolit_end_encodeSnappyBlockAsm: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL SI, $0x00010000 JB two_byte_offset_match_nolit_encodeSnappyBlockAsm four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: CMPL R10, $0x40 JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm MOVB $0xff, (CX) MOVL SI, 1(CX) LEAL -64(R10), R10 ADDQ $0x05, CX CMPL R10, $0x04 JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm four_bytes_remain_match_nolit_encodeSnappyBlockAsm: TESTL R10, R10 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm XORL DI, DI LEAL -1(DI)(R10*4), R10 MOVB R10, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm two_byte_offset_match_nolit_encodeSnappyBlockAsm: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm emit_copy_three_match_nolit_encodeSnappyBlockAsm: LEAL -2(DI), DI 
MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBlockAsm: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBlockAsm MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm: MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x10, R8 IMULQ R9, R8 SHRQ $0x32, R8 SHLQ $0x10, SI IMULQ R9, SI SHRQ $0x32, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm INCL DX JMP search_loop_encodeSnappyBlockAsm emit_remainder_encodeSnappyBlockAsm: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 5(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBlockAsm: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00010000 JB three_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x01000000 JB four_bytes_emit_remainder_encodeSnappyBlockAsm MOVB $0xfc, (CX) MOVL DX, 1(CX) ADDQ $0x05, CX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm four_bytes_emit_remainder_encodeSnappyBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm three_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm two_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBlockAsm JMP memmove_long_emit_remainder_encodeSnappyBlockAsm 
one_byte_emit_remainder_encodeSnappyBlockAsm: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) 
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm memmove_long_emit_remainder_encodeSnappyBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeSnappyBlockAsm: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm64K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000200, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeSnappyBlockAsm64K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeSnappyBlockAsm64K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+24(FP), BX 
search_loop_encodeSnappyBlockAsm64K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeSnappyBlockAsm64K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x32, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_encodeSnappyBlockAsm64K LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_encodeSnappyBlockAsm64K repeat_extend_back_loop_encodeSnappyBlockAsm64K: CMPL DI, SI JBE repeat_extend_back_end_encodeSnappyBlockAsm64K MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_encodeSnappyBlockAsm64K LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K repeat_extend_back_end_encodeSnappyBlockAsm64K: MOVL DI, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_encodeSnappyBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeSnappyBlockAsm64K: MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K MOVL DI, R8 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R9 SUBL SI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JB one_byte_repeat_emit_encodeSnappyBlockAsm64K CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K three_bytes_repeat_emit_encodeSnappyBlockAsm64K: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K two_bytes_repeat_emit_encodeSnappyBlockAsm64K: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeSnappyBlockAsm64K JMP 
memmove_long_repeat_emit_encodeSnappyBlockAsm64K one_byte_repeat_emit_encodeSnappyBlockAsm64K: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeSnappyBlockAsm64K: LEAQ (CX)(R8*1), SI // genMemMoveShort CMPQ R8, $0x08 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (CX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (CX) MOVQ R9, -8(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K memmove_long_repeat_emit_encodeSnappyBlockAsm64K: LEAQ (CX)(R8*1), SI // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: CMPL R8, $0x10 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K LEAL -16(R8), R8 LEAL 16(R11), R11 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP 
repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K JB repeat_extend_forward_end_encodeSnappyBlockAsm64K MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K LEAL 1(R11), R11 repeat_extend_forward_end_encodeSnappyBlockAsm64K: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeSnappyBlockAsm64K emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeSnappyBlockAsm64K: MOVL DX, 12(SP) JMP search_loop_encodeSnappyBlockAsm64K no_repeat_found_encodeSnappyBlockAsm64K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm64K SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), 
DI JEQ candidate2_match_encodeSnappyBlockAsm64K MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm64K MOVL 20(SP), DX JMP search_loop_encodeSnappyBlockAsm64K candidate3_match_encodeSnappyBlockAsm64K: ADDL $0x02, DX JMP candidate_match_encodeSnappyBlockAsm64K candidate2_match_encodeSnappyBlockAsm64K: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeSnappyBlockAsm64K: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm64K match_extend_back_loop_encodeSnappyBlockAsm64K: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBlockAsm64K MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBlockAsm64K LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm64K JMP match_extend_back_loop_encodeSnappyBlockAsm64K match_extend_back_end_encodeSnappyBlockAsm64K: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBlockAsm64K: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeSnappyBlockAsm64K CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeSnappyBlockAsm64K JB three_bytes_match_emit_encodeSnappyBlockAsm64K three_bytes_match_emit_encodeSnappyBlockAsm64K: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBlockAsm64K two_bytes_match_emit_encodeSnappyBlockAsm64K: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeSnappyBlockAsm64K JMP memmove_long_match_emit_encodeSnappyBlockAsm64K one_byte_match_emit_encodeSnappyBlockAsm64K: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBlockAsm64K: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K memmove_long_match_emit_encodeSnappyBlockAsm64K: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeSnappyBlockAsm64K: match_nolit_loop_encodeSnappyBlockAsm64K: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm64K matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm64K matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K 
LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K JB match_nolit_end_encodeSnappyBlockAsm64K MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeSnappyBlockAsm64K matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm64K LEAL 1(R10), R10 match_nolit_end_encodeSnappyBlockAsm64K: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBlockAsm64K MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm64K: MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x10, R8 IMULQ R9, R8 SHRQ $0x32, R8 SHLQ $0x10, SI IMULQ R9, SI SHRQ $0x32, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm64K INCL DX JMP 
search_loop_encodeSnappyBlockAsm64K emit_remainder_encodeSnappyBlockAsm64K: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBlockAsm64K MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBlockAsm64K: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBlockAsm64K CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K three_bytes_emit_remainder_encodeSnappyBlockAsm64K: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K two_bytes_emit_remainder_encodeSnappyBlockAsm64K: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBlockAsm64K JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K one_byte_emit_remainder_encodeSnappyBlockAsm64K: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBlockAsm64K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K memmove_long_emit_remainder_encodeSnappyBlockAsm64K: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: 
// Tail of encodeSnappyBlockAsm64K: body of its ...large_forward_sse_loop_32 copy loop, then the return.
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst, using tmp as a table of 4096 4-byte source positions
// indexed by a 12-bit hash. Returns the number of bytes written to dst, or 0
// as soon as the output would reach the limit stored in (SP).
//
// Register roles in the search/match loops:
//   AX = table base, BX = src base, CX = dst write pointer, DX = current src position.
// Stack slots:
//   (SP)   = dst limit pointer (dst_base + src_len - 9 - src_len>>5)
//   8(SP)  = src_len - 8, the position at which searching stops
//   12(SP) = src position of the first byte not yet emitted
//   16(SP) = offset of the most recent match (the "repeat" offset)
//   20(SP) = position to continue at when the current one yields no match
TEXT ·encodeSnappyBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Zero the table: 0x80 iterations of 128 bytes = 16384 bytes.
	MOVQ $0x00000080, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBlockAsm12B
	// Nothing has been emitted yet.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	// 8(SP) = src_len-8; (SP) = dst_base + (src_len - 9 - src_len>>5).
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	// Start at position 1 with a repeat offset of 1.
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm12B:
	// SI = DX + 4 + ((DX - 12(SP)) >> 5): the step to the next position grows
	// with the number of bytes scanned since the last emit.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x05, SI
	LEAL 4(DX)(SI*1), SI
	// Stop searching once that next position reaches src_len-8.
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm12B
	// DI = the 8 source bytes at DX.
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	// Hash the 5 bytes at DX (R10) and at DX+1 (R11): shift out the top 3
	// loaded bytes, multiply by the constant, keep the top 12 bits.
	MOVQ $0x000000cf1bbcdcbb, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x18, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	SHLQ $0x18, R11
	IMULQ R9, R11
	SHRQ $0x34, R11
	// SI = previous table entry for DX's hash, R8 = previous entry for DX+1's
	// hash; then record DX and DX+1 in their slots.
	MOVL (AX)(R10*4), SI
	MOVL (AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	LEAL 1(DX), R10
	MOVL R10, (AX)(R11*4)
	// R10 = hash of the 5 bytes at DX+2 (used in no_repeat_found).
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x18, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes at
	// DX+1-offset, where offset = 16(SP).
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeSnappyBlockAsm12B
	// Repeat found at DI = DX+1. Extend it backwards while the preceding bytes
	// match, stopping at 12(SP) (in SI) or when the earlier position R8 hits 0.
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeSnappyBlockAsm12B

repeat_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL DI, SI
	JBE repeat_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(DI), DI
	DECL R8
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B

repeat_extend_back_end_encodeSnappyBlockAsm12B:
	// Return 0 if CX + pending literal length + 3 would reach the dst limit.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):DI] as a literal, unless it is empty.
	// R8 = length, R9 = source pointer, SI = length-1; 12(SP) becomes DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// Header size depends on length-1: < 60 -> one byte, < 256 -> two bytes,
	// otherwise three bytes.
	CMPL SI, $0x3c
	JB one_byte_repeat_emit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B
	// Flags are unchanged since the CMPL above, so this branch is never taken;
	// control simply falls through to the label below.
	JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B

three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B

two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	// Tag 0xf0 followed by length-1 as 8 bits.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// length-1 < 64 still uses the short copy; anything longer the long copy.
	CMPL SI, $0x40
	JB memmove_repeat_emit_encodeSnappyBlockAsm12B
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B

one_byte_repeat_emit_encodeSnappyBlockAsm12B:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	// Dispatch on length R8: <=8, 9..16, 17..32, 33..64.
	CMPQ R8, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	// Moves a full 8 bytes even when the length is smaller; CX is still set to
	// the exact end (SI) afterwards.
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B

memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	// Preload the first 32 and last 32 source bytes into X0-X3; they are
	// stored after the loops below.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length/32. R12 = 64 - (CX & 31), so that CX+R12-32 (the first
	// aligned store address) is a multiple of 32.
	MOVQ R8, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	// NOTE(review): DECQ does not modify CF, so the JA/JNA below see ZF from
	// DECQ and CF from the preceding SUBQ/ADDQ - confirm against the generator.
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	// 32 bytes per iteration with aligned stores, while R12 <= length.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R8, R12
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	// Store the preloaded head and tail.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
	// Skip past the 4 bytes at DX+1 that the repeat check already matched.
	// SI = earlier position (DX - offset), R8 = bytes left in src from DX,
	// R9/SI = pointers to the current and the earlier data.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes at R9 and SI, limited to R8; compared in
	// steps of 16, 8, 4, 2 and 1 bytes.
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x10
	JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
	// Lowest differing bit index / 8 = equal bytes in the second quadword.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x08
	JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
	// Lowest differing bit index / 8 = equal bytes in this quadword.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x04
	JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
	// Exactly 1 byte left -> byte compare; 0 left -> done; else try 2 bytes.
	CMPL R8, $0x01
	JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	JB repeat_extend_forward_end_encodeSnappyBlockAsm12B
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm12B:
	// DX = end of the match, SI = match length (DX - start DI), DI = offset.
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
	// While length > 64: write tag 0xee plus the 16-bit offset and take 60
	// off the remaining length.
	CMPL SI, $0x40
	JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
	// R8 = length << 2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	CMPL DI, $0x00000800
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	// 2-byte form: tag = ((length<<2) - 15) | ((offset>>8) << 5), then the low
	// offset byte. (length<<2)-15 equals ((length-4)<<2) | 1.
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeSnappyBlockAsm12B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
	// 3-byte form: tag = (length<<2) - 2, i.e. ((length-1)<<2) | 2, then the
	// 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm12B:
	// Everything before DX has now been emitted.
	MOVL DX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm12B

no_repeat_found_encodeSnappyBlockAsm12B:
	// Candidate SI for DX: compare its 4 bytes with the 4 bytes at DX.
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeSnappyBlockAsm12B
	// DI now holds the bytes at DX+1. Fetch the old table entry for DX+2's
	// hash into SI, then test candidate R8 against DX+1.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ candidate2_match_encodeSnappyBlockAsm12B
	// Record DX+2 in the table, then test its old candidate against DX+2.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ candidate3_match_encodeSnappyBlockAsm12B
	// No match: continue at the position computed at the top of the loop.
	MOVL 20(SP), DX
	JMP search_loop_encodeSnappyBlockAsm12B

candidate3_match_encodeSnappyBlockAsm12B:
	// The match is at DX+2 with candidate SI.
	ADDL $0x02, DX
	JMP candidate_match_encodeSnappyBlockAsm12B

candidate2_match_encodeSnappyBlockAsm12B:
	// The match is at DX+1 with candidate R8; DX+2 is still recorded.
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm12B:
	// Extend the match backwards while the preceding bytes agree, stopping at
	// 12(SP) (in DI) or when the candidate position SI reaches 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeSnappyBlockAsm12B

match_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL DX, DI
	JBE match_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeSnappyBlockAsm12B
	JMP match_extend_back_loop_encodeSnappyBlockAsm12B

match_extend_back_end_encodeSnappyBlockAsm12B:
	// Return 0 if CX + pending literal length + 3 would reach the dst limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):DX] as a literal, unless it is empty.
	// R9 = length, DI = source pointer, R8 = length-1; 12(SP) becomes DX.
	// SI (the candidate position) is left untouched for match_nolit_loop.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Same header selection as in the repeat path: 1, 2 or 3 header bytes.
	CMPL R8, $0x3c
	JB one_byte_match_emit_encodeSnappyBlockAsm12B
	CMPL R8, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBlockAsm12B
	// Never taken (flags unchanged); falls through to the label below.
	JB three_bytes_match_emit_encodeSnappyBlockAsm12B

three_bytes_match_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm12B

two_bytes_match_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB memmove_match_emit_encodeSnappyBlockAsm12B
	JMP memmove_long_match_emit_encodeSnappyBlockAsm12B

one_byte_match_emit_encodeSnappyBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm12B:
	// R8 = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length R9: <=8, 9..16, 17..32, 33..64.
	CMPQ R9, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	// Moves a full 8 bytes even when the length is smaller.
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
	MOVQ R8, CX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B

memmove_long_match_emit_encodeSnappyBlockAsm12B:
	// R8 = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Same scheme as the long copy in the repeat path: preload head and tail
	// (X0-X3), copy 32 bytes per iteration with aligned stores, then store the
	// preloaded edges.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ R9, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(DI)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R9, R12
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
	// 16(SP) = offset = DX - SI. Step both positions over the 4 bytes already
	// compared equal and measure how far the match continues.
	// DI = bytes left in src, R8/SI = pointers to the current and earlier data.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes at R8 and SI, limited to DI.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x10
	JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	XORQ 8(SI)(R10*1), R11
	JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	JB match_nolit_end_encodeSnappyBlockAsm12B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE match_nolit_end_encodeSnappyBlockAsm12B
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm12B:
	// DX = end of the match, SI = offset, R10 = full match length (the 4
	// skipped bytes added back); 12(SP) = DX.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
	// While length > 64: write tag 0xee plus the 16-bit offset and take 60
	// off the remaining length.
	CMPL R10, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
	// DI = length << 2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	// 2-byte form: tag = ((length<<2) - 15) | ((offset>>8) << 5), then the low
	// offset byte.
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B

emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
	// 3-byte form: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
	// Finish with the trailing literals once DX has reached src_len-8.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm12B
	// DI = the 8 source bytes starting at DX-2.
	MOVQ -2(BX)(DX*1), DI
	// Return 0 if the write pointer has reached the dst limit.
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm12B:
	// Hash the 5 bytes at DX-2 (R8) and at DX (SI). Record DX-2 in its slot,
	// load the old entry for DX's hash into SI and record DX there.
	MOVQ $0x000000cf1bbcdcbb, R9
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, SI
	SHLQ $0x18, R8
	IMULQ R9, R8
	SHRQ $0x34, R8
	SHLQ $0x18, SI
	IMULQ R9, SI
	SHRQ $0x34, SI
	LEAL -2(DX), R9
	LEAQ (AX)(SI*4), R10
	MOVL (R10), SI
	MOVL R9, (AX)(R8*4)
	MOVL DX, (R10)
	// If the candidate's 4 bytes equal the 4 bytes at DX, handle another match
	// right away with no literals in between; otherwise resume at DX+1.
	CMPL (BX)(SI*1), DI
	JEQ match_nolit_loop_encodeSnappyBlockAsm12B
	INCL DX
	JMP search_loop_encodeSnappyBlockAsm12B

emit_remainder_encodeSnappyBlockAsm12B:
	// Return 0 if CX + (src_len - 12(SP)) + 3 would reach the dst limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):src_len] as the final literal, unless it is empty.
	// From here on AX, BX and DX no longer hold the table, src base and
	// position: SI = length, AX = source pointer, DX = length-1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBlockAsm12B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B
	// Never taken (flags unchanged); falls through to the label below.
	JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B

three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B

two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBlockAsm12B
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B

one_byte_emit_remainder_encodeSnappyBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm12B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copies inside the main loop, lengths below 8 are copied
	// exactly here: 1-2, 3 and 4-7 byte cases precede the 8..64 ones.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
	// First and last byte (the same byte when the length is 1).
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
	// First 4 and last 4 bytes; the two moves may overlap.
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Preload head and tail (X0-X3), copy 32 bytes per iteration with aligned
	// stores, then store the preloaded edges.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000020, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBlockAsm10B
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm10B:
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x05, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm10B
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	MOVQ $0x9e3779b1, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x36, R10
	SHLQ $0x20, R11
	IMULQ R9, R11
	SHRQ $0x36, R11
	MOVL (AX)(R10*4), SI
	MOVL (AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	LEAL 1(DX), R10
	MOVL R10, (AX)(R11*4)
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x36, R10
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeSnappyBlockAsm10B
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeSnappyBlockAsm10B

repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL DI, SI
	JBE repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(DI), DI
	DECL R8
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B

repeat_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm10B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B

three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
two_bytes_repeat_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_encodeSnappyBlockAsm10B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B one_byte_repeat_emit_encodeSnappyBlockAsm10B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_repeat_emit_encodeSnappyBlockAsm10B: LEAQ (CX)(R8*1), SI // genMemMoveShort CMPQ R8, $0x08 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (CX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (CX) MOVQ R9, -8(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B memmove_long_repeat_emit_encodeSnappyBlockAsm10B: LEAQ (CX)(R8*1), SI // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ SI, CX emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen XORL R11, R11 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: CMPL R8, $0x10 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B LEAL -16(R8), R8 LEAL 16(R11), R11 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B 
matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B JB repeat_extend_forward_end_encodeSnappyBlockAsm10B MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B LEAL 1(R11), R11 repeat_extend_forward_end_encodeSnappyBlockAsm10B: ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0xee, (CX) MOVW DI, 1(CX) LEAL -60(SI), SI ADDQ $0x03, CX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B LEAL -15(R8), R8 MOVB DI, 1(CX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, R8 MOVB R8, (CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeSnappyBlockAsm10B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: LEAL -2(R8), R8 MOVB R8, (CX) MOVW DI, 1(CX) ADDQ $0x03, CX repeat_end_emit_encodeSnappyBlockAsm10B: MOVL DX, 12(SP) JMP search_loop_encodeSnappyBlockAsm10B 
no_repeat_found_encodeSnappyBlockAsm10B: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBlockAsm10B SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_encodeSnappyBlockAsm10B MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_encodeSnappyBlockAsm10B MOVL 20(SP), DX JMP search_loop_encodeSnappyBlockAsm10B candidate3_match_encodeSnappyBlockAsm10B: ADDL $0x02, DX JMP candidate_match_encodeSnappyBlockAsm10B candidate2_match_encodeSnappyBlockAsm10B: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_encodeSnappyBlockAsm10B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBlockAsm10B match_extend_back_loop_encodeSnappyBlockAsm10B: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBlockAsm10B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBlockAsm10B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBlockAsm10B JMP match_extend_back_loop_encodeSnappyBlockAsm10B match_extend_back_end_encodeSnappyBlockAsm10B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBlockAsm10B: MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), R8 CMPL R8, $0x3c JB one_byte_match_emit_encodeSnappyBlockAsm10B CMPL R8, $0x00000100 JB two_bytes_match_emit_encodeSnappyBlockAsm10B JB three_bytes_match_emit_encodeSnappyBlockAsm10B three_bytes_match_emit_encodeSnappyBlockAsm10B: MOVB $0xf4, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBlockAsm10B two_bytes_match_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (CX) MOVB R8, 1(CX) ADDQ $0x02, CX CMPL R8, $0x40 JB memmove_match_emit_encodeSnappyBlockAsm10B JMP memmove_long_match_emit_encodeSnappyBlockAsm10B 
one_byte_match_emit_encodeSnappyBlockAsm10B: SHLB $0x02, R8 MOVB R8, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBlockAsm10B: LEAQ (CX)(R9*1), R8 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: MOVQ (DI), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (DI), R10 MOVQ -8(DI)(R9*1), DI MOVQ R10, (CX) MOVQ DI, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (DI), X0 MOVOU -16(DI)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: MOVQ R8, CX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B memmove_long_match_emit_encodeSnappyBlockAsm10B: LEAQ (CX)(R9*1), R8 // genMemMoveLong MOVOU (DI), X0 MOVOU 16(DI), X1 MOVOU -32(DI)(R9*1), X2 MOVOU -16(DI)(R9*1), X3 MOVQ R9, R11 SHRQ $0x05, R11 MOVQ CX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(DI)(R12*1), R10 LEAQ -32(CX)(R12*1), R13 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, 
(R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(DI)(R12*1), X4 MOVOU -16(DI)(R12*1), X5 MOVOA X4, -32(CX)(R12*1) MOVOA X5, -16(CX)(R12*1) ADDQ $0x20, R12 CMPQ R9, R12 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ R8, CX emit_literal_done_match_emit_encodeSnappyBlockAsm10B: match_nolit_loop_encodeSnappyBlockAsm10B: MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: CMPL DI, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: CMPL DI, 
$0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B JB match_nolit_end_encodeSnappyBlockAsm10B MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_encodeSnappyBlockAsm10B matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm10B LEAL 1(R10), R10 match_nolit_end_encodeSnappyBlockAsm10B: ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B MOVB $0xee, (CX) MOVW SI, 1(CX) LEAL -60(R10), R10 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B LEAL -15(DI), DI MOVB SI, 1(CX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: LEAL -2(DI), DI MOVB DI, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBlockAsm10B MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm10B: MOVQ $0x9e3779b1, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x20, R8 IMULQ R9, R8 SHRQ $0x36, R8 SHLQ $0x20, SI IMULQ R9, SI SHRQ $0x36, SI LEAL -2(DX), R9 LEAQ 
(AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_encodeSnappyBlockAsm10B INCL DX JMP search_loop_encodeSnappyBlockAsm10B emit_remainder_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBlockAsm10B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B three_bytes_emit_remainder_encodeSnappyBlockAsm10B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B two_bytes_emit_remainder_encodeSnappyBlockAsm10B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBlockAsm10B JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B one_byte_emit_remainder_encodeSnappyBlockAsm10B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: MOVB (AX), SI 
MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B memmove_long_emit_remainder_encodeSnappyBlockAsm10B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA 
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET

// func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written to dst. Returns 0 as soon as the output would
// reach the limit computed at entry (dst_base + src_len - 9 - src_len/32).
//
// Registers held across the main loop:
//   AX  tmp, used as a table of 256 uint32 source positions (8-bit hash)
//   BX  src base pointer
//   CX  dst write cursor
//   DX  current source position (s)
// Stack slots:
//   (SP)    dst limit pointer
//   8(SP)   src_len - 8, upper bound for the search position
//   12(SP)  position of the first source byte not yet emitted
//   16(SP)  offset of the most recent match, initially 1
//   20(SP)  next search position, used when no candidate matches
TEXT ·encodeSnappyBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000008, DX
	MOVQ AX, BX
	PXOR X0, X0

	// Zero the table: 8 iterations of 128 bytes = 1024 bytes.
zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBlockAsm8B

	// Initialise the stack slots described in the header and start at s = 1.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL DX, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm8B:
	// Next search position = s + 4 + ((s - 12(SP)) >> 4); stop searching once
	// it reaches 8(SP).
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x04, SI
	LEAL 4(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm8B

	// DI = 8 source bytes at s. Each hash below is
	// ((x << 32) * 0x9e3779b1) >> 56, i.e. 8 bits from the low 4 bytes of x.
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)
	MOVQ $0x9e3779b1, R9
	MOVQ DI, R10
	MOVQ DI, R11
	SHRQ $0x08, R11
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x38, R10
	SHLQ $0x20, R11
	IMULQ R9, R11
	SHRQ $0x38, R11

	// SI and R8 = previous table entries for the hashes of s and s+1; the
	// entries are then overwritten with s and s+1.
	MOVL (AX)(R10*4), SI
	MOVL (AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	LEAL 1(DX), R10
	MOVL R10, (AX)(R11*4)

	// R10 = hash of the bytes at s+2 (table lookup happens later).
	MOVQ DI, R10
	SHRQ $0x10, R10
	SHLQ $0x20, R10
	IMULQ R9, R10
	SHRQ $0x38, R10

	// Repeat check: compare the 4 bytes at s+1 with the 4 bytes at
	// s+1-16(SP).
	MOVL DX, R9
	SUBL 16(SP), R9
	MOVL 1(BX)(R9*1), R11
	MOVQ DI, R9
	SHRQ $0x08, R9
	CMPL R9, R11
	JNE no_repeat_found_encodeSnappyBlockAsm8B

	// Repeat found. DI = s+1 (match start), R8 = s+1-offset (match source).
	// Extend backwards while bytes match, DI stays above 12(SP) and R8 > 0.
	LEAL 1(DX), DI
	MOVL 12(SP), SI
	MOVL DI, R8
	SUBL 16(SP), R8
	JZ repeat_extend_back_end_encodeSnappyBlockAsm8B

repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DI, SI
	JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DI), DI
	DECL R8
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B

repeat_extend_back_end_encodeSnappyBlockAsm8B:
	// Output-space check: cursor + 3 + pending literal length must stay below
	// the limit in (SP); otherwise return 0.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB repeat_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm8B:
	// Emit the pending literal src[12(SP):DI], if any.
	// R9 = literal source, R8 = literal length, SI = length-1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI

	// Literal header: length-1 < 60 -> 1 byte; < 256 -> 2 bytes; else 3 bytes.
	// NOTE: the second JB tests the same flags as the first, so it is never
	// taken; execution falls through to the three-byte label.
	CMPL SI, $0x3c
	JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL SI, $0x00000100
	JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B

three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B

two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 followed by length-1 as one byte. Lengths with length-1 < 64
	// use the short copy, the rest the long copy.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B

one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm8B:
	// SI = cursor after the literal; copy R8 bytes from R9 to CX.
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Lengths up to 8 always move a full 8 bytes; only R8 of them count
	// because the cursor is set from SI afterwards.
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes (the two moves may overlap).
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	MOVQ SI, CX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	// The first and last 32 source bytes are loaded into X0-X3 up front and
	// stored last with unaligned moves. The middle is copied 32 bytes at a
	// time with aligned stores: R12 = 64 - (CX & 31), so -32(CX)(R12) is
	// 32-byte aligned.
	// NOTE(review): the DECQ/JA and DECQ/JNA pairs choose between the two
	// loops; DECQ does not write the carry flag, so part of the condition
	// comes from the preceding SUBQ/ADDQ - confirm against the generator
	// before changing anything here.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R8, R12
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	// Skip the 4 bytes already compared (s+1 .. s+4) and extend the match
	// forwards. R9 = src+s, SI = src+s-offset, R8 = bytes left in src.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 counts matching bytes; compares 16, 8, 4, 2, then 1 byte at a time.
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x10
	JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	XORQ 8(SI)(R11*1), R12
	JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
	// Mismatch in the second 8 bytes: index of lowest set bit / 8 is the
	// number of matching bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12
#else
	BSFQ R12, R12
#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x08
	JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
	// Mismatch in the current 8 bytes: lowest set bit / 8 = matching bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10
#else
	BSFQ R10, R10
#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x04
	JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x01
	JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	// DX = end of match; SI = match length (end - back-extended start in DI);
	// DI = offset from 16(SP).
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// While length > 64: write tag 0xee plus the 16-bit offset (3 bytes) and
	// subtract 60 from the remaining length.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x40
	JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	// length < 12: 2-byte form. Tag = (length<<2) - 15, which equals
	// ((length-4)<<2) | 1, OR'ed with (offset>>8)<<5; then the low offset byte.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP repeat_end_emit_encodeSnappyBlockAsm8B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	// 3-byte form. Tag = (length<<2) - 2, which equals ((length-1)<<2) | 2,
	// followed by the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm8B:
	// Everything up to DX has been emitted.
	MOVL DX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm8B

no_repeat_found_encodeSnappyBlockAsm8B:
	// Try the table candidates in turn:
	//   1. old entry for hash(s)   (SI) against the bytes at s
	//   2. old entry for hash(s+1) (R8) against the bytes at s+1
	//   3. old entry for hash(s+2)      against the bytes at s+2
	// The hash(s+2) entry is replaced with s+2 on paths 2 and 3.
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeSnappyBlockAsm8B
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ candidate2_match_encodeSnappyBlockAsm8B
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ candidate3_match_encodeSnappyBlockAsm8B

	// No candidate matched: continue at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP search_loop_encodeSnappyBlockAsm8B

candidate3_match_encodeSnappyBlockAsm8B:
	ADDL $0x02, DX
	JMP candidate_match_encodeSnappyBlockAsm8B

candidate2_match_encodeSnappyBlockAsm8B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm8B:
	// DX = match position, SI = candidate position. Extend backwards while
	// bytes match, DX stays above 12(SP) and SI > 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DX, DI
	JBE match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeSnappyBlockAsm8B
	JMP match_extend_back_loop_encodeSnappyBlockAsm8B

match_extend_back_end_encodeSnappyBlockAsm8B:
	// Output-space check: cursor + 3 + pending literal length must stay below
	// the limit in (SP); otherwise return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm8B:
	// Emit the pending literal src[12(SP):DX], if any.
	// DI = literal source, R9 = literal length, R8 = length-1.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8

	// Literal header selection; the second JB is never taken (same flags) and
	// execution falls through to the three-byte label.
	CMPL R8, $0x3c
	JB one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL R8, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBlockAsm8B
	JB three_bytes_match_emit_encodeSnappyBlockAsm8B

three_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 followed by length-1 as one byte.
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB memmove_match_emit_encodeSnappyBlockAsm8B
	JMP memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm8B:
	// R8 = cursor after the literal; copy R9 bytes from DI to CX.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Lengths up to 8 always move a full 8 bytes; the cursor is set from R8.
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	MOVQ R8, CX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B

memmove_long_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Head and tail (32 bytes each) are kept in X0-X3 and stored last; the
	// middle is copied 32 bytes per step with aligned stores, starting at
	// R12 = 64 - (CX & 31).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ R9, R11
	SHRQ $0x05, R11
	MOVQ CX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(DI)(R12*1), R10
	LEAQ -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R9, R12
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	// Record the new offset (DX - SI) in 16(SP), skip the 4 bytes already
	// known to match and extend forwards. R8 = src+s, SI = src+candidate,
	// DI = bytes left in src.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 counts matching bytes; compares 16, 8, 4, 2, then 1 byte at a time.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x10
	JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	XORQ 8(SI)(R10*1), R11
	JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
	// Mismatch in the second 8 bytes: lowest set bit / 8, plus 8.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
	// Mismatch in the current 8 bytes: lowest set bit / 8 = matching bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9
#else
	BSFQ R9, R9
#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x01
	JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	JB match_nolit_end_encodeSnappyBlockAsm8B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm8B:
	// DX = end of match; R10 = full match length (extension + the initial 4);
	// SI = offset; 12(SP) advances to the end of the match.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// While length > 64: write tag 0xee plus the 16-bit offset (3 bytes) and
	// subtract 60 from the remaining length.
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	// length < 12: 2-byte form, tag = ((length-4)<<2 | 1) | (offset>>8)<<5,
	// then the low offset byte.
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	// 3-byte form, tag = ((length-1)<<2) | 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	// Stop searching once s reaches 8(SP); return 0 if the cursor has reached
	// the dst limit. DI = 8 source bytes at s-2.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	// Store s-2 under hash(bytes at s-2) and s under hash(bytes at s). If the
	// previous entry for hash(s) points at 4 bytes equal to those at s, start
	// another match immediately with no literal in between.
	MOVQ $0x9e3779b1, R9
	MOVQ DI, R8
	SHRQ $0x10, DI
	MOVQ DI, SI
	SHLQ $0x20, R8
	IMULQ R9, R8
	SHRQ $0x38, R8
	SHLQ $0x20, SI
	IMULQ R9, SI
	SHRQ $0x38, SI
	LEAL -2(DX), R9
	LEAQ (AX)(SI*4), R10
	MOVL (R10), SI
	MOVL R9, (AX)(R8*4)
	MOVL DX, (R10)
	CMPL (BX)(SI*1), DI
	JEQ match_nolit_loop_encodeSnappyBlockAsm8B
	INCL DX
	JMP search_loop_encodeSnappyBlockAsm8B

emit_remainder_encodeSnappyBlockAsm8B:
	// Output-space check for the trailing literal src[12(SP):src_len]:
	// cursor + 3 + length must stay below the limit, otherwise return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm8B:
	// Emit the trailing literal, if any.
	// AX = literal source, SI = literal length, DX = length-1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX

	// Literal header selection; the second JB is never taken (same flags) and
	// execution falls through to the three-byte label.
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	// Tag 0xf0 followed by length-1 as one byte.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm8B:
	// DX = cursor after the literal; BX is reused as the byte count from here
	// on (the src base is no longer needed).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, every size class here moves exactly BX bytes
	// (two possibly overlapping pieces), with separate cases below 8 bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail (32 bytes each) are kept in X0-X3 and stored last; the
	// middle is copied 32 bytes per step with aligned stores, starting at
	// R8 = 64 - (CX & 31).
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	// Return the number of bytes written: cursor - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00001200, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeSnappyBetterBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL
$0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeSnappyBetterBlockAsm: MOVL DX, SI SUBL 12(SP), SI SHRL $0x07, SI CMPL SI, $0x63 JBE check_maxskip_ok_encodeSnappyBetterBlockAsm LEAL 100(DX), SI JMP check_maxskip_cont_encodeSnappyBetterBlockAsm check_maxskip_ok_encodeSnappyBetterBlockAsm: LEAL 1(DX)(SI*1), SI check_maxskip_cont_encodeSnappyBetterBlockAsm: CMPL SI, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL (AX)(R10*4), SI MOVL 524288(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 524288(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm CMPQ R11, DI JNE no_short_found_encodeSnappyBetterBlockAsm MOVL R8, SI JMP candidate_match_encodeSnappyBetterBlockAsm no_short_found_encodeSnappyBetterBlockAsm: CMPL R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm CMPL R11, DI JEQ candidateS_match_encodeSnappyBetterBlockAsm MOVL 20(SP), DX JMP search_loop_encodeSnappyBetterBlockAsm candidateS_match_encodeSnappyBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBetterBlockAsm DECL DX MOVL R8, SI candidate_match_encodeSnappyBetterBlockAsm: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm match_extend_back_loop_encodeSnappyBetterBlockAsm: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBetterBlockAsm MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBetterBlockAsm LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm JMP match_extend_back_loop_encodeSnappyBetterBlockAsm match_extend_back_end_encodeSnappyBetterBlockAsm: MOVL DX, DI SUBL 12(SP), DI LEAQ 
5(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm: MOVL DX, DI ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm MOVQ (R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: CMPL R8, $0x01 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm JB match_nolit_end_encodeSnappyBetterBlockAsm MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE 
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeSnappyBetterBlockAsm matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm LEAL 1(R12), R12 match_nolit_end_encodeSnappyBetterBlockAsm: MOVL DX, R8 SUBL SI, R8 // Check if repeat CMPL R12, $0x01 JA match_length_ok_encodeSnappyBetterBlockAsm CMPL R8, $0x0000ffff JBE match_length_ok_encodeSnappyBetterBlockAsm MOVL 20(SP), DX INCL DX JMP search_loop_encodeSnappyBetterBlockAsm match_length_ok_encodeSnappyBetterBlockAsm: MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeSnappyBetterBlockAsm CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm CMPL SI, $0x00010000 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm CMPL SI, $0x01000000 JB four_bytes_match_emit_encodeSnappyBetterBlockAsm MOVB $0xfc, (CX) MOVL SI, 1(CX) ADDQ $0x05, CX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm four_bytes_match_emit_encodeSnappyBetterBlockAsm: MOVL SI, R11 SHRL $0x10, R11 MOVB $0xf8, (CX) MOVW SI, 1(CX) MOVB R11, 3(CX) ADDQ $0x04, CX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm three_bytes_match_emit_encodeSnappyBetterBlockAsm: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm two_bytes_match_emit_encodeSnappyBetterBlockAsm: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeSnappyBetterBlockAsm JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm one_byte_match_emit_encodeSnappyBetterBlockAsm: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm memmove_long_match_emit_encodeSnappyBetterBlockAsm: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA 
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitCopy CMPL R8, $0x00010000 JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: CMPL R12, $0x40 JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm MOVB $0xff, (CX) MOVL R8, 1(CX) LEAL -64(R12), R12 ADDQ $0x05, CX CMPL R12, $0x04 JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: TESTL R12, R12 JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm XORL SI, SI LEAL -1(SI)(R12*4), R12 MOVB R12, (CX) MOVL R8, 1(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm MOVB $0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm CMPL R8, $0x00000800 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: 
LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R12 IMULQ SI, R12 SHRQ $0x2f, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x32, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 524288(AX)(R11*4) MOVL R14, 524288(AX)(R13*4) LEAQ 1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeSnappyBetterBlockAsm: CMPQ R8, R9 JAE search_loop_encodeSnappyBetterBlockAsm MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 SHLQ $0x08, R11 IMULQ SI, R11 SHRQ $0x2f, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm emit_remainder_encodeSnappyBetterBlockAsm: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 5(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm CMPL DX, $0x00010000 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm CMPL DX, $0x01000000 JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm MOVB $0xfc, (CX) MOVL DX, 1(CX) ADDQ $0x05, CX JMP 
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (CX) MOVW DX, 1(CX) MOVB BL, 3(CX) ADDQ $0x04, CX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm one_byte_emit_remainder_encodeSnappyBetterBlockAsm: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX 
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm64K(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000900, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm64K: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeSnappyBetterBlockAsm64K MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeSnappyBetterBlockAsm64K: MOVL DX, SI SUBL 12(SP), SI SHRL $0x07, SI LEAL 1(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm64K MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x30, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x33, R11 MOVL (AX)(R10*4), SI MOVL 262144(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 262144(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm64K CMPQ R11, DI JNE no_short_found_encodeSnappyBetterBlockAsm64K MOVL R8, SI JMP candidate_match_encodeSnappyBetterBlockAsm64K no_short_found_encodeSnappyBetterBlockAsm64K: CMPL R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm64K CMPL R11, DI JEQ candidateS_match_encodeSnappyBetterBlockAsm64K MOVL 20(SP), DX JMP search_loop_encodeSnappyBetterBlockAsm64K candidateS_match_encodeSnappyBetterBlockAsm64K: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x30, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ 
candidate_match_encodeSnappyBetterBlockAsm64K
	// The label above is the operand of the JEQ that ends the preceding line.
	// Registers set up earlier in this function: AX = tmp, BX = src base,
	// CX = dst write pointer, DX = current src index, SI = candidate index,
	// R8 = short-table candidate.
	// The probe at DX+1 failed: step DX back and use the short candidate.
	DECL DX
	MOVL R8, SI

candidate_match_encodeSnappyBetterBlockAsm64K:
	// Extend the match backwards while the preceding bytes are equal, DX is
	// above 12(SP) and the candidate index SI is above 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	CMPL DX, DI
	JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	// Return 0 if CX + pending literal bytes (DX - 12(SP)) + 3 reaches the
	// dst limit in (SP).
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	// DI = match start. Step both positions past the first 4 bytes;
	// R8 = bytes left in src, R9/R10 = pointers to compare.
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 = count of equal bytes at R9 and R10, bounded by R8.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x10
	JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	XORQ 8(R10)(R12*1), R13
	JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
	// Bit index of the lowest difference divided by 8 gives the number of
	// equal bytes in the second 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x01
	JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	JB match_nolit_end_encodeSnappyBetterBlockAsm64K
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	// R8 = match offset (current position minus candidate position).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// No length/offset rejection is performed in this variant; the offset is
	// recorded and src[12(SP):DI] is emitted as a literal unless empty.
	// R9 = literal length, SI = length-1, R10 = literal source pointer.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI

	// Literal header: length-1 below 60 is stored in the tag itself (shifted
	// left by 2); otherwise tag 0xf0 or 0xf4 is followed by 1 or 2 bytes of
	// length-1.
	CMPL SI, $0x3c
	JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K

	// This second JB tests the same flags as the one above, so it is never
	// taken; the three-byte case is reached by fall-through.
	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K

three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX

	// Literals of at most 64 bytes use the short copy below.
	CMPL SI, $0x40
	JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	// SI = dst pointer after the literal.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Lengths up to 8 store a full 8 bytes; CX is then set to the exact
	// end from SI. Longer lengths use two overlapping head/tail moves.
	CMPQ R9, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last. The middle is copied in 32-byte steps; R14 = 64 - (CX AND 31), so
	// the MOVOA stores at CX+R14-32 fall on 32-byte boundaries.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R14
	SUBQ R11, R14
	DECQ R13
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(R10)(R14*1), R11
	LEAQ -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R11
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	// DX = end of match; R12 = full match length (adds back the first 4
	// bytes); everything before DX now counts as emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// Only 2-byte offsets are emitted in this variant.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	// While more than 64 bytes remain, emit tag 0xee with a 2-byte offset
	// and reduce the remaining length by 60.
	CMPL R12, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	// Lengths below 12 with offsets below 2048 use the 2-byte form: tag =
	// (4*R12 - 15) OR ((offset shifted right by 8) shifted left by 5),
	// then the low offset byte. Everything else uses the 3-byte form.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL R8, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	// 3-byte form: tag = 4*R12 - 2, then the 2-byte offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	// Finish if DX reached the search limit in 8(SP); return 0 if CX reached
	// the dst limit in (SP).
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	// Index positions inside the match just emitted. DI = match start + 1 and
	// R9 = DX - 2 go into the table at (AX) (16-bit index); DI+1 and R9+1 go
	// into the table at 262144(AX) (13-bit index).
	MOVQ $0x00cf1bbcdcbfa563, SI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(DI), DI
	LEAQ -2(DX), R9
	MOVQ (BX)(DI*1), R10
	MOVQ 1(BX)(DI*1), R11
	MOVQ (BX)(R9*1), R12
	MOVQ 1(BX)(R9*1), R13
	SHLQ $0x08, R10
	IMULQ SI, R10
	SHRQ $0x30, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x33, R11
	SHLQ $0x08, R12
	IMULQ SI, R12
	SHRQ $0x30, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x33, R13
	LEAQ 1(DI), R8
	LEAQ 1(R9), R14
	MOVL DI, (AX)(R10*4)
	MOVL R9, (AX)(R12*4)
	MOVL R8, 262144(AX)(R11*4)
	MOVL R14, 262144(AX)(R13*4)

	// R8 = midpoint of DI and R9. The loop below adds entries to the table at
	// (AX) for DI and R8, each advancing by 2, until R8 reaches R9.
	LEAQ 1(R9)(DI*1), R8
	SHRQ $0x01, R8
	ADDQ $0x01, DI
	SUBQ $0x01, R9

index_loop_encodeSnappyBetterBlockAsm64K:
	CMPQ R8, R9
	JAE search_loop_encodeSnappyBetterBlockAsm64K
	MOVQ (BX)(DI*1), R10
	MOVQ (BX)(R8*1), R11
	SHLQ $0x08, R10
	IMULQ SI, R10
	SHRQ $0x30, R10
	SHLQ $0x08, R11
	IMULQ SI, R11
	SHRQ $0x30, R11
	MOVL DI, (AX)(R10*4)
	MOVL R8, (AX)(R11*4)
	ADDQ $0x02, DI
	ADDQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return 0 if CX + remaining bytes (src_len - 12(SP)) + 3 reaches the
	// dst limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	// Emit src[12(SP):src_len] as a final literal unless it is empty.
	// SI = literal length, DX = length-1, AX = literal source pointer.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K

	// Same flags as the JB above, so never taken; the three-byte case is
	// reached by fall-through.
	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literal, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Every size class here stores exactly BX bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail go through X0-X3 (stored last); the middle is copied in
	// 32-byte steps with MOVOA stores to dst.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return value: bytes written = CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX

	// Zero tmp: 0x280 iterations of 128 bytes = 81920 bytes.
	MOVQ $0x00000280, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBetterBlockAsm12B
	MOVL $0x00000000, 12(SP)

	// 8(SP) = src_len-8; (SP) = dst_base + (src_len-9) - (src_len shifted right by 5).
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)
	MOVL $0x00000001, DX
	MOVL $0x00000000,
16(SP) MOVQ src_base+24(FP), BX search_loop_encodeSnappyBetterBlockAsm12B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x06, SI LEAL 1(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm12B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x34, R11 MOVL (AX)(R10*4), SI MOVL 65536(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 65536(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B CMPQ R11, DI JNE no_short_found_encodeSnappyBetterBlockAsm12B MOVL R8, SI JMP candidate_match_encodeSnappyBetterBlockAsm12B no_short_found_encodeSnappyBetterBlockAsm12B: CMPL R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B CMPL R11, DI JEQ candidateS_match_encodeSnappyBetterBlockAsm12B MOVL 20(SP), DX JMP search_loop_encodeSnappyBetterBlockAsm12B candidateS_match_encodeSnappyBetterBlockAsm12B: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B DECL DX MOVL R8, SI candidate_match_encodeSnappyBetterBlockAsm12B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B match_extend_back_loop_encodeSnappyBetterBlockAsm12B: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B match_extend_back_end_encodeSnappyBetterBlockAsm12B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBetterBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm12B: MOVL DX, DI ADDL 
$0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B MOVQ (R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R8, $0x01 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B JB match_nolit_end_encodeSnappyBetterBlockAsm12B MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeSnappyBetterBlockAsm12B 
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B LEAL 1(R12), R12 match_nolit_end_encodeSnappyBetterBlockAsm12B: MOVL DX, R8 SUBL SI, R8 // Check if repeat MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeSnappyBetterBlockAsm12B JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B one_byte_match_emit_encodeSnappyBetterBlockAsm12B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, CX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B MOVB 
$0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B CMPL R8, $0x00000800 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm12B CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 SHLQ $0x10, R12 IMULQ SI, R12 SHRQ $0x32, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x34, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 65536(AX)(R11*4) MOVL R14, 65536(AX)(R13*4) LEAQ 1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeSnappyBetterBlockAsm12B: CMPQ R8, R9 JAE search_loop_encodeSnappyBetterBlockAsm12B MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 SHLQ $0x10, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm12B emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B MOVQ 
$0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL 
MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU 
-32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET
// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal runs and back-reference copies,
// using two position tables stored in tmp. Returns the number of bytes written
// to dst, or 0 as soon as the output would reach the limit
// dst_base + len(src) - 9 - len(src)/32 (computed in the prologue).
// NOTE(review): callers presumably treat 0 as "fall back to another encoder";
// that is not visible from this function.
//
// tmp layout (20480 bytes, zeroed on entry):
//   [0, 16384)      4096 x uint32 src positions, indexed by a 12-bit hash of 6 bytes
//   [16384, 20480)  1024 x uint32 src positions, indexed by a 10-bit hash of 4 bytes
//
// Stack frame ($24):
//   0(SP)   8 bytes  dst limit pointer
//   8(SP)   4 bytes  search limit, len(src)-8
//   12(SP)  4 bytes  src position of the first byte not yet emitted
//   16(SP)  4 bytes  offset of the last match (written, never read in this function)
//   20(SP)  4 bytes  next search position if the current one yields no match
//
// Registers live across the main loop:
//   AX = tmp, BX = src base, CX = dst write pointer, DX = current src position.
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX

	// Zero tmp: 0xa0 (160) iterations x 128 bytes = 20480 bytes.
	MOVQ $0x000000a0, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ $0x80, BX
	DECQ DX
	JNZ zero_loop_encodeSnappyBetterBlockAsm10B

	// Nothing emitted yet.
	MOVL $0x00000000, 12(SP)

	// 8(SP) = len(src)-8; (SP) = dst + len(src) - 9 - len(src)>>5.
	MOVQ src_len+32(FP), DX
	LEAQ -9(DX), BX
	LEAQ -8(DX), SI
	MOVL SI, 8(SP)
	SHRQ $0x05, DX
	SUBL DX, BX
	LEAQ (CX)(BX*1), BX
	MOVQ BX, (SP)

	// Start searching at position 1 with a zero stored offset.
	MOVL $0x00000001, DX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm10B:
	// Next position = DX + 1 + (DX - 12(SP))>>5, so the step grows with the
	// distance since the last emitted byte. Leave the loop once it reaches
	// the search limit.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x05, SI
	LEAL 1(DX)(SI*1), SI
	CMPL SI, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm10B

	// DI = 8 bytes at src[DX]; remember the next position in 20(SP).
	MOVQ (BX)(DX*1), DI
	MOVL SI, 20(SP)

	// R10 = 12-bit hash of the low 6 bytes (shift left 16, multiply, keep top 12 bits).
	// R11 = 10-bit hash of the low 4 bytes (shift left 32, multiply, keep top 10 bits).
	MOVQ $0x0000cf1bbcdcbf9b, R9
	MOVQ $0x9e3779b1, SI
	MOVQ DI, R10
	MOVQ DI, R11
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	SHLQ $0x20, R11
	IMULQ SI, R11
	SHRQ $0x36, R11

	// SI = candidate from the 6-byte table, R8 = candidate from the 4-byte
	// table; both table slots are then overwritten with the current position.
	MOVL (AX)(R10*4), SI
	MOVL 16384(AX)(R11*4), R8
	MOVL DX, (AX)(R10*4)
	MOVL DX, 16384(AX)(R11*4)

	// Prefer a candidate whose full 8 bytes equal the current 8 bytes.
	MOVQ (BX)(SI*1), R10
	MOVQ (BX)(R8*1), R11
	CMPQ R10, DI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B
	CMPQ R11, DI
	JNE no_short_found_encodeSnappyBetterBlockAsm10B
	MOVL R8, SI
	JMP candidate_match_encodeSnappyBetterBlockAsm10B

no_short_found_encodeSnappyBetterBlockAsm10B:
	// Otherwise accept a 4-byte match (CMPL compares the low 32 bits only).
	CMPL R10, DI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL R11, DI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm10B

	// No match: advance to the precomputed next position.
	MOVL 20(SP), DX
	JMP search_loop_encodeSnappyBetterBlockAsm10B

candidateS_match_encodeSnappyBetterBlockAsm10B:
	// Only the 4-byte-table candidate matched. Before using it, probe the
	// 6-byte table at position DX+1: DI>>8 holds the bytes starting at DX+1.
	SHRQ $0x08, DI
	MOVQ DI, R10
	SHLQ $0x10, R10
	IMULQ R9, R10
	SHRQ $0x34, R10
	MOVL (AX)(R10*4), SI
	INCL DX
	MOVL DX, (AX)(R10*4)
	CMPL (BX)(SI*1), DI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B

	// Probe failed: restore DX and use the 4-byte-table candidate in R8.
	DECL DX
	MOVL R8, SI

candidate_match_encodeSnappyBetterBlockAsm10B:
	// Extend the match backwards while the preceding bytes are equal, without
	// going below the first unemitted position (DI) or below candidate
	// position 0.
	MOVL 12(SP), DI
	TESTL SI, SI
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B

match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	CMPL DX, DI
	JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(DX), DX
	DECL SI
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B

match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	// Return 0 if dst + pending literal length + 3 would reach the dst limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	// DI = match start. Skip the 4 bytes already compared on both sides, then
	// count further equal bytes: R8 = bytes left in src, R9/R10 = pointers to
	// the current and candidate data.
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 accumulates the number of matching bytes beyond the first 4.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Compare 16 bytes per iteration while at least 16 remain.
	CMPL R8, $0x10
	JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	XORQ 8(R10)(R12*1), R13
	JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
	// Mismatch in the second 8 bytes: index of lowest set bit / 8 is the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13
#else
	BSFQ R13, R13
#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Mismatch in an 8-byte word: lowest differing bit / 8 = equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11
#else
	BSFQ R11, R11
#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
	// R8 == 1: one byte left to test; R8 == 0: done; otherwise test 2 bytes.
	CMPL R8, $0x01
	JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	JB match_nolit_end_encodeSnappyBetterBlockAsm10B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	// R8 = match offset (current position minus candidate position; both were
	// advanced by 4 above, so the difference is unchanged).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is only stored here; no comparison against the previous
	// offset is performed in this function.
	MOVL R8, 16(SP)

	// Emit pending literals src[12(SP):DI], if any.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9

	// R9 = literal length, SI = length-1. Choose the header size:
	//   length-1 < 60   -> 1 byte:  (length-1)<<2
	//   length-1 < 256  -> 2 bytes: 0xf0, length-1
	//   otherwise       -> 3 bytes: 0xf4, 16-bit length-1
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B

	// This second JB can only be taken when the previous one was, so it never
	// fires; its target is also the fall-through.
	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B

three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX

	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 (1..64) bytes from R10 to CX using overlapping head/tail moves.
	CMPQ R9, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	// Lengths <= 8 always move a full 8 bytes; CX is set to the exact end (SI)
	// afterwards, so bytes past the length are later overwritten.
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ SI, CX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Load the first and last 32 bytes up front; they are stored last, after
	// the aligned middle has been copied.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3

	// R13 = length/32. R14 = 64 - (dst & 31), so dst+R14-32 is 32-byte aligned
	// for the MOVOA stores below.
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ CX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R14
	SUBQ R11, R14

	// DECQ leaves CF as set by the SUBQ above (no borrow, since R11 <= 31), so
	// JA is taken whenever R13-1 != 0.
	// NOTE(review): every jump to this label in this function has
	// length-1 >= 64, i.e. R13 >= 2, so the fall-through loop below looks
	// unreachable here - confirm against the generator.
	DECQ R13
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(R10)(R14*1), R11
	LEAQ -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R11
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration to aligned destinations while R14 <= length.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32

	// Finish with the saved unaligned head and tail.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	// DX = end of match; R12 = full match length (the extension plus the first
	// 4 bytes). Everything up to DX now counts as emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	// While more than 64 bytes remain, emit a 3-byte copy with tag 0xee
	// (== (60-1)<<2 | 2) plus the 16-bit offset, and subtract 60 from the length.
	CMPL R12, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Use the 2-byte form only when length < 12 and offset < 2048.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL R8, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B

	// 2-byte copy: tag = (length<<2) - 15 == ((length-4)<<2 | 1), OR'ed with
	// (offset>>8)<<5; second byte = low 8 bits of the offset.
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	// 3-byte copy: tag = (length<<2) - 2 == ((length-1)<<2 | 2), then the
	// 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	// Stop searching once past the search limit; return 0 if dst reached its limit.
	CMPL DX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPQ CX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	// Index positions inside the match just emitted. With DI = match start + 1
	// and R9 = match end - 2:
	//   6-byte table <- DI and R9
	//   4-byte table <- DI+1 and R9+1
	MOVQ $0x0000cf1bbcdcbf9b, SI
	MOVQ $0x9e3779b1, R8
	LEAQ 1(DI), DI
	LEAQ -2(DX), R9
	MOVQ (BX)(DI*1), R10
	MOVQ 1(BX)(DI*1), R11
	MOVQ (BX)(R9*1), R12
	MOVQ 1(BX)(R9*1), R13
	SHLQ $0x10, R10
	IMULQ SI, R10
	SHRQ $0x34, R10
	SHLQ $0x20, R11
	IMULQ R8, R11
	SHRQ $0x36, R11
	SHLQ $0x10, R12
	IMULQ SI, R12
	SHRQ $0x34, R12
	SHLQ $0x20, R13
	IMULQ R8, R13
	SHRQ $0x36, R13
	LEAQ 1(DI), R8
	LEAQ 1(R9), R14
	MOVL DI, (AX)(R10*4)
	MOVL R9, (AX)(R12*4)
	MOVL R8, 16384(AX)(R11*4)
	MOVL R14, 16384(AX)(R13*4)

	// R8 = (DI + R9 + 1)/2; then step DI forward and R9 back by one.
	LEAQ 1(R9)(DI*1), R8
	SHRQ $0x01, R8
	ADDQ $0x01, DI
	SUBQ $0x01, R9

index_loop_encodeSnappyBetterBlockAsm10B:
	// Add every second position to the 6-byte table from two cursors (DI and
	// R8) until R8 reaches R9, then resume searching at DX.
	CMPQ R8, R9
	JAE search_loop_encodeSnappyBetterBlockAsm10B
	MOVQ (BX)(DI*1), R10
	MOVQ (BX)(R8*1), R11
	SHLQ $0x10, R10
	IMULQ SI, R10
	SHRQ $0x34, R10
	SHLQ $0x10, R11
	IMULQ SI, R11
	SHRQ $0x34, R11
	MOVL DI, (AX)(R10*4)
	MOVL R8, (AX)(R11*4)
	ADDQ $0x02, DI
	ADDQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm10B

emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Return 0 if the remaining literals (plus 3) would reach the dst limit.
	// AX (tmp) is no longer needed and is reused from here on.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	// Emit src[12(SP):len(src)] as one literal run, if non-empty.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI

	// AX = source pointer, SI = length, DX = length-1. Header selection is the
	// same as for literals emitted before a match.
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Never taken (same flags as the JB above); target is the fall-through.
	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes; BX = length (src base is no
	// longer needed).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copy used before a match, every size class here moves exactly
	// BX bytes (overlapping head/tail moves), with no fixed 8-byte move for
	// short lengths.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, CX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes; BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy before a match: save head and tail, copy the
	// middle in 32-byte chunks to aligned destinations, then store head and tail.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ CX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8

	// NOTE(review): as above, this path is only entered with length-1 >= 64,
	// so DI >= 2 and JA appears to be always taken - confirm against the generator.
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(AX)(R8*1), SI
	LEAQ -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Return the number of bytes written: dst write pointer minus dst base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX MOVQ $0x00000028, DX MOVQ AX, BX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm8B: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0,
64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_encodeSnappyBetterBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), BX search_loop_encodeSnappyBetterBlockAsm8B: MOVL DX, SI SUBL 12(SP), SI SHRL $0x04, SI LEAL 1(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm8B MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ $0x9e3779b1, SI MOVQ DI, R10 MOVQ DI, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x36, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x38, R11 MOVL (AX)(R10*4), SI MOVL 4096(AX)(R11*4), R8 MOVL DX, (AX)(R10*4) MOVL DX, 4096(AX)(R11*4) MOVQ (BX)(SI*1), R10 MOVQ (BX)(R8*1), R11 CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B CMPQ R11, DI JNE no_short_found_encodeSnappyBetterBlockAsm8B MOVL R8, SI JMP candidate_match_encodeSnappyBetterBlockAsm8B no_short_found_encodeSnappyBetterBlockAsm8B: CMPL R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B CMPL R11, DI JEQ candidateS_match_encodeSnappyBetterBlockAsm8B MOVL 20(SP), DX JMP search_loop_encodeSnappyBetterBlockAsm8B candidateS_match_encodeSnappyBetterBlockAsm8B: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x36, R10 MOVL (AX)(R10*4), SI INCL DX MOVL DX, (AX)(R10*4) CMPL (BX)(SI*1), DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B DECL DX MOVL R8, SI candidate_match_encodeSnappyBetterBlockAsm8B: MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B match_extend_back_loop_encodeSnappyBetterBlockAsm8B: CMPL DX, DI JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B LEAL -1(DX), DX DECL SI JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B JMP 
match_extend_back_loop_encodeSnappyBetterBlockAsm8B match_extend_back_end_encodeSnappyBetterBlockAsm8B: MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm8B: MOVL DX, DI ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+32(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), R10 // matchLen XORL R12, R12 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R8, $0x10 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B MOVQ (R9)(R12*1), R11 MOVQ 8(R9)(R12*1), R13 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B XORQ 8(R10)(R12*1), R13 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B LEAL -16(R8), R8 LEAL 16(R12), R12 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R13, R13 #else BSFQ R13, R13 #endif SARQ $0x03, R13 LEAL 8(R12)(R13*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R8, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B LEAL -8(R8), R8 LEAL 8(R12), R12 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL (R12)(R11*1), R12 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R8, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B MOVL (R9)(R12*1), R11 CMPL (R10)(R12*1), R11 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B LEAL -4(R8), R8 LEAL 4(R12), R12 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R8, $0x01 JE 
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B JB match_nolit_end_encodeSnappyBetterBlockAsm8B MOVW (R9)(R12*1), R11 CMPW (R10)(R12*1), R11 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B LEAL 2(R12), R12 SUBL $0x02, R8 JZ match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B LEAL 1(R12), R12 match_nolit_end_encodeSnappyBetterBlockAsm8B: MOVL DX, R8 SUBL SI, R8 // Check if repeat MOVL R8, 16(SP) MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R10 SUBL SI, R9 LEAL -1(R9), SI CMPL SI, $0x3c JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B CMPL SI, $0x00000100 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: MOVB $0xf4, (CX) MOVW SI, 1(CX) ADDQ $0x03, CX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: MOVB $0xf0, (CX) MOVB SI, 1(CX) ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_match_emit_encodeSnappyBetterBlockAsm8B JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B one_byte_match_emit_encodeSnappyBetterBlockAsm8B: SHLB $0x02, SI MOVB SI, (CX) ADDQ $0x01, CX memmove_match_emit_encodeSnappyBetterBlockAsm8B: LEAQ (CX)(R9*1), SI // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: MOVQ (R10), R11 MOVQ R11, (CX) JMP 
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: MOVQ (R10), R11 MOVQ -8(R10)(R9*1), R10 MOVQ R11, (CX) MOVQ R10, -8(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: MOVOU (R10), X0 MOVOU -16(R10)(R9*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(R9*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: MOVQ SI, CX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: LEAQ (CX)(R9*1), SI // genMemMoveLong MOVOU (R10), X0 MOVOU 16(R10), X1 MOVOU -32(R10)(R9*1), X2 MOVOU -16(R10)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ CX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R14 SUBQ R11, R14 DECQ R13 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R10)(R14*1), R11 LEAQ -32(CX)(R14*1), R15 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R11 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R10)(R14*1), X4 MOVOU -16(R10)(R14*1), X5 MOVOA X4, -32(CX)(R14*1) MOVOA X5, -16(CX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(R9*1) MOVOU X3, -16(CX)(R9*1) MOVQ SI, 
CX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: ADDL R12, DX ADDL $0x04, R12 MOVL DX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R12, $0x40 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B MOVB $0xee, (CX) MOVW R8, 1(CX) LEAL -60(R12), R12 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: MOVL R12, SI SHLL $0x02, SI CMPL R12, $0x0c JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B LEAL -15(SI), SI MOVB R8, 1(CX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, SI MOVB SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -2(SI), SI MOVB SI, (CX) MOVW R8, 1(CX) ADDQ $0x03, CX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: CMPL DX, 8(SP) JAE emit_remainder_encodeSnappyBetterBlockAsm8B CMPQ CX, (SP) JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 LEAQ 1(DI), DI LEAQ -2(DX), R9 MOVQ (BX)(DI*1), R10 MOVQ 1(BX)(DI*1), R11 MOVQ (BX)(R9*1), R12 MOVQ 1(BX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 SHLQ $0x10, R12 IMULQ SI, R12 SHRQ $0x36, R12 SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x38, R13 LEAQ 1(DI), R8 LEAQ 1(R9), R14 MOVL DI, (AX)(R10*4) MOVL R9, (AX)(R12*4) MOVL R8, 4096(AX)(R11*4) MOVL R14, 4096(AX)(R13*4) LEAQ 1(R9)(DI*1), R8 SHRQ $0x01, R8 ADDQ $0x01, DI SUBQ $0x01, R9 index_loop_encodeSnappyBetterBlockAsm8B: CMPQ R8, R9 JAE search_loop_encodeSnappyBetterBlockAsm8B MOVQ (BX)(DI*1), R10 MOVQ (BX)(R8*1), R11 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 SHLQ $0x10, R11 IMULQ SI, R11 SHRQ $0x36, R11 MOVL DI, (AX)(R10*4) MOVL R8, (AX)(R11*4) ADDQ $0x02, DI ADDQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm8B 
emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+56(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), DX CMPL DX, $0x3c JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B CMPL DX, $0x00000100 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVB $0xf4, (CX) MOVW DX, 1(CX) ADDQ $0x03, CX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVB $0xf0, (CX) MOVB DL, 1(CX) ADDQ $0x02, CX CMPL DX, $0x40 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: SHLB $0x02, DL MOVB DL, (CX) ADDQ $0x01, CX memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: MOVB (AX), SI MOVB -1(AX)(BX*1), AL MOVB SI, (CX) MOVB AL, -1(CX)(BX*1) JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: MOVW (AX), SI MOVB 2(AX), AL MOVW SI, (CX) MOVB AL, 2(CX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: MOVL (AX), SI MOVL -4(AX)(BX*1), AX MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: MOVQ (AX), SI MOVQ -8(AX)(BX*1), AX MOVQ SI, (CX) MOVQ AX, -8(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: MOVOU (AX), X0 MOVOU -16(AX)(BX*1), X1 MOVOU X0, (CX) MOVOU X1, -16(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ DX, CX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: LEAQ (CX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU -32(AX)(BX*1), X2 MOVOU -16(AX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ CX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(AX)(R8*1), SI LEAQ -32(CX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA 
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(AX)(R8*1), X4 MOVOU -16(AX)(R8*1), X5 MOVOA X4, -32(CX)(R8*1) MOVOA X5, -16(CX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) MOVOU X2, -32(CX)(BX*1) MOVOU X3, -16(CX)(BX*1) MOVQ DX, CX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ dst_base+0(FP), AX SUBQ AX, CX MOVQ CX, ret+56(FP) RET // func calcBlockSize(src []byte, tmp *[32768]byte) int // Requires: BMI, SSE2 /* calcBlockSize scans src for repeats and hash-table matches but writes no output: CX starts at zero, is advanced only by the number of bytes each literal run or copy would occupy, and is returned at the end. It returns 0 as soon as the projected size is not below the limit kept at (SP). tmp is used as the hash table (4-byte entries) and is zeroed on entry. Stack slots: (SP) size limit, 8(SP) src_len-8 (search stops there), 12(SP) start of the literal run not yet accounted for, 16(SP) offset of the last match, 20(SP) next search position. Registers in the main loop: AX table, BX src base, DX current position. */ TEXT ·calcBlockSize(SB), $24-40 MOVQ tmp+24(FP), AX XORQ CX, CX MOVQ $0x00000100, DX MOVQ AX, BX PXOR X0, X0 /* Clear tmp: 0x100 iterations of 128 bytes = 32768 bytes. */ zero_loop_calcBlockSize: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0, 112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_calcBlockSize /* Setup: 12(SP) = 0, 8(SP) = src_len-8, (SP) = src_len-9-(src_len>>5) (CX is still 0 when added), DX = 1 as the first position and 16(SP) = 1 as the initial offset. */ MOVL $0x00000000, 12(SP) MOVQ src_len+8(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+0(FP), BX search_loop_calcBlockSize: /* Next position = DX + 4 + ((DX - 12(SP)) >> 5); once it reaches 8(SP) only the remainder is left. */ MOVL DX, SI SUBL 12(SP), SI SHRL $0x05, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_calcBlockSize MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) /* Hash the bytes at DX, DX+1 and DX+2: ((v << 16) * 0x0000cf1bbcdcbf9b) >> 51 is a 13-bit table index. The previous entries for DX and DX+1 are read into SI and R8 before DX and DX+1 are stored; R10 is left holding the index for DX+2. */ MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 SHLQ $0x10, R11 IMULQ R9, R11 SHRQ $0x33, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 /* Repeat check: the 4 bytes at DX+1 against the 4 bytes at DX+1-16(SP). */ MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_calcBlockSize LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_calcBlockSize
repeat_extend_back_loop_calcBlockSize: /* Extend the repeat backwards while the preceding bytes match; stops at 12(SP) or when the candidate index R8 reaches 0. */ CMPL DI, SI JBE repeat_extend_back_end_calcBlockSize MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_calcBlockSize LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_calcBlockSize repeat_extend_back_end_calcBlockSize: /* Return 0 unless CX + (DI - 12(SP)) + 5 is below the limit. */ MOVL DI, SI SUBL 12(SP), SI LEAQ 5(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_calcBlockSize MOVQ $0x00000000, ret+32(FP) RET repeat_dst_size_check_calcBlockSize: /* Account for the literal run from 12(SP) to DI (skipped when empty): R8 = run length; header size by length-1 is 1 byte below 60, 2 below 1<<8, 3 below 1<<16, 4 below 1<<24, otherwise 5; then CX += R8. 12(SP) is moved to DI. */ MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_calcBlockSize MOVL DI, R8 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R9 SUBL SI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JB one_byte_repeat_emit_calcBlockSize CMPL SI, $0x00000100 JB two_bytes_repeat_emit_calcBlockSize CMPL SI, $0x00010000 JB three_bytes_repeat_emit_calcBlockSize CMPL SI, $0x01000000 JB four_bytes_repeat_emit_calcBlockSize ADDQ $0x05, CX JMP memmove_long_repeat_emit_calcBlockSize four_bytes_repeat_emit_calcBlockSize: ADDQ $0x04, CX JMP memmove_long_repeat_emit_calcBlockSize three_bytes_repeat_emit_calcBlockSize: ADDQ $0x03, CX JMP memmove_long_repeat_emit_calcBlockSize two_bytes_repeat_emit_calcBlockSize: ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_calcBlockSize JMP memmove_long_repeat_emit_calcBlockSize one_byte_repeat_emit_calcBlockSize: ADDQ $0x01, CX memmove_repeat_emit_calcBlockSize: LEAQ (CX)(R8*1), CX JMP emit_literal_done_repeat_emit_calcBlockSize memmove_long_repeat_emit_calcBlockSize: LEAQ (CX)(R8*1), CX emit_literal_done_repeat_emit_calcBlockSize: /* Move DX past the 4 bytes compared above (DX += 5) and extend forward: R8 = bytes left in src, R9 = address of src[DX], SI = address of src[DX-16(SP)], R11 = number of matching bytes. */ ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+8(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen /* Compares 16 bytes per iteration, then 8, 4, 2 and 1; on a 64-bit mismatch the index of the lowest set bit of the XOR, shifted right by 3, is the count of equal bytes in that word. */ XORL R11, R11 matchlen_loopback_16_repeat_extend_calcBlockSize: CMPL R8, $0x10 JB matchlen_match8_repeat_extend_calcBlockSize MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_calcBlockSize LEAL -16(R8), R8 LEAL 16(R11), R11 JMP
matchlen_loopback_16_repeat_extend_calcBlockSize matchlen_bsf_16repeat_extend_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_calcBlockSize matchlen_match8_repeat_extend_calcBlockSize: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_calcBlockSize MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_calcBlockSize matchlen_bsf_8_repeat_extend_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_calcBlockSize matchlen_match4_repeat_extend_calcBlockSize: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_calcBlockSize MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_calcBlockSize LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_calcBlockSize: CMPL R8, $0x01 JE matchlen_match1_repeat_extend_calcBlockSize JB repeat_extend_forward_end_calcBlockSize MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_calcBlockSize LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_calcBlockSize matchlen_match1_repeat_extend_calcBlockSize: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_calcBlockSize LEAL 1(R11), R11 repeat_extend_forward_end_calcBlockSize: /* DX = end of the match; SI = DX - DI is the full length measured from the backward-extended start; DI is reloaded with the offset from 16(SP). */ ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitCopy /* Copy size for offsets of 1<<16 or more: 5 bytes for every 64-byte chunk removed from the length, plus 5 bytes for any non-zero rest. Smaller offsets take the two_byte_offset path. */ CMPL DI, $0x00010000 JB two_byte_offset_repeat_as_copy_calcBlockSize four_bytes_loop_back_repeat_as_copy_calcBlockSize: CMPL SI, $0x40 JBE four_bytes_remain_repeat_as_copy_calcBlockSize LEAL -64(SI), SI ADDQ $0x05, CX CMPL SI, $0x04 JB four_bytes_remain_repeat_as_copy_calcBlockSize JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize four_bytes_remain_repeat_as_copy_calcBlockSize: TESTL SI, SI JZ repeat_end_emit_calcBlockSize XORL SI, SI ADDQ $0x05, CX JMP
repeat_end_emit_calcBlockSize two_byte_offset_repeat_as_copy_calcBlockSize: /* Offsets below 1<<16: while the length exceeds 64, count 3 bytes and take 60 off the length; the final part counts 3 bytes if its length is 12 or more or the offset is 2048 or more, otherwise 2 bytes. */ CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_calcBlockSize LEAL -60(SI), SI ADDQ $0x03, CX JMP two_byte_offset_repeat_as_copy_calcBlockSize two_byte_offset_short_repeat_as_copy_calcBlockSize: MOVL SI, R8 SHLL $0x02, R8 CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_calcBlockSize CMPL DI, $0x00000800 JAE emit_copy_three_repeat_as_copy_calcBlockSize ADDQ $0x02, CX JMP repeat_end_emit_calcBlockSize emit_copy_three_repeat_as_copy_calcBlockSize: ADDQ $0x03, CX repeat_end_emit_calcBlockSize: /* Everything before DX is now accounted for. */ MOVL DX, 12(SP) JMP search_loop_calcBlockSize no_repeat_found_calcBlockSize: /* Try the table candidates in turn, each confirmed by a 4-byte compare: SI against the bytes at DX, R8 against the bytes at DX+1, then the old entry of the DX+2 slot (which is overwritten with DX+2) against the bytes at DX+2. With no hit, continue at the position saved in 20(SP). */ CMPL (BX)(SI*1), DI JEQ candidate_match_calcBlockSize SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_calcBlockSize MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_calcBlockSize MOVL 20(SP), DX JMP search_loop_calcBlockSize candidate3_match_calcBlockSize: ADDL $0x02, DX JMP candidate_match_calcBlockSize candidate2_match_calcBlockSize: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_calcBlockSize: /* DX = match position, SI = candidate position. Extend backwards while bytes match, bounded by 12(SP) and by SI reaching 0. */ MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_calcBlockSize match_extend_back_loop_calcBlockSize: CMPL DX, DI JBE match_extend_back_end_calcBlockSize MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_calcBlockSize LEAL -1(DX), DX DECL SI JZ match_extend_back_end_calcBlockSize JMP match_extend_back_loop_calcBlockSize match_extend_back_end_calcBlockSize: /* Return 0 unless CX + (DX - 12(SP)) + 5 is below the limit. */ MOVL DX, DI SUBL 12(SP), DI LEAQ 5(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_calcBlockSize MOVQ $0x00000000, ret+32(FP) RET match_dst_size_check_calcBlockSize: /* Account for the literal run from 12(SP) to DX: R9 = run length, header of 1 to 5 bytes chosen by length-1 (thresholds 60, 1<<8, 1<<16, 1<<24), then CX += R9. */ MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_calcBlockSize MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), DI CMPL DI, $0x3c JB one_byte_match_emit_calcBlockSize CMPL DI, $0x00000100 JB two_bytes_match_emit_calcBlockSize CMPL DI, $0x00010000
JB three_bytes_match_emit_calcBlockSize CMPL DI, $0x01000000 JB four_bytes_match_emit_calcBlockSize ADDQ $0x05, CX JMP memmove_long_match_emit_calcBlockSize four_bytes_match_emit_calcBlockSize: ADDQ $0x04, CX JMP memmove_long_match_emit_calcBlockSize three_bytes_match_emit_calcBlockSize: ADDQ $0x03, CX JMP memmove_long_match_emit_calcBlockSize two_bytes_match_emit_calcBlockSize: ADDQ $0x02, CX CMPL DI, $0x40 JB memmove_match_emit_calcBlockSize JMP memmove_long_match_emit_calcBlockSize one_byte_match_emit_calcBlockSize: ADDQ $0x01, CX memmove_match_emit_calcBlockSize: LEAQ (CX)(R9*1), CX JMP emit_literal_done_match_emit_calcBlockSize memmove_long_match_emit_calcBlockSize: LEAQ (CX)(R9*1), CX emit_literal_done_match_emit_calcBlockSize: match_nolit_loop_calcBlockSize: /* Store the offset DX - SI in 16(SP), skip the 4 bytes already compared on both sides, and measure the rest of the match into R10 (DI = bytes left in src). */ MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+8(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_calcBlockSize: CMPL DI, $0x10 JB matchlen_match8_match_nolit_calcBlockSize MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_calcBlockSize XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_calcBlockSize LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_calcBlockSize matchlen_bsf_16match_nolit_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_calcBlockSize matchlen_match8_match_nolit_calcBlockSize: CMPL DI, $0x08 JB matchlen_match4_match_nolit_calcBlockSize MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_calcBlockSize LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_calcBlockSize matchlen_bsf_8_match_nolit_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_calcBlockSize matchlen_match4_match_nolit_calcBlockSize: CMPL DI, $0x04 JB
matchlen_match2_match_nolit_calcBlockSize MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_calcBlockSize LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_calcBlockSize: CMPL DI, $0x01 JE matchlen_match1_match_nolit_calcBlockSize JB match_nolit_end_calcBlockSize MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_calcBlockSize LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_calcBlockSize matchlen_match1_match_nolit_calcBlockSize: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_calcBlockSize LEAL 1(R10), R10 match_nolit_end_calcBlockSize: /* DX = end of the match, SI = offset, R10 + 4 = copy length, 12(SP) = DX. The copy is then sized: offsets of 1<<16 or more cost 5 bytes per 64-byte chunk plus 5 for a non-zero rest; smaller offsets cost 3 bytes per 60 bytes while the length exceeds 64, then 3 bytes if the rest is 12 or longer or the offset is 2048 or more, else 2. */ ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy CMPL SI, $0x00010000 JB two_byte_offset_match_nolit_calcBlockSize four_bytes_loop_back_match_nolit_calcBlockSize: CMPL R10, $0x40 JBE four_bytes_remain_match_nolit_calcBlockSize LEAL -64(R10), R10 ADDQ $0x05, CX CMPL R10, $0x04 JB four_bytes_remain_match_nolit_calcBlockSize JMP four_bytes_loop_back_match_nolit_calcBlockSize four_bytes_remain_match_nolit_calcBlockSize: TESTL R10, R10 JZ match_nolit_emitcopy_end_calcBlockSize XORL SI, SI ADDQ $0x05, CX JMP match_nolit_emitcopy_end_calcBlockSize two_byte_offset_match_nolit_calcBlockSize: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_calcBlockSize LEAL -60(R10), R10 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_calcBlockSize two_byte_offset_short_match_nolit_calcBlockSize: MOVL R10, DI SHLL $0x02, DI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_calcBlockSize CMPL SI, $0x00000800 JAE emit_copy_three_match_nolit_calcBlockSize ADDQ $0x02, CX JMP match_nolit_emitcopy_end_calcBlockSize emit_copy_three_match_nolit_calcBlockSize: ADDQ $0x03, CX match_nolit_emitcopy_end_calcBlockSize: /* Finish with the remainder once DX reaches 8(SP); otherwise load the 8 bytes at DX-2 and return 0 if CX is not below the limit. */ CMPL DX, 8(SP) JAE emit_remainder_calcBlockSize MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_calcBlockSize MOVQ $0x00000000, ret+32(FP) RET match_nolit_dst_ok_calcBlockSize: /* Hash the bytes at DX-2 (R8) and at DX (SI) so both positions can be stored in the table. */ MOVQ $0x0000cf1bbcdcbf9b, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI
SHLQ $0x10, R8 IMULQ R9, R8 SHRQ $0x33, R8 SHLQ $0x10, SI IMULQ R9, SI SHRQ $0x33, SI /* calcBlockSize: store DX-2 and DX in the table; if the entry previously held in DX's slot points at the same 4 bytes as those at DX, go straight to the next match, otherwise resume searching at DX+1. */ LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_calcBlockSize INCL DX JMP search_loop_calcBlockSize emit_remainder_calcBlockSize: /* calcBlockSize tail: return 0 unless CX + (src_len - 12(SP)) + 5 is below the limit at (SP); otherwise add the literal header size (1 to 5 bytes) and the remaining literal length to CX and return CX. */ MOVQ src_len+8(FP), AX SUBL 12(SP), AX LEAQ 5(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_calcBlockSize MOVQ $0x00000000, ret+32(FP) RET emit_remainder_ok_calcBlockSize: MOVQ src_len+8(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_calcBlockSize MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), AX CMPL AX, $0x3c JB one_byte_emit_remainder_calcBlockSize CMPL AX, $0x00000100 JB two_bytes_emit_remainder_calcBlockSize CMPL AX, $0x00010000 JB three_bytes_emit_remainder_calcBlockSize CMPL AX, $0x01000000 JB four_bytes_emit_remainder_calcBlockSize ADDQ $0x05, CX JMP memmove_long_emit_remainder_calcBlockSize four_bytes_emit_remainder_calcBlockSize: ADDQ $0x04, CX JMP memmove_long_emit_remainder_calcBlockSize three_bytes_emit_remainder_calcBlockSize: ADDQ $0x03, CX JMP memmove_long_emit_remainder_calcBlockSize two_bytes_emit_remainder_calcBlockSize: ADDQ $0x02, CX CMPL AX, $0x40 JB memmove_emit_remainder_calcBlockSize JMP memmove_long_emit_remainder_calcBlockSize one_byte_emit_remainder_calcBlockSize: ADDQ $0x01, CX memmove_emit_remainder_calcBlockSize: LEAQ (CX)(SI*1), AX MOVQ AX, CX JMP emit_literal_done_emit_remainder_calcBlockSize memmove_long_emit_remainder_calcBlockSize: LEAQ (CX)(SI*1), AX MOVQ AX, CX emit_literal_done_emit_remainder_calcBlockSize: MOVQ CX, ret+32(FP) RET // func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int // Requires: BMI, SSE2 /* calcBlockSizeSmall computes an encoded size without writing output, using a 2048-byte table: CX starts at zero, grows by the size of every literal run and copy, and is returned. It returns 0 when the projected size is not below the limit at (SP) = src_len-9-(src_len>>5). Specifics visible in this function: hash ((v << 32) * 0x9e3779b1) >> 55 (9-bit index, 4-byte entries), search skip shift of 4, a margin of 3 in the limit checks, literal headers counted as 1 to 3 bytes, and copies always sized on the two-byte-offset path. Stack slots: (SP) limit, 8(SP) src_len-8, 12(SP) start of unaccounted literals, 16(SP) last offset, 20(SP) next search position. */ TEXT ·calcBlockSizeSmall(SB), $24-40 MOVQ tmp+24(FP), AX XORQ CX, CX MOVQ $0x00000010, DX MOVQ AX, BX PXOR X0, X0 /* Clear tmp: 0x10 iterations of 128 bytes = 2048 bytes. */ zero_loop_calcBlockSizeSmall: MOVOU X0, (BX) MOVOU X0, 16(BX) MOVOU X0, 32(BX) MOVOU X0, 48(BX) MOVOU X0, 64(BX) MOVOU X0, 80(BX) MOVOU X0, 96(BX) MOVOU X0,
112(BX) ADDQ $0x80, BX DECQ DX JNZ zero_loop_calcBlockSizeSmall /* Setup: 12(SP) = 0, 8(SP) = src_len-8, (SP) = size limit, DX = 1 (first position), 16(SP) = 1 (initial offset), BX = src base. */ MOVL $0x00000000, 12(SP) MOVQ src_len+8(FP), DX LEAQ -9(DX), BX LEAQ -8(DX), SI MOVL SI, 8(SP) SHRQ $0x05, DX SUBL DX, BX LEAQ (CX)(BX*1), BX MOVQ BX, (SP) MOVL $0x00000001, DX MOVL DX, 16(SP) MOVQ src_base+0(FP), BX search_loop_calcBlockSizeSmall: /* Next position = DX + 4 + ((DX - 12(SP)) >> 4); at or beyond 8(SP) only the remainder is left. */ MOVL DX, SI SUBL 12(SP), SI SHRL $0x04, SI LEAL 4(DX)(SI*1), SI CMPL SI, 8(SP) JAE emit_remainder_calcBlockSizeSmall MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) /* Hash the bytes at DX, DX+1 and DX+2; old entries for DX and DX+1 go to SI and R8 before being replaced, R10 ends up as the index for DX+2. */ MOVQ $0x9e3779b1, R9 MOVQ DI, R10 MOVQ DI, R11 SHRQ $0x08, R11 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x37, R10 SHLQ $0x20, R11 IMULQ R9, R11 SHRQ $0x37, R11 MOVL (AX)(R10*4), SI MOVL (AX)(R11*4), R8 MOVL DX, (AX)(R10*4) LEAL 1(DX), R10 MOVL R10, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x37, R10 /* Repeat check: 4 bytes at DX+1 against 4 bytes at DX+1-16(SP). */ MOVL DX, R9 SUBL 16(SP), R9 MOVL 1(BX)(R9*1), R11 MOVQ DI, R9 SHRQ $0x08, R9 CMPL R9, R11 JNE no_repeat_found_calcBlockSizeSmall LEAL 1(DX), DI MOVL 12(SP), SI MOVL DI, R8 SUBL 16(SP), R8 JZ repeat_extend_back_end_calcBlockSizeSmall repeat_extend_back_loop_calcBlockSizeSmall: /* Extend the repeat backwards, bounded by 12(SP) and by R8 reaching 0. */ CMPL DI, SI JBE repeat_extend_back_end_calcBlockSizeSmall MOVB -1(BX)(R8*1), R9 MOVB -1(BX)(DI*1), R10 CMPB R9, R10 JNE repeat_extend_back_end_calcBlockSizeSmall LEAL -1(DI), DI DECL R8 JNZ repeat_extend_back_loop_calcBlockSizeSmall repeat_extend_back_end_calcBlockSizeSmall: /* Return 0 unless CX + (DI - 12(SP)) + 3 is below the limit. */ MOVL DI, SI SUBL 12(SP), SI LEAQ 3(CX)(SI*1), SI CMPQ SI, (SP) JB repeat_dst_size_check_calcBlockSizeSmall MOVQ $0x00000000, ret+32(FP) RET repeat_dst_size_check_calcBlockSizeSmall: /* Literal run from 12(SP) to DI: header is 1 byte for length-1 below 60, 2 bytes below 1<<8, otherwise 3; then CX += run length. NOTE(review): the JB to three_bytes targets the very next instruction, so both outcomes of that branch reach the 3-byte case. */ MOVL 12(SP), SI CMPL SI, DI JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall MOVL DI, R8 MOVL DI, 12(SP) LEAQ (BX)(SI*1), R9 SUBL SI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JB one_byte_repeat_emit_calcBlockSizeSmall CMPL SI, $0x00000100 JB two_bytes_repeat_emit_calcBlockSizeSmall JB three_bytes_repeat_emit_calcBlockSizeSmall three_bytes_repeat_emit_calcBlockSizeSmall: ADDQ $0x03, CX JMP memmove_long_repeat_emit_calcBlockSizeSmall
two_bytes_repeat_emit_calcBlockSizeSmall: ADDQ $0x02, CX CMPL SI, $0x40 JB memmove_repeat_emit_calcBlockSizeSmall JMP memmove_long_repeat_emit_calcBlockSizeSmall one_byte_repeat_emit_calcBlockSizeSmall: ADDQ $0x01, CX memmove_repeat_emit_calcBlockSizeSmall: LEAQ (CX)(R8*1), CX JMP emit_literal_done_repeat_emit_calcBlockSizeSmall memmove_long_repeat_emit_calcBlockSizeSmall: LEAQ (CX)(R8*1), CX emit_literal_done_repeat_emit_calcBlockSizeSmall: /* DX += 5 skips the bytes already compared; then extend forward with R8 = bytes left in src, R9 = address of src[DX], SI = address of src[DX-16(SP)], R11 = matched length. */ ADDL $0x05, DX MOVL DX, SI SUBL 16(SP), SI MOVQ src_len+8(FP), R8 SUBL DX, R8 LEAQ (BX)(DX*1), R9 LEAQ (BX)(SI*1), SI // matchLen /* 16 bytes per iteration, then 8, 4, 2 and 1; on a 64-bit mismatch the lowest set bit index of the XOR, shifted right by 3, gives the equal byte count. */ XORL R11, R11 matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: CMPL R8, $0x10 JB matchlen_match8_repeat_extend_calcBlockSizeSmall MOVQ (R9)(R11*1), R10 MOVQ 8(R9)(R11*1), R12 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall XORQ 8(SI)(R11*1), R12 JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall LEAL -16(R8), R8 LEAL 16(R11), R11 JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall matchlen_bsf_16repeat_extend_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R12, R12 #else BSFQ R12, R12 #endif SARQ $0x03, R12 LEAL 8(R11)(R12*1), R11 JMP repeat_extend_forward_end_calcBlockSizeSmall matchlen_match8_repeat_extend_calcBlockSizeSmall: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_calcBlockSizeSmall MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall LEAL -8(R8), R8 LEAL 8(R11), R11 JMP matchlen_match4_repeat_extend_calcBlockSizeSmall matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_calcBlockSizeSmall matchlen_match4_repeat_extend_calcBlockSizeSmall: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_calcBlockSizeSmall MOVL (R9)(R11*1), R10 CMPL (SI)(R11*1), R10 JNE matchlen_match2_repeat_extend_calcBlockSizeSmall LEAL -4(R8), R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_calcBlockSizeSmall: CMPL
R8, $0x01 JE matchlen_match1_repeat_extend_calcBlockSizeSmall JB repeat_extend_forward_end_calcBlockSizeSmall MOVW (R9)(R11*1), R10 CMPW (SI)(R11*1), R10 JNE matchlen_match1_repeat_extend_calcBlockSizeSmall LEAL 2(R11), R11 SUBL $0x02, R8 JZ repeat_extend_forward_end_calcBlockSizeSmall matchlen_match1_repeat_extend_calcBlockSizeSmall: MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_calcBlockSizeSmall LEAL 1(R11), R11 repeat_extend_forward_end_calcBlockSizeSmall: /* DX = end of the match, SI = DX - DI = full length. Copy size: 3 bytes per 60 bytes while the length exceeds 64, then 3 bytes if the rest is 12 or longer, else 2; the offset is not consulted here. */ ADDL R11, DX MOVL DX, SI SUBL DI, SI MOVL 16(SP), DI // emitCopy two_byte_offset_repeat_as_copy_calcBlockSizeSmall: CMPL SI, $0x40 JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall LEAL -60(SI), SI ADDQ $0x03, CX JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: MOVL SI, DI SHLL $0x02, DI CMPL SI, $0x0c JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall ADDQ $0x02, CX JMP repeat_end_emit_calcBlockSizeSmall emit_copy_three_repeat_as_copy_calcBlockSizeSmall: ADDQ $0x03, CX repeat_end_emit_calcBlockSizeSmall: MOVL DX, 12(SP) JMP search_loop_calcBlockSizeSmall no_repeat_found_calcBlockSizeSmall: /* Candidates are tried in order, each confirmed by a 4-byte compare: SI at DX, R8 at DX+1, then the old entry of the DX+2 slot (replaced by DX+2) at DX+2. Without a hit the search continues at 20(SP). */ CMPL (BX)(SI*1), DI JEQ candidate_match_calcBlockSizeSmall SHRQ $0x08, DI MOVL (AX)(R10*4), SI LEAL 2(DX), R9 CMPL (BX)(R8*1), DI JEQ candidate2_match_calcBlockSizeSmall MOVL R9, (AX)(R10*4) SHRQ $0x08, DI CMPL (BX)(SI*1), DI JEQ candidate3_match_calcBlockSizeSmall MOVL 20(SP), DX JMP search_loop_calcBlockSizeSmall candidate3_match_calcBlockSizeSmall: ADDL $0x02, DX JMP candidate_match_calcBlockSizeSmall candidate2_match_calcBlockSizeSmall: MOVL R9, (AX)(R10*4) INCL DX MOVL R8, SI candidate_match_calcBlockSizeSmall: /* Extend the match at DX (candidate SI) backwards, bounded by 12(SP) and by SI reaching 0. */ MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_calcBlockSizeSmall match_extend_back_loop_calcBlockSizeSmall: CMPL DX, DI JBE match_extend_back_end_calcBlockSizeSmall MOVB -1(BX)(SI*1), R8 MOVB -1(BX)(DX*1), R9 CMPB R8, R9 JNE match_extend_back_end_calcBlockSizeSmall LEAL -1(DX), DX DECL SI JZ
match_extend_back_end_calcBlockSizeSmall JMP match_extend_back_loop_calcBlockSizeSmall match_extend_back_end_calcBlockSizeSmall: /* Return 0 unless CX + (DX - 12(SP)) + 3 is below the limit. */ MOVL DX, DI SUBL 12(SP), DI LEAQ 3(CX)(DI*1), DI CMPQ DI, (SP) JB match_dst_size_check_calcBlockSizeSmall MOVQ $0x00000000, ret+32(FP) RET match_dst_size_check_calcBlockSizeSmall: /* Literal run from 12(SP) to DX: 1, 2 or 3 header bytes plus R9 literal bytes are added to CX. */ MOVL DX, DI MOVL 12(SP), R8 CMPL R8, DI JEQ emit_literal_done_match_emit_calcBlockSizeSmall MOVL DI, R9 MOVL DI, 12(SP) LEAQ (BX)(R8*1), DI SUBL R8, R9 LEAL -1(R9), DI CMPL DI, $0x3c JB one_byte_match_emit_calcBlockSizeSmall CMPL DI, $0x00000100 JB two_bytes_match_emit_calcBlockSizeSmall JB three_bytes_match_emit_calcBlockSizeSmall three_bytes_match_emit_calcBlockSizeSmall: ADDQ $0x03, CX JMP memmove_long_match_emit_calcBlockSizeSmall two_bytes_match_emit_calcBlockSizeSmall: ADDQ $0x02, CX CMPL DI, $0x40 JB memmove_match_emit_calcBlockSizeSmall JMP memmove_long_match_emit_calcBlockSizeSmall one_byte_match_emit_calcBlockSizeSmall: ADDQ $0x01, CX memmove_match_emit_calcBlockSizeSmall: LEAQ (CX)(R9*1), CX JMP emit_literal_done_match_emit_calcBlockSizeSmall memmove_long_match_emit_calcBlockSizeSmall: LEAQ (CX)(R9*1), CX emit_literal_done_match_emit_calcBlockSizeSmall: match_nolit_loop_calcBlockSizeSmall: /* Store the offset DX - SI in 16(SP), skip the 4 compared bytes on both sides and measure the rest of the match into R10. */ MOVL DX, DI SUBL SI, DI MOVL DI, 16(SP) ADDL $0x04, DX ADDL $0x04, SI MOVQ src_len+8(FP), DI SUBL DX, DI LEAQ (BX)(DX*1), R8 LEAQ (BX)(SI*1), SI // matchLen XORL R10, R10 matchlen_loopback_16_match_nolit_calcBlockSizeSmall: CMPL DI, $0x10 JB matchlen_match8_match_nolit_calcBlockSizeSmall MOVQ (R8)(R10*1), R9 MOVQ 8(R8)(R10*1), R11 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall LEAL -16(DI), DI LEAL 16(R10), R10 JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall matchlen_bsf_16match_nolit_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R11, R11 #else BSFQ R11, R11 #endif SARQ $0x03, R11 LEAL 8(R10)(R11*1), R10 JMP match_nolit_end_calcBlockSizeSmall
matchlen_match8_match_nolit_calcBlockSizeSmall: CMPL DI, $0x08 JB matchlen_match4_match_nolit_calcBlockSizeSmall MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_match_nolit_calcBlockSizeSmall matchlen_bsf_8_match_nolit_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP match_nolit_end_calcBlockSizeSmall matchlen_match4_match_nolit_calcBlockSizeSmall: CMPL DI, $0x04 JB matchlen_match2_match_nolit_calcBlockSizeSmall MOVL (R8)(R10*1), R9 CMPL (SI)(R10*1), R9 JNE matchlen_match2_match_nolit_calcBlockSizeSmall LEAL -4(DI), DI LEAL 4(R10), R10 matchlen_match2_match_nolit_calcBlockSizeSmall: CMPL DI, $0x01 JE matchlen_match1_match_nolit_calcBlockSizeSmall JB match_nolit_end_calcBlockSizeSmall MOVW (R8)(R10*1), R9 CMPW (SI)(R10*1), R9 JNE matchlen_match1_match_nolit_calcBlockSizeSmall LEAL 2(R10), R10 SUBL $0x02, DI JZ match_nolit_end_calcBlockSizeSmall matchlen_match1_match_nolit_calcBlockSizeSmall: MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_calcBlockSizeSmall LEAL 1(R10), R10 match_nolit_end_calcBlockSizeSmall: /* DX = end of the match, R10 + 4 = copy length, 12(SP) = DX. Copy size: 3 bytes per 60 bytes while the length exceeds 64, then 3 bytes if the rest is 12 or longer, else 2. */ ADDL R10, DX MOVL 16(SP), SI ADDL $0x04, R10 MOVL DX, 12(SP) // emitCopy two_byte_offset_match_nolit_calcBlockSizeSmall: CMPL R10, $0x40 JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall LEAL -60(R10), R10 ADDQ $0x03, CX JMP two_byte_offset_match_nolit_calcBlockSizeSmall two_byte_offset_short_match_nolit_calcBlockSizeSmall: MOVL R10, SI SHLL $0x02, SI CMPL R10, $0x0c JAE emit_copy_three_match_nolit_calcBlockSizeSmall ADDQ $0x02, CX JMP match_nolit_emitcopy_end_calcBlockSizeSmall emit_copy_three_match_nolit_calcBlockSizeSmall: ADDQ $0x03, CX match_nolit_emitcopy_end_calcBlockSizeSmall: /* Finish with the remainder once DX reaches 8(SP); otherwise load the 8 bytes at DX-2 and return 0 if CX is not below the limit. */ CMPL DX, 8(SP) JAE emit_remainder_calcBlockSizeSmall MOVQ -2(BX)(DX*1), DI CMPQ CX, (SP) JB match_nolit_dst_ok_calcBlockSizeSmall MOVQ $0x00000000, ret+32(FP) RET
match_nolit_dst_ok_calcBlockSizeSmall: /* calcBlockSizeSmall: hash the bytes at DX-2 and at DX, store both positions in the table, and if the entry previously held in DX's slot points at the same 4 bytes as those at DX, continue with another match; otherwise resume searching at DX+1. */ MOVQ $0x9e3779b1, R9 MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, SI SHLQ $0x20, R8 IMULQ R9, R8 SHRQ $0x37, R8 SHLQ $0x20, SI IMULQ R9, SI SHRQ $0x37, SI LEAL -2(DX), R9 LEAQ (AX)(SI*4), R10 MOVL (R10), SI MOVL R9, (AX)(R8*4) MOVL DX, (R10) CMPL (BX)(SI*1), DI JEQ match_nolit_loop_calcBlockSizeSmall INCL DX JMP search_loop_calcBlockSizeSmall emit_remainder_calcBlockSizeSmall: /* calcBlockSizeSmall tail: return 0 unless CX + (src_len - 12(SP)) + 3 is below the limit at (SP); otherwise add a 1 to 3 byte literal header plus the remaining literal length to CX and return CX. */ MOVQ src_len+8(FP), AX SUBL 12(SP), AX LEAQ 3(CX)(AX*1), AX CMPQ AX, (SP) JB emit_remainder_ok_calcBlockSizeSmall MOVQ $0x00000000, ret+32(FP) RET emit_remainder_ok_calcBlockSizeSmall: MOVQ src_len+8(FP), AX MOVL 12(SP), DX CMPL DX, AX JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall MOVL AX, SI MOVL AX, 12(SP) LEAQ (BX)(DX*1), AX SUBL DX, SI LEAL -1(SI), AX CMPL AX, $0x3c JB one_byte_emit_remainder_calcBlockSizeSmall CMPL AX, $0x00000100 JB two_bytes_emit_remainder_calcBlockSizeSmall JB three_bytes_emit_remainder_calcBlockSizeSmall three_bytes_emit_remainder_calcBlockSizeSmall: ADDQ $0x03, CX JMP memmove_long_emit_remainder_calcBlockSizeSmall two_bytes_emit_remainder_calcBlockSizeSmall: ADDQ $0x02, CX CMPL AX, $0x40 JB memmove_emit_remainder_calcBlockSizeSmall JMP memmove_long_emit_remainder_calcBlockSizeSmall one_byte_emit_remainder_calcBlockSizeSmall: ADDQ $0x01, CX memmove_emit_remainder_calcBlockSizeSmall: LEAQ (CX)(SI*1), AX MOVQ AX, CX JMP emit_literal_done_emit_remainder_calcBlockSizeSmall memmove_long_emit_remainder_calcBlockSizeSmall: LEAQ (CX)(SI*1), AX MOVQ AX, CX emit_literal_done_emit_remainder_calcBlockSizeSmall: MOVQ CX, ret+32(FP) RET // func emitLiteral(dst []byte, lit []byte) int // Requires: SSE2 /* emitLiteral writes a literal header followed by the bytes of lit at dst_base and returns the number of bytes written (header size + len(lit)); an empty lit writes nothing and returns 0. Only dst_base is read, so dst is not bounds-checked here. Registers: AX write pointer, CX lit pointer, DX len(lit), BX return value, SI = len(lit)-1. Header by SI: below 60 one byte SI<<2; below 1<<8 0xf0 plus 1 length byte; below 1<<16 0xf4 plus 2; below 1<<24 0xf8 plus 3; otherwise 0xfc plus 4. */ TEXT ·emitLiteral(SB), NOSPLIT, $0-56 MOVQ lit_len+32(FP), DX MOVQ dst_base+0(FP), AX MOVQ lit_base+24(FP), CX TESTQ DX, DX JZ emit_literal_end_standalone_skip MOVL DX, BX LEAL -1(DX), SI CMPL SI, $0x3c JB one_byte_standalone CMPL SI, $0x00000100 JB two_bytes_standalone CMPL SI, $0x00010000 JB three_bytes_standalone CMPL SI, $0x01000000 JB
four_bytes_standalone MOVB $0xfc, (AX) MOVL SI, 1(AX) ADDQ $0x05, BX ADDQ $0x05, AX JMP memmove_long_standalone four_bytes_standalone: /* 0xf8 then the low 24 bits of SI (16-bit store plus the byte SI>>16). */ MOVL SI, DI SHRL $0x10, DI MOVB $0xf8, (AX) MOVW SI, 1(AX) MOVB DI, 3(AX) ADDQ $0x04, BX ADDQ $0x04, AX JMP memmove_long_standalone three_bytes_standalone: MOVB $0xf4, (AX) MOVW SI, 1(AX) ADDQ $0x03, BX ADDQ $0x03, AX JMP memmove_long_standalone two_bytes_standalone: /* With a 2-byte header the short copy is used only while SI is below 64; longer literals take the long copy. */ MOVB $0xf0, (AX) MOVB SI, 1(AX) ADDQ $0x02, BX ADDQ $0x02, AX CMPL SI, $0x40 JB memmove_standalone JMP memmove_long_standalone one_byte_standalone: SHLB $0x02, SI MOVB SI, (AX) ADDQ $0x01, BX ADDQ $0x01, AX memmove_standalone: // genMemMoveShort /* Short copy, dispatched on DX into the classes 1-2, 3, 4-7, 8-16, 17-32 and 33-64 bytes; each class loads from the start and from the end of lit, so the two stores may overlap and no byte loop is needed. */ CMPQ DX, $0x03 JB emit_lit_memmove_standalone_memmove_move_1or2 JE emit_lit_memmove_standalone_memmove_move_3 CMPQ DX, $0x08 JB emit_lit_memmove_standalone_memmove_move_4through7 CMPQ DX, $0x10 JBE emit_lit_memmove_standalone_memmove_move_8through16 CMPQ DX, $0x20 JBE emit_lit_memmove_standalone_memmove_move_17through32 JMP emit_lit_memmove_standalone_memmove_move_33through64 emit_lit_memmove_standalone_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(DX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(DX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(DX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(DX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DX*1) MOVOU X3,
-16(AX)(DX*1) JMP emit_literal_end_standalone JMP emit_literal_end_standalone memmove_long_standalone: // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 MOVQ DX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_standalonelarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_standalonelarge_big_loop_back emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ DX, R8 JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DX*1) MOVOU X3, -16(AX)(DX*1) JMP emit_literal_end_standalone JMP emit_literal_end_standalone emit_literal_end_standalone_skip: XORQ BX, BX emit_literal_end_standalone: MOVQ BX, ret+48(FP) RET // func emitRepeat(dst []byte, offset int, length int) int TEXT ·emitRepeat(SB), NOSPLIT, $0-48 XORQ BX, BX MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX // emitRepeat emit_repeat_again_standalone: MOVL DX, SI LEAL -4(DX), DX CMPL SI, $0x08 JBE repeat_two_standalone CMPL SI, $0x0c JAE cant_repeat_two_offset_standalone CMPL CX, $0x00000800 JB repeat_two_offset_standalone cant_repeat_two_offset_standalone: CMPL DX, $0x00000104 JB repeat_three_standalone CMPL DX, $0x00010100 JB repeat_four_standalone CMPL DX, $0x0100ffff JB repeat_five_standalone LEAL -16842747(DX), DX MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX ADDQ $0x05, BX JMP emit_repeat_again_standalone repeat_five_standalone: LEAL -65536(DX), DX MOVL DX, CX MOVW $0x001d, (AX) MOVW DX, 2(AX) SARL $0x10, CX MOVB CL, 4(AX) ADDQ $0x05, BX ADDQ $0x05, AX JMP gen_emit_repeat_end 
// Continuation of ·emitRepeat: the remaining encodings and the epilogue.
// On entry to each label AX = dst write pointer, CX = offset,
// DX = length-4, BX = bytes written so far.
repeat_four_standalone:
	// 4 bytes: word 0x0019, then (length-4-256) as 16 bits.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_repeat_end

repeat_three_standalone:
	// 3 bytes: word 0x0015, then (length-4-4) as 8 bits.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_repeat_end

repeat_two_standalone:
	// 2 bytes: 16-bit word ((length-4)<<2)|1.
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_repeat_end

repeat_two_offset_standalone:
	// 2 bytes: byte0 = ((length-4)<<2)|1 | (offset>>8)<<5, byte1 = low
	// byte of offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy writes a copy operation for (offset, length) to dst and returns
// the number of bytes written. When length exceeds what one copy can hold,
// the first copy is followed by repeat operations (inlined emitRepeat
// sequences) for the remainder.
// Registers: AX = dst write pointer, CX = offset, DX = remaining length,
// BX = bytes written so far.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// Offsets below 65536 use the 1- or 2-byte offset forms.
	CMPL CX, $0x00010000
	JB two_byte_offset_standalone
	CMPL DX, $0x40
	JBE four_bytes_remain_standalone
	// length > 64 with a 4-byte offset: write tag 0xff + 32-bit offset and
	// take 64 off the remaining length.
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	// Fewer than 4 bytes left cannot be a repeat; emit them as a copy.
	CMPL DX, $0x04
	JB four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

four_bytes_remain_standalone:
	// Final copy with a 4-byte offset: nothing to do when length is 0,
	// otherwise tag = length*4 - 1 followed by the 32-bit offset.
	TESTL DX, DX
	JZ gen_emit_copy_end
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

two_byte_offset_standalone:
	CMPL DX, $0x40
	JBE two_byte_offset_short_standalone
	CMPL CX, $0x00000800
	JAE long_offset_short_standalone
	// length > 64 and offset < 2048: write a 2-byte copy whose tag is
	// 0x11 | (offset>>8)<<5 with the low offset byte after it, and take 8
	// off the remaining length.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	// Enter the repeat encoder past its 2-byte forms, with DX already
	// reduced by 4 as those later stages expect.
	LEAL -4(DX), DX
	JMP cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short_2b
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

long_offset_short_standalone:
	// length > 64 and 2048 <= offset < 65536: write tag 0xee + 16-bit
	// offset, take 60 off the remaining length, then encode the rest as
	// repeats.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64 and offset < 65536. SI = length<<2. The 2-byte form is
	// used only when length < 12 and offset < 2048.
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE emit_copy_three_standalone
	CMPL CX, $0x00000800
	JAE emit_copy_three_standalone
	// 2 bytes: byte0 = (length<<2 - 15) | (offset>>8)<<5, byte1 = low
	// byte of offset.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

emit_copy_three_standalone:
	// 3 bytes: byte0 = length<<2 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes copy operations for (offset, length) to dst and
// returns the number of bytes written. Unlike emitCopy it never emits a
// repeat: long lengths are split into several plain copies by looping.
// Registers: AX = dst write pointer, CX = offset, DX = remaining length,
// BX = bytes written so far.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JB two_byte_offset_standalone_snappy

four_bytes_loop_back_standalone_snappy:
	// 4-byte offsets: emit tag 0xff + 32-bit offset and take 64 off the
	// length for as long as more than 64 bytes remain.
	CMPL DX, $0x40
	JBE four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB four_bytes_remain_standalone_snappy
	JMP four_bytes_loop_back_standalone_snappy

four_bytes_remain_standalone_snappy:
	// Final 4-byte-offset copy: skipped when length is 0, otherwise
	// tag = length*4 - 1 followed by the 32-bit offset.
	TESTL DX, DX
	JZ gen_emit_copy_end_snappy
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end_snappy

two_byte_offset_standalone_snappy:
	// offset < 65536: while length > 64, emit tag 0xee + 16-bit offset and
	// take 60 off the length.
	CMPL DX, $0x40
	JBE two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP two_byte_offset_standalone_snappy

two_byte_offset_short_standalone_snappy:
	// length <= 64: 2-byte form when length < 12 and offset < 2048,
	// otherwise the 3-byte form.
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JAE emit_copy_three_standalone_snappy
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end_snappy

emit_copy_three_standalone_snappy:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end_snappy:
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// matchLen returns the number of leading bytes that are equal in a and b.
// Only len(a) is read; b's length is not loaded, so b must be at least as
// long as a.
// Registers: AX = a, CX = b, DX = bytes left to compare, SI = bytes matched.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI

matchlen_loopback_16_standalone:
	// Compare 16 bytes per iteration as two 8-byte XORs; a non-zero result
	// marks the word holding the first difference.
	CMPL DX, $0x10
	JB matchlen_match8_standalone
	MOVQ (AX)(SI*1), BX
	MOVQ 8(AX)(SI*1), DI
	XORQ (CX)(SI*1), BX
	JNZ matchlen_bsf_8_standalone
	XORQ 8(CX)(SI*1), DI
	JNZ matchlen_bsf_16standalone
	LEAL -16(DX), DX
	LEAL 16(SI), SI
	JMP matchlen_loopback_16_standalone

matchlen_bsf_16standalone:
	// Difference is in the second word: lowest set bit / 8 is the byte
	// index within that word, plus 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ DI, DI

#else
	BSFQ DI, DI

#endif
	SARQ $0x03, DI
	LEAL 8(SI)(DI*1), SI
	JMP gen_match_len_end

matchlen_match8_standalone:
	CMPL DX, $0x08
	JB matchlen_match4_standalone
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ matchlen_bsf_8_standalone
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	JMP matchlen_match4_standalone

matchlen_bsf_8_standalone:
	// Difference is in the word held in BX: lowest set bit / 8 is the
	// number of equal bytes in that word.
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP gen_match_len_end

matchlen_match4_standalone:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DX, $0x04
	JB matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE matchlen_match2_standalone
	LEAL -4(DX), DX
	LEAL 4(SI), SI

matchlen_match2_standalone:
	// Exactly 1 byte left goes to the byte compare; 0 left ends.
	CMPL DX, $0x01
	JE matchlen_match1_standalone
	JB gen_match_len_end
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE matchlen_match1_standalone
	LEAL 2(SI), SI
	SUBL $0x02, DX
	JZ gen_match_len_end

matchlen_match1_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Prologue and token decode of cvtLZ4BlockAsm; the rest of the function
// follows this span.
// Registers set up here: AX = dst write pointer, CX = dst_base + dst_len - 8
// (write limit), DX = src read pointer, BX = src_base + src_len (read
// limit), SI and DI are zeroed.
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX
	XORQ DI, DI

lz4_s2_loop:
	// Leave through the corrupt / dstfull labels when the read or write
	// pointer has reached its limit.
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ AX, CX
	JAE lz4_s2_dstfull
	// Token byte: high nibble -> R9, low nibble -> R10.
	MOVBQZX (DX), R8
	MOVQ R8, R9
	MOVQ R8, R10
	SHRQ $0x04, R9
	ANDQ $0x0f, R10
	// A high nibble of 15 (token >= 0xf0) means extension bytes follow.
	CMPQ R8, $0xf0
	JB lz4_s2_ll_end

lz4_s2_ll_loop:
	INCQ DX
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	MOVBQZX (DX), R8
ADDQ R8, R9 CMPQ R8, $0xff JEQ lz4_s2_ll_loop lz4_s2_ll_end: LEAQ (DX)(R9*1), R8 ADDQ $0x04, R10 CMPQ R8, BX JAE lz4_s2_corrupt INCQ DX INCQ R8 TESTQ R9, R9 JZ lz4_s2_lits_done LEAQ (AX)(R9*1), R11 CMPQ R11, CX JAE lz4_s2_dstfull ADDQ R9, SI LEAL -1(R9), R11 CMPL R11, $0x3c JB one_byte_lz4_s2 CMPL R11, $0x00000100 JB two_bytes_lz4_s2 CMPL R11, $0x00010000 JB three_bytes_lz4_s2 CMPL R11, $0x01000000 JB four_bytes_lz4_s2 MOVB $0xfc, (AX) MOVL R11, 1(AX) ADDQ $0x05, AX JMP memmove_long_lz4_s2 four_bytes_lz4_s2: MOVL R11, R12 SHRL $0x10, R12 MOVB $0xf8, (AX) MOVW R11, 1(AX) MOVB R12, 3(AX) ADDQ $0x04, AX JMP memmove_long_lz4_s2 three_bytes_lz4_s2: MOVB $0xf4, (AX) MOVW R11, 1(AX) ADDQ $0x03, AX JMP memmove_long_lz4_s2 two_bytes_lz4_s2: MOVB $0xf0, (AX) MOVB R11, 1(AX) ADDQ $0x02, AX CMPL R11, $0x40 JB memmove_lz4_s2 JMP memmove_long_lz4_s2 one_byte_lz4_s2: SHLB $0x02, R11 MOVB R11, (AX) ADDQ $0x01, AX memmove_lz4_s2: LEAQ (AX)(R9*1), R11 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_lz4_s2_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 emit_lit_memmove_lz4_s2_memmove_move_8: MOVQ (DX), R12 MOVQ R12, (AX) JMP memmove_end_copy_lz4_s2 emit_lit_memmove_lz4_s2_memmove_move_8through16: MOVQ (DX), R12 MOVQ -8(DX)(R9*1), DX MOVQ R12, (AX) MOVQ DX, -8(AX)(R9*1) JMP memmove_end_copy_lz4_s2 emit_lit_memmove_lz4_s2_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(R9*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_lz4_s2 emit_lit_memmove_lz4_s2_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(R9*1), X2 MOVOU -16(DX)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R9*1) MOVOU X3, -16(AX)(R9*1) memmove_end_copy_lz4_s2: MOVQ R11, AX JMP lz4_s2_lits_emit_done memmove_long_lz4_s2: LEAQ (AX)(R9*1), R11 // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU 
-32(DX)(R9*1), X2 MOVOU -16(DX)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ AX, R12 ANDL $0x0000001f, R12 MOVQ $0x00000040, R14 SUBQ R12, R14 DECQ R13 JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 LEAQ -32(DX)(R14*1), R12 LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_lz4_s2large_big_loop_back: MOVOU (R12), X4 MOVOU 16(R12), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R12 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_lz4_s2large_big_loop_back emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: MOVOU -32(DX)(R14*1), X4 MOVOU -16(DX)(R14*1), X5 MOVOA X4, -32(AX)(R14*1) MOVOA X5, -16(AX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R9*1) MOVOU X3, -16(AX)(R9*1) MOVQ R11, AX lz4_s2_lits_emit_done: MOVQ R8, DX lz4_s2_lits_done: CMPQ DX, BX JNE lz4_s2_match CMPQ R10, $0x04 JEQ lz4_s2_done JMP lz4_s2_corrupt lz4_s2_match: LEAQ 2(DX), R8 CMPQ R8, BX JAE lz4_s2_corrupt MOVWQZX (DX), R9 MOVQ R8, DX TESTQ R9, R9 JZ lz4_s2_corrupt CMPQ R9, SI JA lz4_s2_corrupt CMPQ R10, $0x13 JNE lz4_s2_ml_done lz4_s2_ml_loop: MOVBQZX (DX), R8 INCQ DX ADDQ R8, R10 CMPQ DX, BX JAE lz4_s2_corrupt CMPQ R8, $0xff JEQ lz4_s2_ml_loop lz4_s2_ml_done: ADDQ R10, SI CMPQ R9, DI JNE lz4_s2_docopy // emitRepeat emit_repeat_again_lz4_s2: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2 CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2 CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2 cant_repeat_two_offset_lz4_s2: CMPL R10, $0x00000104 JB repeat_three_lz4_s2 CMPL R10, $0x00010100 JB repeat_four_lz4_s2 CMPL R10, $0x0100ffff JB repeat_five_lz4_s2 LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2 repeat_five_lz4_s2: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4_s2_loop repeat_four_lz4_s2: LEAL -256(R10), R10 MOVW $0x0019, 
(AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4_s2_loop repeat_three_lz4_s2: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4_s2_loop repeat_two_lz4_s2: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop repeat_two_offset_lz4_s2: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop lz4_s2_docopy: MOVQ R9, DI // emitCopy CMPL R10, $0x40 JBE two_byte_offset_short_lz4_s2 CMPL R9, $0x00000800 JAE long_offset_short_lz4_s2 MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB R9, 1(AX) MOVL R9, R11 SHRL $0x08, R11 SHLL $0x05, R11 ORL R11, R8 MOVB R8, (AX) ADDQ $0x02, AX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b emit_repeat_again_lz4_s2_emit_copy_short_2b: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2_emit_copy_short_2b CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2_emit_copy_short_2b cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_lz4_s2_emit_copy_short_2b CMPL R10, $0x00010100 JB repeat_four_lz4_s2_emit_copy_short_2b CMPL R10, $0x0100ffff JB repeat_five_lz4_s2_emit_copy_short_2b LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2_emit_copy_short_2b repeat_five_lz4_s2_emit_copy_short_2b: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4_s2_loop repeat_four_lz4_s2_emit_copy_short_2b: LEAL -256(R10), R10 MOVW $0x0019, (AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4_s2_loop repeat_three_lz4_s2_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4_s2_loop repeat_two_lz4_s2_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop 
repeat_two_offset_lz4_s2_emit_copy_short_2b: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop long_offset_short_lz4_s2: MOVB $0xee, (AX) MOVW R9, 1(AX) LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat emit_repeat_again_lz4_s2_emit_copy_short: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2_emit_copy_short CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2_emit_copy_short CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2_emit_copy_short cant_repeat_two_offset_lz4_s2_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_lz4_s2_emit_copy_short CMPL R10, $0x00010100 JB repeat_four_lz4_s2_emit_copy_short CMPL R10, $0x0100ffff JB repeat_five_lz4_s2_emit_copy_short LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2_emit_copy_short repeat_five_lz4_s2_emit_copy_short: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4_s2_loop repeat_four_lz4_s2_emit_copy_short: LEAL -256(R10), R10 MOVW $0x0019, (AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4_s2_loop repeat_three_lz4_s2_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4_s2_loop repeat_two_lz4_s2_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop repeat_two_offset_lz4_s2_emit_copy_short: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4_s2_loop two_byte_offset_short_lz4_s2: MOVL R10, R8 SHLL $0x02, R8 CMPL R10, $0x0c JAE emit_copy_three_lz4_s2 CMPL R9, $0x00000800 JAE emit_copy_three_lz4_s2 LEAL -15(R8), R8 MOVB R9, 1(AX) SHRL $0x08, R9 SHLL $0x05, R9 ORL R9, R8 MOVB R8, (AX) ADDQ $0x02, AX JMP lz4_s2_loop emit_copy_three_lz4_s2: LEAL -2(R8), R8 MOVB R8, (AX) MOVW R9, 1(AX) ADDQ $0x03, AX JMP lz4_s2_loop lz4_s2_done: MOVQ 
dst_base+0(FP), CX SUBQ CX, AX MOVQ SI, uncompressed+48(FP) MOVQ AX, dstUsed+56(FP) RET lz4_s2_corrupt: XORQ AX, AX LEAQ -1(AX), SI MOVQ SI, uncompressed+48(FP) RET lz4_s2_dstfull: XORQ AX, AX LEAQ -2(AX), SI MOVQ SI, uncompressed+48(FP) RET // func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) // Requires: SSE2 TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 XORQ SI, SI MOVQ dst_base+0(FP), AX MOVQ dst_len+8(FP), CX MOVQ src_base+24(FP), DX MOVQ src_len+32(FP), BX LEAQ (DX)(BX*1), BX LEAQ -8(AX)(CX*1), CX XORQ DI, DI lz4s_s2_loop: CMPQ DX, BX JAE lz4s_s2_corrupt CMPQ AX, CX JAE lz4s_s2_dstfull MOVBQZX (DX), R8 MOVQ R8, R9 MOVQ R8, R10 SHRQ $0x04, R9 ANDQ $0x0f, R10 CMPQ R8, $0xf0 JB lz4s_s2_ll_end lz4s_s2_ll_loop: INCQ DX CMPQ DX, BX JAE lz4s_s2_corrupt MOVBQZX (DX), R8 ADDQ R8, R9 CMPQ R8, $0xff JEQ lz4s_s2_ll_loop lz4s_s2_ll_end: LEAQ (DX)(R9*1), R8 ADDQ $0x03, R10 CMPQ R8, BX JAE lz4s_s2_corrupt INCQ DX INCQ R8 TESTQ R9, R9 JZ lz4s_s2_lits_done LEAQ (AX)(R9*1), R11 CMPQ R11, CX JAE lz4s_s2_dstfull ADDQ R9, SI LEAL -1(R9), R11 CMPL R11, $0x3c JB one_byte_lz4s_s2 CMPL R11, $0x00000100 JB two_bytes_lz4s_s2 CMPL R11, $0x00010000 JB three_bytes_lz4s_s2 CMPL R11, $0x01000000 JB four_bytes_lz4s_s2 MOVB $0xfc, (AX) MOVL R11, 1(AX) ADDQ $0x05, AX JMP memmove_long_lz4s_s2 four_bytes_lz4s_s2: MOVL R11, R12 SHRL $0x10, R12 MOVB $0xf8, (AX) MOVW R11, 1(AX) MOVB R12, 3(AX) ADDQ $0x04, AX JMP memmove_long_lz4s_s2 three_bytes_lz4s_s2: MOVB $0xf4, (AX) MOVW R11, 1(AX) ADDQ $0x03, AX JMP memmove_long_lz4s_s2 two_bytes_lz4s_s2: MOVB $0xf0, (AX) MOVB R11, 1(AX) ADDQ $0x02, AX CMPL R11, $0x40 JB memmove_lz4s_s2 JMP memmove_long_lz4s_s2 one_byte_lz4s_s2: SHLB $0x02, R11 MOVB R11, (AX) ADDQ $0x01, AX memmove_lz4s_s2: LEAQ (AX)(R9*1), R11 // genMemMoveShort CMPQ R9, $0x08 JBE emit_lit_memmove_lz4s_s2_memmove_move_8 CMPQ R9, $0x10 JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 CMPQ R9, $0x20 JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 JMP 
emit_lit_memmove_lz4s_s2_memmove_move_33through64 emit_lit_memmove_lz4s_s2_memmove_move_8: MOVQ (DX), R12 MOVQ R12, (AX) JMP memmove_end_copy_lz4s_s2 emit_lit_memmove_lz4s_s2_memmove_move_8through16: MOVQ (DX), R12 MOVQ -8(DX)(R9*1), DX MOVQ R12, (AX) MOVQ DX, -8(AX)(R9*1) JMP memmove_end_copy_lz4s_s2 emit_lit_memmove_lz4s_s2_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(R9*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R9*1) JMP memmove_end_copy_lz4s_s2 emit_lit_memmove_lz4s_s2_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(R9*1), X2 MOVOU -16(DX)(R9*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R9*1) MOVOU X3, -16(AX)(R9*1) memmove_end_copy_lz4s_s2: MOVQ R11, AX JMP lz4s_s2_lits_emit_done memmove_long_lz4s_s2: LEAQ (AX)(R9*1), R11 // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(R9*1), X2 MOVOU -16(DX)(R9*1), X3 MOVQ R9, R13 SHRQ $0x05, R13 MOVQ AX, R12 ANDL $0x0000001f, R12 MOVQ $0x00000040, R14 SUBQ R12, R14 DECQ R13 JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 LEAQ -32(DX)(R14*1), R12 LEAQ -32(AX)(R14*1), R15 emit_lit_memmove_long_lz4s_s2large_big_loop_back: MOVOU (R12), X4 MOVOU 16(R12), X5 MOVOA X4, (R15) MOVOA X5, 16(R15) ADDQ $0x20, R15 ADDQ $0x20, R12 ADDQ $0x20, R14 DECQ R13 JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: MOVOU -32(DX)(R14*1), X4 MOVOU -16(DX)(R14*1), X5 MOVOA X4, -32(AX)(R14*1) MOVOA X5, -16(AX)(R14*1) ADDQ $0x20, R14 CMPQ R9, R14 JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R9*1) MOVOU X3, -16(AX)(R9*1) MOVQ R11, AX lz4s_s2_lits_emit_done: MOVQ R8, DX lz4s_s2_lits_done: CMPQ DX, BX JNE lz4s_s2_match CMPQ R10, $0x03 JEQ lz4s_s2_done JMP lz4s_s2_corrupt lz4s_s2_match: CMPQ R10, $0x03 JEQ lz4s_s2_loop LEAQ 2(DX), R8 CMPQ R8, BX JAE lz4s_s2_corrupt MOVWQZX (DX), R9 MOVQ R8, DX TESTQ R9, R9 JZ lz4s_s2_corrupt CMPQ R9, SI JA lz4s_s2_corrupt CMPQ R10, 
$0x12 JNE lz4s_s2_ml_done lz4s_s2_ml_loop: MOVBQZX (DX), R8 INCQ DX ADDQ R8, R10 CMPQ DX, BX JAE lz4s_s2_corrupt CMPQ R8, $0xff JEQ lz4s_s2_ml_loop lz4s_s2_ml_done: ADDQ R10, SI CMPQ R9, DI JNE lz4s_s2_docopy // emitRepeat emit_repeat_again_lz4_s2: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2 CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2 CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2 cant_repeat_two_offset_lz4_s2: CMPL R10, $0x00000104 JB repeat_three_lz4_s2 CMPL R10, $0x00010100 JB repeat_four_lz4_s2 CMPL R10, $0x0100ffff JB repeat_five_lz4_s2 LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2 repeat_five_lz4_s2: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4s_s2_loop repeat_four_lz4_s2: LEAL -256(R10), R10 MOVW $0x0019, (AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4s_s2_loop repeat_three_lz4_s2: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4s_s2_loop repeat_two_lz4_s2: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop repeat_two_offset_lz4_s2: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop lz4s_s2_docopy: MOVQ R9, DI // emitCopy CMPL R10, $0x40 JBE two_byte_offset_short_lz4_s2 CMPL R9, $0x00000800 JAE long_offset_short_lz4_s2 MOVL $0x00000001, R8 LEAL 16(R8), R8 MOVB R9, 1(AX) MOVL R9, R11 SHRL $0x08, R11 SHLL $0x05, R11 ORL R11, R8 MOVB R8, (AX) ADDQ $0x02, AX SUBL $0x08, R10 // emitRepeat LEAL -4(R10), R10 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b emit_repeat_again_lz4_s2_emit_copy_short_2b: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2_emit_copy_short_2b CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2_emit_copy_short_2b 
cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: CMPL R10, $0x00000104 JB repeat_three_lz4_s2_emit_copy_short_2b CMPL R10, $0x00010100 JB repeat_four_lz4_s2_emit_copy_short_2b CMPL R10, $0x0100ffff JB repeat_five_lz4_s2_emit_copy_short_2b LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2_emit_copy_short_2b repeat_five_lz4_s2_emit_copy_short_2b: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4s_s2_loop repeat_four_lz4_s2_emit_copy_short_2b: LEAL -256(R10), R10 MOVW $0x0019, (AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4s_s2_loop repeat_three_lz4_s2_emit_copy_short_2b: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4s_s2_loop repeat_two_lz4_s2_emit_copy_short_2b: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop repeat_two_offset_lz4_s2_emit_copy_short_2b: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop long_offset_short_lz4_s2: MOVB $0xee, (AX) MOVW R9, 1(AX) LEAL -60(R10), R10 ADDQ $0x03, AX // emitRepeat emit_repeat_again_lz4_s2_emit_copy_short: MOVL R10, R8 LEAL -4(R10), R10 CMPL R8, $0x08 JBE repeat_two_lz4_s2_emit_copy_short CMPL R8, $0x0c JAE cant_repeat_two_offset_lz4_s2_emit_copy_short CMPL R9, $0x00000800 JB repeat_two_offset_lz4_s2_emit_copy_short cant_repeat_two_offset_lz4_s2_emit_copy_short: CMPL R10, $0x00000104 JB repeat_three_lz4_s2_emit_copy_short CMPL R10, $0x00010100 JB repeat_four_lz4_s2_emit_copy_short CMPL R10, $0x0100ffff JB repeat_five_lz4_s2_emit_copy_short LEAL -16842747(R10), R10 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_lz4_s2_emit_copy_short repeat_five_lz4_s2_emit_copy_short: LEAL -65536(R10), R10 MOVL R10, R9 MOVW $0x001d, (AX) MOVW R10, 2(AX) SARL $0x10, R9 MOVB R9, 4(AX) ADDQ $0x05, AX JMP lz4s_s2_loop 
repeat_four_lz4_s2_emit_copy_short: LEAL -256(R10), R10 MOVW $0x0019, (AX) MOVW R10, 2(AX) ADDQ $0x04, AX JMP lz4s_s2_loop repeat_three_lz4_s2_emit_copy_short: LEAL -4(R10), R10 MOVW $0x0015, (AX) MOVB R10, 2(AX) ADDQ $0x03, AX JMP lz4s_s2_loop repeat_two_lz4_s2_emit_copy_short: SHLL $0x02, R10 ORL $0x01, R10 MOVW R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop repeat_two_offset_lz4_s2_emit_copy_short: XORQ R8, R8 LEAL 1(R8)(R10*4), R10 MOVB R9, 1(AX) SARL $0x08, R9 SHLL $0x05, R9 ORL R9, R10 MOVB R10, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop two_byte_offset_short_lz4_s2: MOVL R10, R8 SHLL $0x02, R8 CMPL R10, $0x0c JAE emit_copy_three_lz4_s2 CMPL R9, $0x00000800 JAE emit_copy_three_lz4_s2 LEAL -15(R8), R8 MOVB R9, 1(AX) SHRL $0x08, R9 SHLL $0x05, R9 ORL R9, R8 MOVB R8, (AX) ADDQ $0x02, AX JMP lz4s_s2_loop emit_copy_three_lz4_s2: LEAL -2(R8), R8 MOVB R8, (AX) MOVW R9, 1(AX) ADDQ $0x03, AX JMP lz4s_s2_loop lz4s_s2_done: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ SI, uncompressed+48(FP) MOVQ AX, dstUsed+56(FP) RET lz4s_s2_corrupt: XORQ AX, AX LEAQ -1(AX), SI MOVQ SI, uncompressed+48(FP) RET lz4s_s2_dstfull: XORQ AX, AX LEAQ -2(AX), SI MOVQ SI, uncompressed+48(FP) RET // func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) // Requires: SSE2 TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 XORQ SI, SI MOVQ dst_base+0(FP), AX MOVQ dst_len+8(FP), CX MOVQ src_base+24(FP), DX MOVQ src_len+32(FP), BX LEAQ (DX)(BX*1), BX LEAQ -8(AX)(CX*1), CX lz4_snappy_loop: CMPQ DX, BX JAE lz4_snappy_corrupt CMPQ AX, CX JAE lz4_snappy_dstfull MOVBQZX (DX), DI MOVQ DI, R8 MOVQ DI, R9 SHRQ $0x04, R8 ANDQ $0x0f, R9 CMPQ DI, $0xf0 JB lz4_snappy_ll_end lz4_snappy_ll_loop: INCQ DX CMPQ DX, BX JAE lz4_snappy_corrupt MOVBQZX (DX), DI ADDQ DI, R8 CMPQ DI, $0xff JEQ lz4_snappy_ll_loop lz4_snappy_ll_end: LEAQ (DX)(R8*1), DI ADDQ $0x04, R9 CMPQ DI, BX JAE lz4_snappy_corrupt INCQ DX INCQ DI TESTQ R8, R8 JZ lz4_snappy_lits_done LEAQ (AX)(R8*1), R10 CMPQ R10, CX JAE lz4_snappy_dstfull ADDQ 
R8, SI LEAL -1(R8), R10 CMPL R10, $0x3c JB one_byte_lz4_snappy CMPL R10, $0x00000100 JB two_bytes_lz4_snappy CMPL R10, $0x00010000 JB three_bytes_lz4_snappy CMPL R10, $0x01000000 JB four_bytes_lz4_snappy MOVB $0xfc, (AX) MOVL R10, 1(AX) ADDQ $0x05, AX JMP memmove_long_lz4_snappy four_bytes_lz4_snappy: MOVL R10, R11 SHRL $0x10, R11 MOVB $0xf8, (AX) MOVW R10, 1(AX) MOVB R11, 3(AX) ADDQ $0x04, AX JMP memmove_long_lz4_snappy three_bytes_lz4_snappy: MOVB $0xf4, (AX) MOVW R10, 1(AX) ADDQ $0x03, AX JMP memmove_long_lz4_snappy two_bytes_lz4_snappy: MOVB $0xf0, (AX) MOVB R10, 1(AX) ADDQ $0x02, AX CMPL R10, $0x40 JB memmove_lz4_snappy JMP memmove_long_lz4_snappy one_byte_lz4_snappy: SHLB $0x02, R10 MOVB R10, (AX) ADDQ $0x01, AX memmove_lz4_snappy: LEAQ (AX)(R8*1), R10 // genMemMoveShort CMPQ R8, $0x08 JBE emit_lit_memmove_lz4_snappy_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32 JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 emit_lit_memmove_lz4_snappy_memmove_move_8: MOVQ (DX), R11 MOVQ R11, (AX) JMP memmove_end_copy_lz4_snappy emit_lit_memmove_lz4_snappy_memmove_move_8through16: MOVQ (DX), R11 MOVQ -8(DX)(R8*1), DX MOVQ R11, (AX) MOVQ DX, -8(AX)(R8*1) JMP memmove_end_copy_lz4_snappy emit_lit_memmove_lz4_snappy_memmove_move_17through32: MOVOU (DX), X0 MOVOU -16(DX)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_lz4_snappy emit_lit_memmove_lz4_snappy_memmove_move_33through64: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(R8*1), X2 MOVOU -16(DX)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_lz4_snappy: MOVQ R10, AX JMP lz4_snappy_lits_emit_done memmove_long_lz4_snappy: LEAQ (AX)(R8*1), R10 // genMemMoveLong MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU -32(DX)(R8*1), X2 MOVOU -16(DX)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R11 ANDL $0x0000001f, R11 MOVQ $0x00000040, R13 SUBQ 
R11, R13 DECQ R12 JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 LEAQ -32(DX)(R13*1), R11 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_lz4_snappylarge_big_loop_back: MOVOU (R11), X4 MOVOU 16(R11), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R11 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: MOVOU -32(DX)(R13*1), X4 MOVOU -16(DX)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ R10, AX lz4_snappy_lits_emit_done: MOVQ DI, DX lz4_snappy_lits_done: CMPQ DX, BX JNE lz4_snappy_match CMPQ R9, $0x04 JEQ lz4_snappy_done JMP lz4_snappy_corrupt lz4_snappy_match: LEAQ 2(DX), DI CMPQ DI, BX JAE lz4_snappy_corrupt MOVWQZX (DX), R8 MOVQ DI, DX TESTQ R8, R8 JZ lz4_snappy_corrupt CMPQ R8, SI JA lz4_snappy_corrupt CMPQ R9, $0x13 JNE lz4_snappy_ml_done lz4_snappy_ml_loop: MOVBQZX (DX), DI INCQ DX ADDQ DI, R9 CMPQ DX, BX JAE lz4_snappy_corrupt CMPQ DI, $0xff JEQ lz4_snappy_ml_loop lz4_snappy_ml_done: ADDQ R9, SI // emitCopy two_byte_offset_lz4_s2: CMPL R9, $0x40 JBE two_byte_offset_short_lz4_s2 MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX CMPQ AX, CX JAE lz4_snappy_loop JMP two_byte_offset_lz4_s2 two_byte_offset_short_lz4_s2: MOVL R9, DI SHLL $0x02, DI CMPL R9, $0x0c JAE emit_copy_three_lz4_s2 CMPL R8, $0x00000800 JAE emit_copy_three_lz4_s2 LEAL -15(DI), DI MOVB R8, 1(AX) SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, DI MOVB DI, (AX) ADDQ $0x02, AX JMP lz4_snappy_loop emit_copy_three_lz4_s2: LEAL -2(DI), DI MOVB DI, (AX) MOVW R8, 1(AX) ADDQ $0x03, AX JMP lz4_snappy_loop lz4_snappy_done: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ SI, uncompressed+48(FP) MOVQ AX, dstUsed+56(FP) RET lz4_snappy_corrupt: XORQ AX, AX LEAQ -1(AX), SI MOVQ SI, uncompressed+48(FP) RET 
lz4_snappy_dstfull:
	// Final label of the preceding routine (its beginning is outside this
	// excerpt): stores -2 in the uncompressed result and returns.
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Walks src one token at a time. Each token byte carries a literal length in
// its high nibble and a match length in its low nibble; literals are copied
// into dst behind a length header, and matches are re-emitted as copy
// elements built from the 16-bit offset that follows the literals.
//
// Results:
//   - success: uncompressed = sum of all literal and match lengths,
//     dstUsed = number of bytes written to dst.
//   - uncompressed = -1 on the paths labelled "corrupt" (input bounds or
//     offset checks failed); dstUsed is not written on this path.
//   - uncompressed = -2 on the paths labelled "dstfull" (output limit
//     reached); dstUsed is not written on this path.
//
// Register roles for the whole routine:
//   AX = dst write pointer       CX = dst limit (dst_base + dst_len - 8)
//   DX = src read pointer        BX = src end (src_base + src_len)
//   SI = running uncompressed count
//   R8 = literal length, later the match offset
//   R9 = match length            DI, R10-R14, X0-X5 = scratch
TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	// BX = one past the last src byte; CX = dst end minus 8 bytes of slack.
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX

lz4s_snappy_loop:
	// Top of the per-token loop: bail out if src is exhausted or dst has
	// reached its limit.
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	CMPQ AX, CX
	JAE lz4s_snappy_dstfull
	// Split the token byte: R8 = high nibble, R9 = low nibble.
	MOVBQZX (DX), DI
	MOVQ DI, R8
	MOVQ DI, R9
	SHRQ $0x04, R8
	ANDQ $0x0f, R9
	// A high nibble below 15 (token < 0xf0) has no length extension bytes.
	CMPQ DI, $0xf0
	JB lz4s_snappy_ll_end

lz4s_snappy_ll_loop:
	// Add extension bytes to the literal length; a byte of 0xff means
	// another extension byte follows.
	INCQ DX
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ DI, R8
	CMPQ DI, $0xff
	JEQ lz4s_snappy_ll_loop

lz4s_snappy_ll_end:
	// DX still points at the last length byte consumed. DI = DX + literal
	// length must stay below the src end; after both increments DX is the
	// first literal byte and DI is one past the last literal byte.
	LEAQ (DX)(R8*1), DI
	// Bias the match-length nibble by 3.
	ADDQ $0x03, R9
	CMPQ DI, BX
	JAE lz4s_snappy_corrupt
	INCQ DX
	INCQ DI
	// No literals in this sequence: go straight to match handling.
	TESTQ R8, R8
	JZ lz4s_snappy_lits_done
	// The literals must end below the dst limit.
	LEAQ (AX)(R8*1), R10
	CMPQ R10, CX
	JAE lz4s_snappy_dstfull
	ADDQ R8, SI
	// Pick the literal header size from R10 = length - 1.
	LEAL -1(R8), R10
	CMPL R10, $0x3c
	JB one_byte_lz4s_snappy
	CMPL R10, $0x00000100
	JB two_bytes_lz4s_snappy
	CMPL R10, $0x00010000
	JB three_bytes_lz4s_snappy
	CMPL R10, $0x01000000
	JB four_bytes_lz4s_snappy
	// 5-byte header: tag 0xfc followed by the 32-bit value of length - 1.
	MOVB $0xfc, (AX)
	MOVL R10, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4s_snappy

four_bytes_lz4s_snappy:
	// 4-byte header: tag 0xf8 followed by the low 24 bits of length - 1
	// (16 bits via MOVW, then bits 16-23 via R11).
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4s_snappy

three_bytes_lz4s_snappy:
	// 3-byte header: tag 0xf4 followed by the low 16 bits of length - 1.
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4s_snappy

two_bytes_lz4s_snappy:
	// 2-byte header: tag 0xf0 followed by the low 8 bits of length - 1.
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 (length - 1 < 0x40) still use the short copy.
	CMPL R10, $0x40
	JB memmove_lz4s_snappy
	JMP memmove_long_lz4s_snappy

one_byte_lz4s_snappy:
	// 1-byte header: (length - 1) shifted left by 2.
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

memmove_lz4s_snappy:
	// R10 = dst position after the literals; installed into AX once the
	// copy is finished.
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	// Dispatch on the literal length R8 (at most 64 on this path).
	CMPQ R8, $0x08
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
	JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64

emit_lit_memmove_lz4s_snappy_memmove_move_8:
	// R8 <= 8: a single 8-byte load and store regardless of the exact length.
	// NOTE(review): this touches 8 bytes even when R8 < 8; presumably covered
	// by the 8-byte slack in CX and by the caller's src buffer - confirm.
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
	// Two 8-byte moves covering the first and last 8 bytes (they overlap when
	// R8 < 16). DX is reused as scratch here; it is reloaded from DI at
	// lz4s_snappy_lits_emit_done.
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
	// Two 16-byte moves covering the first and last 16 bytes.
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
	// Four 16-byte moves: the first 32 and the last 32 bytes.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4s_snappy:
	// Advance the dst pointer past the copied literals.
	MOVQ R10, AX
	JMP lz4s_snappy_lits_emit_done

memmove_long_lz4s_snappy:
	// R10 = dst position after the literals.
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// Load the first and last 32 source bytes up front; they are stored last,
	// with unaligned stores, after the aligned middle has been copied.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	// R12 = number of 32-byte chunks in the length.
	MOVQ R8, R12
	SHRQ $0x05, R12
	// R13 = 64 - (AX & 31), so that AX + R13 - 32 is 32-byte aligned and the
	// MOVOA stores below hit aligned addresses.
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	// NOTE(review): DECQ does not update CF, so this JA (and the JNA below)
	// also depends on the carry left by the preceding SUBQ/ADDQ. Confirm
	// against the generator before reordering anything here.
	DECQ R12
	JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	LEAQ -32(DX)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
	// Copy 32 bytes per iteration: unaligned loads from R11, aligned stores
	// to R14, advancing both pointers and the offset R13.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back

emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R13 - 32, repeating while
	// R8 >= R13.
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	// Store the saved head and tail blocks over the unaligned edges.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ R10, AX

lz4s_snappy_lits_emit_done:
	// Move the src pointer to the byte after the literals.
	MOVQ DI, DX

lz4s_snappy_lits_done:
	// If src ends exactly here, that is success only when the match nibble
	// was 0 (R9 still equals the bias of 3); otherwise the input is corrupt.
	CMPQ DX, BX
	JNE lz4s_snappy_match
	CMPQ R9, $0x03
	JEQ lz4s_snappy_done
	JMP lz4s_snappy_corrupt

lz4s_snappy_match:
	// A match nibble of 0 means this sequence has no match: no offset is
	// read and decoding continues with the next token.
	CMPQ R9, $0x03
	JEQ lz4s_snappy_loop
	// Read the 16-bit offset into R8 and step DX past it; the two offset
	// bytes must end before the src end.
	LEAQ 2(DX), DI
	CMPQ DI, BX
	JAE lz4s_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ DI, DX
	// Reject a zero offset and an offset larger than the bytes produced so far.
	TESTQ R8, R8
	JZ lz4s_snappy_corrupt
	CMPQ R8, SI
	JA lz4s_snappy_corrupt
	// 0x12 = 15 + 3: only a match nibble of 15 has extension bytes.
	CMPQ R9, $0x12
	JNE lz4s_snappy_ml_done

lz4s_snappy_ml_loop:
	// Add extension bytes to the match length; 0xff means another follows.
	MOVBQZX (DX), DI
	INCQ DX
	ADDQ DI, R9
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	CMPQ DI, $0xff
	JEQ lz4s_snappy_ml_loop

lz4s_snappy_ml_done:
	// Count the match bytes towards the uncompressed total.
	ADDQ R9, SI

	// emitCopy
two_byte_offset_lz4_s2:
	// While more than 64 bytes remain, emit a 3-byte element for 60 bytes:
	// tag 0xee (equal to (60<<2) - 2, the three-byte form used below)
	// followed by the 16-bit offset.
	CMPL R9, $0x40
	JBE two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	// If dst reached its limit, jump to the loop head; with AX >= CX its
	// checks return either corrupt or dstfull.
	CMPQ AX, CX
	JAE lz4s_snappy_loop
	JMP two_byte_offset_lz4_s2

two_byte_offset_short_lz4_s2:
	// DI = length << 2, the base for either tag form.
	MOVL R9, DI
	SHLL $0x02, DI
	// The 2-byte form is used only for length < 12 and offset < 0x800.
	CMPL R9, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE emit_copy_three_lz4_s2
	// 2-byte element: tag = (length<<2) - 15 with offset bits 8-10 placed in
	// tag bits 5-7; the second byte is the low 8 bits of the offset.
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP lz4s_snappy_loop

emit_copy_three_lz4_s2:
	// 3-byte element: tag = (length<<2) - 2 followed by the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP lz4s_snappy_loop

lz4s_snappy_done:
	// Success: dstUsed = AX - dst_base, uncompressed = SI.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4s_snappy_corrupt:
	// Failure: uncompressed = -1 (dstUsed is left unwritten).
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4s_snappy_dstfull:
	// Failure: uncompressed = -2 (dstUsed is left unwritten).
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET