// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm

#include "textflag.h"

// func _dummy_()
// Placeholder routine whose body only returns. The preprocessor lines placed
// inside it make GOAMD64_v4 imply GOAMD64_v3: when GOAMD64_v4 is defined and
// GOAMD64_v3 is not, GOAMD64_v3 is defined here, so the `#ifdef GOAMD64_v3`
// checks further down this file also hold for v4 builds.
TEXT ·_dummy_(SB), $0
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
	RET

// func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst using tmp as a hash table of 4-byte source positions.
// Returns (in ret+56(FP)) the number of bytes written to dst, or 0 when the
// output would reach the destination limit computed below.
//
// Frame: 24 bytes of locals, 64 bytes of arguments/results.
//   0(SP)  (8 bytes) destination limit pointer
//   8(SP)  (4 bytes) src_len - 8, the last position the search loop may probe
//   12(SP) (4 bytes) start of the literals not yet emitted
//   16(SP) (4 bytes) most recent match offset (starts at 1)
//   20(SP) (4 bytes) next search position, used when no candidate matches
//
// Register roles through the main loop:
//   AX = tmp (table base), BX = src base, CX = dst write pointer,
//   DX = current source position.
TEXT ·encodeBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x200 iterations of 0x80 bytes each clear all 65536 bytes of tmp.
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBlockAsm
	// No literals are pending yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	// Destination limit = dst_base + (src_len - 9 - src_len/32).
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Start searching at position 1 with an initial offset of 1.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBlockAsm:
	// Top of the search loop. DX = current position, BX = src base,
	// AX = hash table base.
	// Next position = DX + 4 + ((DX - 12(SP)) >> 6): the step grows with the
	// number of bytes since the pending-literal start.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(DX)(SI*1), SI
	// Stop searching once the next position reaches the limit in 8(SP).
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBlockAsm
	// DI = 8 source bytes at the current position; remember the next position.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// Hash = ((x << 16) * 0xcf1bbcdcbf9b) >> 50, a 14-bit table index.
	// R10 = hash of the bytes at DX, R11 = hash of the bytes at DX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	// Load both candidates (SI, R8), then store DX and DX+1 in their slots.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the bytes at DX+2, kept for the no-repeat path.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes located
	// 16(SP) (the stored offset) before them.
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm
	// Repeat found. DI = DX+1 (match start), R8 = pending-literal start,
	// SI = DI - offset (position of the earlier copy). If SI is 0 there is
	// nothing before it to extend into.
	LEAL  1(DX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm

repeat_extend_back_loop_encodeBlockAsm:
	// Extend the repeat backwards: while DI is above the pending-literal start
	// (R8) and the bytes just before DI and SI are equal, move both back by
	// one. The loop also ends when SI reaches 0.
	CMPL DI, R8
	JBE  repeat_extend_back_end_encodeBlockAsm
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeBlockAsm
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm

repeat_extend_back_end_encodeBlockAsm:
	// Space check: dst pointer + pending literal length + 5 must stay below
	// the destination limit in (SP); otherwise return 0.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 5(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBlockAsm:
	// Emit the pending literals src[12(SP):DI] ahead of the repeat.
	// Nothing to emit when the pending start already equals DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
	// R9 = literal length, R10 = literal source pointer, SI = length - 1.
	// 12(SP) is advanced to DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Choose the literal header size from length - 1.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm
	CMPL SI, $0x00010000
	JB   three_bytes_repeat_emit_encodeBlockAsm
	CMPL SI, $0x01000000
	JB   four_bytes_repeat_emit_encodeBlockAsm
	// 5-byte header: tag 0xfc followed by the 32-bit length - 1.
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

four_bytes_repeat_emit_encodeBlockAsm:
	// 4-byte header: tag 0xf8, low 16 bits, then bits 16..23 of length - 1.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

three_bytes_repeat_emit_encodeBlockAsm:
	// 3-byte header: tag 0xf4 followed by the 16-bit length - 1.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm

two_bytes_repeat_emit_encodeBlockAsm:
	// 2-byte header: tag 0xf0 followed by the 8-bit length - 1.
	// Lengths with length - 1 below 0x40 use the short copy routine.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm
	JMP  memmove_long_repeat_emit_encodeBlockAsm

one_byte_repeat_emit_encodeBlockAsm:
	// 1-byte header: (length - 1) << 2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeBlockAsm:
	// Copy R9 literal bytes from (R10) to (CX). SI = dst pointer after the
	// copy; CX is set to it once the bytes are written.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on length: <=8, <=16, <=32, otherwise up to 64 bytes.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only SI (the
	// true end) is committed to CX afterwards.
	// NOTE(review): presumably relies on slack in src/dst for the over-read
	// and over-write — confirm against the caller's buffer guarantees.
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: the first 8 and the last 8 bytes.
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: the first 16 and the last 16.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
	// Four 16-byte moves: the first 32 and the last 32 bytes.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm

memmove_long_repeat_emit_encodeBlockAsm:
	// Long copy of R9 bytes from (R10) to (CX). SI = dst pointer after the copy.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0..X3; they are stored
	// (unaligned) after the aligned loops to cover the head and tail.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R12 = length / 32. R13 = 64 - (CX & 31), so CX + R13 - 32 is a
	// 32-byte aligned destination address for the MOVOA stores below.
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ does not write CF, so the JA here and the JNA below
	// also depend on the carry left by the preceding SUBQ/ADDQ — confirm the
	// intended loop behaviour against the generator.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(CX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per pass through the R11 (source) / R14 (destination)
	// pointers; R13 advances in step.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per pass, indexed by R13; repeats while R9 >= R13.
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, then commit the new dst pointer.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_repeat_emit_encodeBlockAsm:
	// Advance DX by 5: the repeat was tested at DX+1 and 4 bytes compared
	// equal there. Then measure how much further the match extends.
	// R9 = bytes remaining in src, R10 = pointer at DX,
	// SI = pointer at DX - offset (16(SP)).
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R12 accumulates the number of equal bytes.
	XORL R12, R12

matchlen_loopback_16_repeat_extend_encodeBlockAsm:
	// Compare 16 bytes per pass as two 8-byte XORs while at least 16 remain.
	CMPL R9, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm
	XORQ 8(SI)(R12*1), R13
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm

matchlen_bsf_16repeat_extend_encodeBlockAsm:
	// Difference in the second 8 bytes: the index of the lowest set bit of
	// the XOR, divided by 8, is the count of equal bytes; add 8 for the
	// first half. TZCNTQ is used when GOAMD64_v3 is defined, BSFQ otherwise.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm

matchlen_match8_repeat_extend_encodeBlockAsm:
	// One 8-byte comparison when at least 8 bytes remain.
	CMPL R9, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm

matchlen_bsf_8_repeat_extend_encodeBlockAsm:
	// Difference within an 8-byte word: lowest set bit index / 8 equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm

matchlen_match4_repeat_extend_encodeBlockAsm:
	// One 4-byte comparison when at least 4 bytes remain.
	CMPL R9, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm
	LEAL -4(R9), R9
	LEAL 4(R12), R12

matchlen_match2_repeat_extend_encodeBlockAsm:
	// Exactly 1 byte left: single-byte check. 0 left: done. Otherwise compare
	// 2 bytes; on mismatch fall through to the single-byte check.
	CMPL R9, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm
	JB   repeat_extend_forward_end_encodeBlockAsm
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ   repeat_extend_forward_end_encodeBlockAsm

matchlen_match1_repeat_extend_encodeBlockAsm:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm
	LEAL 1(R12), R12

repeat_extend_forward_end_encodeBlockAsm:
	// DX moves past the matched bytes (R12). SI = DX - DI is the total match
	// length, DI is then reloaded with the offset from 16(SP).
	// R8 is tested for zero: when it is zero the match is written as a copy
	// instead of a repeat.
	// NOTE(review): R8 appears to still hold the 12(SP) value loaded before
	// the literals were emitted — confirm no path in between rewrites it.
	ADDL  R12, DX
	MOVL  DX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm

	// emitRepeat
emit_repeat_again_match_repeat_encodeBlockAsm:
	// R8 = length, SI = length - 4. Pick the encoding by size:
	//   length <= 8                      -> 2-byte form
	//   length < 12 and offset < 0x800   -> 2-byte form carrying the offset
	//   otherwise 3, 4 or 5 bytes depending on length - 4.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm

cant_repeat_two_offset_match_repeat_encodeBlockAsm:
	CMPL SI, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm
	CMPL SI, $0x00010100
	JB   repeat_four_match_repeat_encodeBlockAsm
	CMPL SI, $0x0100ffff
	JB   repeat_five_match_repeat_encodeBlockAsm
	// Too long for one 5-byte repeat: write the bytes 1d 00 fb ff ff,
	// subtract 16842747 from the remaining length and go around again.
	LEAL -16842747(SI), SI
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_repeat_encodeBlockAsm

repeat_five_match_repeat_encodeBlockAsm:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_match_repeat_encodeBlockAsm:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_match_repeat_encodeBlockAsm:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_match_repeat_encodeBlockAsm:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_match_repeat_encodeBlockAsm:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_as_copy_encodeBlockAsm:
	// emitCopy
	// SI = match length, DI = offset. Offsets below 0x10000 use the 2-byte
	// offset paths. Otherwise lengths up to 64 are written as one copy with
	// a 4-byte offset; longer ones write a length-64 copy (tag 0xff plus
	// 32-bit offset) and encode the rest as a repeat.
	CMPL DI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeBlockAsm
	CMPL SI, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
	MOVB $0xff, (CX)
	MOVL DI, 1(CX)
	LEAL -64(SI), SI
	ADDQ $0x05, CX
	// Fewer than 4 bytes left cannot use the repeat forms below.
	CMPL SI, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeBlockAsm

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
	// R8 = length, SI = length - 4; select the repeat form by size.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(SI), SI
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

four_bytes_remain_repeat_as_copy_encodeBlockAsm:
	// Nothing left: done. Otherwise write a 5-byte copy: tag
	// length*4 - 1 (= ((length - 1) << 2) | 3) followed by the 32-bit offset.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeBlockAsm
	XORL  R8, R8
	LEAL  -1(R8)(SI*4), SI
	MOVB  SI, (CX)
	MOVL  DI, 1(CX)
	ADDQ  $0x05, CX
	JMP   repeat_end_emit_encodeBlockAsm

two_byte_offset_repeat_as_copy_encodeBlockAsm:
	// Offset fits in 16 bits. SI = length, DI = offset.
	// Lengths up to 64 go to the short path; offsets of 0x800 or more go to
	// the 3-byte copy path. Otherwise write a 2-byte copy of length 8
	// (first byte 0x11 | (offset >> 8) << 5, second byte offset low 8 bits)
	// and encode the remaining length - 8 as a repeat.
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
	CMPL DI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	MOVL DI, R9
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI

	// emitRepeat
	// Enter the repeat encoder past its 2-byte forms: SI = length - 4.
	LEAL -4(SI), SI
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// R8 = length, SI = length - 4; select the repeat form by size.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(SI), SI
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

long_offset_short_repeat_as_copy_encodeBlockAsm:
	// Length above 64 with a 16-bit offset of 0x800 or more: write a 3-byte
	// copy (tag 0xee, then the 16-bit offset), subtract 60 from the length
	// and encode the remainder as a repeat.
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// R8 = length, SI = length - 4; select the repeat form by size.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x0100ffff
	JB   repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(SI), SI
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
	// Length <= 64 with a 16-bit offset. R8 = length << 2.
	// Length >= 12 or offset >= 0x800 uses the 3-byte copy below; otherwise
	// a 2-byte copy: first byte (length << 2) - 15 (= ((length - 4) << 2) | 1)
	// combined with (offset >> 8) << 5, second byte the offset low 8 bits.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm

emit_copy_three_repeat_as_copy_encodeBlockAsm:
	// 3-byte copy: tag (length << 2) - 2 (= ((length - 1) << 2) | 2),
	// then the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeBlockAsm:
	// Everything up to DX has been emitted; record that and keep searching.
	MOVL DX, 12(SP)
	JMP  search_loop_encodeBlockAsm

no_repeat_found_encodeBlockAsm:
	// On entry: DI = 8 source bytes at DX, SI = table candidate for DX,
	// R8 = table candidate for DX+1, R10 = hash of the bytes at DX+2.
	// Test 4 bytes at each candidate in turn.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm
	// Shift to the bytes at DX+1; fetch the DX+2 candidate into SI.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm
	// Record DX+2 in the table, then test its previous occupant.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm
	// No match: continue from the next position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeBlockAsm

candidate3_match_encodeBlockAsm:
	// Match found at DX+2.
	ADDL $0x02, DX
	JMP  candidate_match_encodeBlockAsm

candidate2_match_encodeBlockAsm:
	// Match found at DX+1 against candidate R8; the DX+2 table entry is
	// still written before continuing.
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeBlockAsm:
	// DX = match position, SI = candidate position. DI = pending-literal
	// start. A candidate at position 0 has nothing before it to extend into.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm

match_extend_back_loop_encodeBlockAsm:
	// Extend backwards: while DX is above the pending-literal start and the
	// bytes just before DX and SI are equal, move both back by one. The loop
	// also ends when SI reaches 0.
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBlockAsm
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBlockAsm
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm
	JMP  match_extend_back_loop_encodeBlockAsm

match_extend_back_end_encodeBlockAsm:
	// Space check: dst pointer + pending literal length + 5 must stay below
	// the destination limit in (SP); otherwise return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 5(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBlockAsm:
	// Emit the pending literals src[12(SP):DX] ahead of the match.
	// Nothing to emit when the pending start already equals DX.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm
	// R9 = literal length, DI = literal source pointer, R8 = length - 1.
	// 12(SP) is advanced to DX.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Choose the literal header size from length - 1.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm
	CMPL R8, $0x00010000
	JB   three_bytes_match_emit_encodeBlockAsm
	CMPL R8, $0x01000000
	JB   four_bytes_match_emit_encodeBlockAsm
	// 5-byte header: tag 0xfc followed by the 32-bit length - 1.
	MOVB $0xfc, (CX)
	MOVL R8, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_encodeBlockAsm

four_bytes_match_emit_encodeBlockAsm:
	// 4-byte header: tag 0xf8, low 16 bits, then bits 16..23 of length - 1.
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R8, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeBlockAsm

three_bytes_match_emit_encodeBlockAsm:
	// 3-byte header: tag 0xf4 followed by the 16-bit length - 1.
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBlockAsm

two_bytes_match_emit_encodeBlockAsm:
	// 2-byte header: tag 0xf0 followed by the 8-bit length - 1.
	// Lengths with length - 1 below 0x40 use the short copy routine.
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeBlockAsm
	JMP  memmove_long_match_emit_encodeBlockAsm

one_byte_match_emit_encodeBlockAsm:
	// 1-byte header: (length - 1) << 2.
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBlockAsm:
	// Copy R9 literal bytes from (DI) to (CX). R8 = dst pointer after the
	// copy; CX is set to it once the bytes are written.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length: <=8, <=16, <=32, otherwise up to 64 bytes.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only R8 (the
	// true end) is committed to CX afterwards.
	// NOTE(review): presumably relies on slack in src/dst for the over-read
	// and over-write — confirm against the caller's buffer guarantees.
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: the first 8 and the last 8 bytes.
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: the first 16 and the last 16.
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
	// Four 16-byte moves: the first 32 and the last 32 bytes.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeBlockAsm

memmove_long_match_emit_encodeBlockAsm:
	// Long copy of R9 bytes from (DI) to (CX). R8 = dst pointer after the copy.
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0..X3; they are stored
	// (unaligned) after the aligned loops to cover the head and tail.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = length / 32. R12 = 64 - (CX & 31), so CX + R12 - 32 is a
	// 32-byte aligned destination address for the MOVOA stores below.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// NOTE(review): DECQ does not write CF, so the JA here and the JNA below
	// also depend on the carry left by the preceding SUBQ/ADDQ — confirm the
	// intended loop behaviour against the generator.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per pass through the R10 (source) / R13 (destination)
	// pointers; R12 advances in step.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per pass, indexed by R12; repeats while R9 >= R12.
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, then commit the new dst pointer.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
	// DX = match position, SI = candidate position. Store the new offset
	// DX - SI in 16(SP), skip the 4 bytes already known to be equal on both
	// sides, and measure how far the match continues.
	// DI = bytes remaining in src, R8 = pointer at DX, SI = pointer at the
	// candidate.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 accumulates the number of equal bytes.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeBlockAsm:
	// Compare 16 bytes per pass as two 8-byte XORs while at least 16 remain.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm

matchlen_bsf_16match_nolit_encodeBlockAsm:
	// Difference in the second 8 bytes: the index of the lowest set bit of
	// the XOR, divided by 8, is the count of equal bytes; add 8 for the
	// first half. TZCNTQ is used when GOAMD64_v3 is defined, BSFQ otherwise.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeBlockAsm

matchlen_match8_match_nolit_encodeBlockAsm:
	// One 8-byte comparison when at least 8 bytes remain.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeBlockAsm

matchlen_bsf_8_match_nolit_encodeBlockAsm:
	// Difference within an 8-byte word: lowest set bit index / 8 equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeBlockAsm

matchlen_match4_match_nolit_encodeBlockAsm:
	// One 4-byte comparison when at least 4 bytes remain.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeBlockAsm:
	// Exactly 1 byte left: single-byte check. 0 left: done. Otherwise compare
	// 2 bytes; on mismatch fall through to the single-byte check.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm
	JB   match_nolit_end_encodeBlockAsm
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeBlockAsm
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBlockAsm

matchlen_match1_match_nolit_encodeBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm
	LEAL 1(R10), R10

match_nolit_end_encodeBlockAsm:
	// DX moves past the extra matched bytes. SI = offset from 16(SP),
	// R10 = total match length (extension + the 4 initial bytes).
	// 12(SP) is set to DX: nothing before DX is pending any more.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// Offsets below 0x10000 use the 2-byte offset paths. Otherwise lengths
	// up to 64 are written as one copy with a 4-byte offset; longer ones
	// write a length-64 copy (tag 0xff plus 32-bit offset) and encode the
	// rest as a repeat.
	CMPL SI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBlockAsm
	CMPL R10, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBlockAsm
	MOVB $0xff, (CX)
	MOVL SI, 1(CX)
	LEAL -64(R10), R10
	ADDQ $0x05, CX
	// Fewer than 4 bytes left cannot use the repeat forms below.
	CMPL R10, $0x04
	JB   four_bytes_remain_match_nolit_encodeBlockAsm

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
	// DI = length, R10 = length - 4; select the repeat form by size.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy
	CMPL R10, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy

repeat_five_match_nolit_encodeBlockAsm_emit_copy:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

four_bytes_remain_match_nolit_encodeBlockAsm:
	// Nothing left: done. Otherwise write a 5-byte copy: tag
	// length*4 - 1 (= ((length - 1) << 2) | 3) followed by the 32-bit offset.
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeBlockAsm
	XORL  DI, DI
	LEAL  -1(DI)(R10*4), R10
	MOVB  R10, (CX)
	MOVL  SI, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_match_nolit_encodeBlockAsm:
	// Offset fits in 16 bits. R10 = length, SI = offset.
	// Lengths up to 64 go to the short path; offsets of 0x800 or more go to
	// the 3-byte copy path. Otherwise write a 2-byte copy of length 8
	// (first byte 0x11 | (offset >> 8) << 5, second byte offset low 8 bits)
	// and encode the remaining length - 8 as a repeat.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm
	CMPL SI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	MOVL SI, R8
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10

	// emitRepeat
	// Enter the repeat encoder past its 2-byte forms: R10 = length - 4.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// DI = length, R10 = length - 4; select the repeat form by size.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

long_offset_short_match_nolit_encodeBlockAsm:
	// Length above 64 with a 16-bit offset of 0x800 or more: write a 3-byte
	// copy (tag 0xee, then the 16-bit offset), subtract 60 from the length
	// and encode the remainder as a repeat.
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
	// DI = length, R10 = length - 4; select the repeat form by size.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R10, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
	// Too long for one repeat: write bytes 1d 00 fb ff ff, reduce the
	// remaining length by 16842747 and go around again.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
	// 5 bytes: 16-bit 0x001d, then the 24-bit value (length - 4 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
	// 4 bytes: 16-bit 0x0019, then the 16-bit value (length - 4 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
	// 3 bytes: 16-bit 0x0015, then the 8-bit value (length - 4 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
	// 2 bytes: 16-bit value ((length - 4) << 2) | 1.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	// 2 bytes: first byte 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// second byte the low 8 bits of the offset.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_short_match_nolit_encodeBlockAsm:
	// Length <= 64 with a 16-bit offset. DI = length << 2.
	// Length >= 12 or offset >= 0x800 uses the 3-byte copy below; otherwise
	// a 2-byte copy: first byte (length << 2) - 15 (= ((length - 4) << 2) | 1)
	// combined with (offset >> 8) << 5, second byte the offset low 8 bits.
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm

emit_copy_three_match_nolit_encodeBlockAsm:
	// 3-byte copy: tag (length << 2) - 2 (= ((length - 1) << 2) | 2),
	// then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeBlockAsm:
	// After a copy has been written: finish if DX reached the search limit
	// in 8(SP); return 0 if the dst pointer reached the limit in (SP).
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm
	// DI = 8 source bytes starting at DX-2.
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm:
	// Hash the bytes at DX-2 (R8) and at DX (SI) with the same function as
	// the search loop: ((x << 16) * 0xcf1bbcdcbf9b) >> 50.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	// Load the candidate for DX, then record DX-2 and DX in the table.
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If the candidate's 4 bytes equal those at DX, start another match
	// right away with no literals in between; otherwise resume the search
	// at DX+1.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm
	INCL  DX
	JMP   search_loop_encodeBlockAsm

emit_remainder_encodeBlockAsm:
	// Emit everything from the pending-literal start (12(SP)) to the end of
	// src as one literal run.
	// Space check: dst pointer + remaining length + 5 must stay below the
	// destination limit in (SP); otherwise return 0.
	// AX (the table base until now) is reused as scratch from here on.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 5(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBlockAsm:
	// Nothing to emit when the pending start already equals src_len.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
	// SI = literal length, AX = literal source pointer, DX = length - 1.
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Choose the literal header size from length - 1.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeBlockAsm
	// 5-byte header: tag 0xfc followed by the 32-bit length - 1.
	MOVB $0xfc, (CX)
	MOVL DX, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

four_bytes_emit_remainder_encodeBlockAsm:
	// 4-byte header: tag 0xf8, low 16 bits, then bits 16..23 of length - 1.
	// BX (the src base) is reused as scratch; AX already points at the data.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

three_bytes_emit_remainder_encodeBlockAsm:
	// 3-byte header: tag 0xf4 followed by the 16-bit length - 1.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm

two_bytes_emit_remainder_encodeBlockAsm:
	// 2-byte header: tag 0xf0 followed by the 8-bit length - 1.
	// Lengths with length - 1 below 0x40 use the short copy routine.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm
	JMP  memmove_long_emit_remainder_encodeBlockAsm

one_byte_emit_remainder_encodeBlockAsm:
	// 1-byte header: (length - 1) << 2.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBlockAsm:
	// Copy SI literal bytes from (AX) to (CX). DX = dst pointer after the
	// copy, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, every size class here moves only bytes
	// inside the source range (separate cases for 1-2, 3 and 4-7 bytes).
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
	// One 2-byte move plus the third byte.
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
	// Two possibly overlapping 4-byte moves: the first 4 and the last 4 bytes.
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
	// Two possibly overlapping 8-byte moves: the first 8 and the last 8 bytes.
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	// Two possibly overlapping 16-byte moves: the first 16 and the last 16.
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	// Four 16-byte moves: the first 32 and the last 32 bytes.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm

memmove_long_emit_remainder_encodeBlockAsm:
	// Long copy of SI bytes from (AX) to (CX). DX = dst pointer after the
	// copy, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0..X3; they are stored
	// (unaligned) after the aligned loops to cover the head and tail.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = length / 32. R8 = 64 - (CX & 31), so CX + R8 - 32 is a
	// 32-byte aligned destination address for the MOVOA stores below.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ does not write CF, so the JA here and the JNA below
	// also depend on the carry left by the preceding SUBQ/ADDQ — confirm the
	// intended loop behaviour against the generator.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
	// 32 bytes per pass through the SI (source) / R9 (destination)
	// pointers; R8 advances in step.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
	// 32 bytes per pass, indexed by R8; repeats while BX >= R8.
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, then commit the new dst pointer.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBlockAsm:
	// Return value = current dst pointer - dst_base, i.e. bytes written.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
//
// encodeBlockAsm4MB encodes src into dst, using tmp as a hash table of
// 16384 4-byte source positions, and returns the number of bytes written.
// It returns 0 whenever the pending output would reach the dst limit that
// is computed in the prologue.
//
// Register roles that stay fixed while searching:
//   AX = tmp (hash table base)      BX = src base
//   CX = dst write pointer          DX = current source position ("s")
//
// Stack slots (frame is 24 bytes):
//    0(SP) 8 bytes: dst limit = dst_base + (src_len - 9 - src_len>>5)
//    8(SP) 4 bytes: src_len - 8; searching stops once a position reaches it
//   12(SP) 4 bytes: source position up to which output has been emitted
//   16(SP) 4 bytes: offset of the most recent match (starts at 1)
//   20(SP) 4 bytes: position the search resumes from when nothing matches
//
// NOTE(review): the lower-bound requirements on len(src) and len(dst) are
// enforced by the Go caller, which is not visible here — confirm there.
TEXT ·encodeBlockAsm4MB(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x200 iterations of 128 bytes each clear all 65536 bytes of tmp.
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBlockAsm4MB
	// Initialise the stack slots described in the function header.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

	// Main search loop. The next position to try is
	// s + 4 + ((s - 12(SP)) >> 6), so the stride grows the longer no match
	// has been emitted.
search_loop_encodeBlockAsm4MB:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBlockAsm4MB
	// DI = 8 source bytes at s. Each hash drops the top two bytes
	// (SHLQ 16), multiplies by the constant in R9 and keeps the top 14 bits
	// (SHRQ 50) as the table index. R10 hashes the bytes at s, R11 the
	// bytes at s+1.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	// SI / R8 = previous positions stored for hash(s) / hash(s+1); the
	// table is then updated with s and s+1.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the bytes at s+2, used in no_repeat_found.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// Repeat check: compare the 4 bytes at s+1 with the 4 bytes at
	// s+1-offset, where offset is the last match offset in 16(SP).
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm4MB
	// Repeat found at s+1. DI = match start, SI = DI - offset, and R8 keeps
	// the value of 12(SP) from before any literals are emitted below.
	LEAL  1(DX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm4MB

	// Extend the repeat backwards while DI stays above 12(SP), SI stays
	// above 0 and the preceding bytes are equal.
repeat_extend_back_loop_encodeBlockAsm4MB:
	CMPL DI, R8
	JBE  repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB

	// Return 0 unless CX + pending literal length + 4 is below the dst limit.
repeat_extend_back_end_encodeBlockAsm4MB:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 4(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

	// Emit src[12(SP):DI] as literals (skipped when empty).
	// R10 = source pointer, R9 = length, SI = length-1.
	// Header by SI: < 60 -> 1 byte (SI<<2); < 256 -> 0xf0 + 1 byte;
	// < 65536 -> 0xf4 + 2 bytes; otherwise 0xf8 + 3 bytes.
repeat_dst_size_check_encodeBlockAsm4MB:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00010000
	JB   three_bytes_repeat_emit_encodeBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

three_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

two_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// Lengths of at most 64 bytes (SI < 0x40) still use the short copy.
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm4MB
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

one_byte_repeat_emit_encodeBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

	// Short literal copy of R9 bytes (at most 64). SI = CX + R9 is the new
	// write pointer. Lengths up to 8 always move a full 8 bytes.
	// NOTE(review): the 8-byte move can read and write past the exact
	// length; presumably covered by the src/dst margins from the prologue —
	// confirm.
memmove_repeat_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB

	// Long literal copy of R9 bytes. The first and last 32 bytes are loaded
	// into X0-X3 up front and stored last. The middle is copied 32 bytes at
	// a time with aligned stores: R13 starts at 64 - (CX & 31), so
	// CX + R13 - 32 is 32-byte aligned.
memmove_long_repeat_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ does not update CF, so this JA tests ZF from the
	// DECQ together with CF left by the SUBQ above — confirm the intended
	// loop selection before changing anything here.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(CX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

	// Skip the 4 bytes already compared at s+1 (s += 5) and extend the
	// repeat forwards. R10 = src+s, SI = src+s-offset, R9 = src_len - s.
emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R12 counts equal bytes. Compares 16 bytes per iteration, then 8, 4,
	// 2 and 1. On a 64-bit mismatch, the trailing-zero count of the XOR
	// shifted right by 3 is the number of equal leading bytes.
	XORL R12, R12

matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm4MB
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	XORQ 8(SI)(R12*1), R13
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB

matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match8_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm4MB
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm4MB

matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match4_repeat_extend_encodeBlockAsm4MB:
	CMPL R9, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm4MB
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm4MB
	LEAL -4(R9), R9
	LEAL 4(R12), R12

matchlen_match2_repeat_extend_encodeBlockAsm4MB:
	// One CMPL feeds both branches: exactly 1 byte left -> match1,
	// 0 bytes left -> done.
	CMPL R9, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm4MB
	JB   repeat_extend_forward_end_encodeBlockAsm4MB
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm4MB
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ   repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_match1_repeat_extend_encodeBlockAsm4MB:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R12), R12

	// s += matched bytes. SI = total match length (s - match start),
	// DI = match offset. R8 still holds the 12(SP) value loaded before the
	// literals were emitted; if it was 0 the match is written by emitCopy
	// with an explicit offset instead of by emitRepeat.
repeat_extend_forward_end_encodeBlockAsm4MB:
	ADDL  R12, DX
	MOVL  DX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// R8 = length, SI = length-4, DI = offset. Encodings written below:
	//   length <= 8                : 16-bit word (SI<<2)|1
	//   length < 12, offset < 2048 : byte (SI<<2)|1|((offset>>8)<<5), byte offset
	//   SI < 0x104                 : word 0x0015, byte SI-4
	//   SI < 0x10100               : word 0x0019, word SI-256
	//   otherwise                  : word 0x001d, 3 bytes of SI-65536
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm4MB

cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL SI, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00010100
	JB   repeat_four_match_repeat_encodeBlockAsm4MB
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// Encode the repeat as a copy: DI = offset, SI = length.
	//   offset >= 65536 : 5-byte form (tag + 32-bit offset). More than 64
	//                     bytes writes tag 0xff, subtracts 64, and the rest
	//                     goes through emitRepeat (or four_bytes_remain when
	//                     fewer than 4 bytes are left).
	//   offset <  65536 : handled at two_byte_offset_* below.
repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	CMPL DI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
	CMPL SI, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0xff, (CX)
	MOVL DI, 1(CX)
	LEAL -64(SI), SI
	ADDQ $0x05, CX
	CMPL SI, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// Same encoding table as the emitRepeat block above.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// Remaining length SI (may be 0): writes tag SI*4-1 followed by the
	// 32-bit offset, or nothing when SI is 0.
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	TESTL SI, SI
	JZ    repeat_end_emit_encodeBlockAsm4MB
	XORL  R8, R8
	LEAL  -1(R8)(SI*4), SI
	MOVB  SI, (CX)
	MOVL  DI, 1(CX)
	ADDQ  $0x05, CX
	JMP   repeat_end_emit_encodeBlockAsm4MB

	// offset < 65536. For length > 64:
	//   offset >= 2048 : write 0xee + 16-bit offset, length -= 60, then
	//                    emitRepeat for the rest.
	//   offset <  2048 : write a 2-byte copy with tag 17|((offset>>8)<<5)
	//                    and the low offset byte, length -= 8, then
	//                    emitRepeat for the rest.
two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm4MB
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI

	// emitRepeat
	// SI becomes length-4 and control jumps straight to the 3/4/5-byte
	// forms. The instructions between the JMP and the next label are
	// unreachable, and within this function the two *_short_2b labels they
	// reference (repeat_two_... and repeat_two_offset_...) have no other
	// users.
	LEAL -4(SI), SI
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX

	// emitRepeat
	// Same encoding table as the first emitRepeat block.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00010100
	JB   repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (CX)
	MOVW SI, 2(CX)
	SARL $0x10, DI
	MOVB DI, 4(CX)
	ADDQ $0x05, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// length <= 64 and offset < 65536. R8 = length*4.
	//   length < 12 and offset < 2048 : 2 bytes, tag (R8-15)|((offset>>8)<<5)
	//   otherwise                     : 3 bytes, tag R8-2 + 16-bit offset
two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm4MB

emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

	// Everything up to s has now been emitted; resume searching.
repeat_end_emit_encodeBlockAsm4MB:
	MOVL DX, 12(SP)
	JMP  search_loop_encodeBlockAsm4MB

	// No repeat. Try up to three hash-table candidates:
	//   SI (for s)   against the 4 bytes at s,
	//   R8 (for s+1) against the 4 bytes at s+1,
	//   table[hash(s+2)] against the 4 bytes at s+2.
	// The table slot for hash(s+2) is set to s+2 on either path past the
	// first compare. If none match, continue at the position saved in
	// 20(SP).
no_repeat_found_encodeBlockAsm4MB:
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm4MB
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm4MB
	MOVL 20(SP), DX
	JMP  search_loop_encodeBlockAsm4MB

candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, DX
	JMP  candidate_match_encodeBlockAsm4MB

candidate2_match_encodeBlockAsm4MB:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

	// DX = match position, SI = candidate position. Extend backwards while
	// DX stays above 12(SP), SI stays above 0 and the preceding bytes are
	// equal.
candidate_match_encodeBlockAsm4MB:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm4MB

match_extend_back_loop_encodeBlockAsm4MB:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm4MB
	JMP  match_extend_back_loop_encodeBlockAsm4MB

	// Return 0 unless CX + pending literal length + 4 is below the dst limit.
match_extend_back_end_encodeBlockAsm4MB:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 4(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

	// Emit src[12(SP):DX] as literals (skipped when empty).
	// DI = source pointer, R9 = length, R8 = length-1.
	// Header selection is the same as in the repeat path.
match_dst_size_check_encodeBlockAsm4MB:
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00010000
	JB   three_bytes_match_emit_encodeBlockAsm4MB
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R8, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

three_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

two_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

one_byte_match_emit_encodeBlockAsm4MB:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

	// Short literal copy of R9 bytes; R8 = CX + R9 is the new write
	// pointer. As in the repeat path, lengths up to 8 move a full 8 bytes.
memmove_match_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB

	// Long literal copy; same structure as the long copy in the repeat
	// path, with DI as source and R12 as the running offset.
memmove_long_match_emit_encodeBlockAsm4MB:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

	// Match with no pending literals: DX = s, SI = candidate. Store the
	// offset s - candidate in 16(SP), skip the 4 bytes already known to be
	// equal and extend the match forwards.
	// R8 = src+s, SI = src+candidate, DI = src_len - s.
emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// Same scheme as the repeat path; R10 counts equal bytes.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm4MB
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm4MB
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm4MB

matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeBlockAsm4MB

matchlen_match8_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm4MB
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeBlockAsm4MB

matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeBlockAsm4MB

matchlen_match4_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm4MB
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeBlockAsm4MB
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeBlockAsm4MB:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm4MB
	JB   match_nolit_end_encodeBlockAsm4MB
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeBlockAsm4MB
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBlockAsm4MB

matchlen_match1_match_nolit_encodeBlockAsm4MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R10), R10

	// s += extension. SI = offset, R10 = total match length (extension + 4).
	// 12(SP) is advanced to s before the copy is written.
match_nolit_end_encodeBlockAsm4MB:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// Same encodings as repeat_as_copy above, with SI = offset and
	// R10 = length.
	CMPL SI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBlockAsm4MB
	CMPL R10, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBlockAsm4MB
	MOVB $0xff, (CX)
	MOVL SI, 1(CX)
	LEAL -64(R10), R10
	ADDQ $0x05, CX
	CMPL R10, $0x04
	JB   four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
	XORL  DI, DI
	LEAL  -1(DI)(R10*4), R10
	MOVB  R10, (CX)
	MOVL  SI, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm4MB
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10

	// emitRepeat
	// As in the repeat path: the instructions between the JMP and the next
	// label are unreachable, and within this function the two *_short_2b
	// labels they reference (repeat_two_... and repeat_two_offset_...) have
	// no other users.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

long_offset_short_match_nolit_encodeBlockAsm4MB:
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (CX)
	MOVW R10, 2(CX)
	SARL $0x10, SI
	MOVB SI, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

emit_copy_three_match_nolit_encodeBlockAsm4MB:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

	// After a copy: stop searching once s reaches the limit in 8(SP), and
	// return 0 once CX reaches the dst limit. Otherwise load 8 bytes at
	// s-2, store s-2 under its hash and check whether the candidate stored
	// for hash(s) matches the 4 bytes at s. If it does, emit another match
	// straight away with no literals in between.
match_nolit_emitcopy_end_encodeBlockAsm4MB:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm4MB
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm4MB:
	// R8 = hash of the bytes at s-2; after the shift, DI holds the bytes
	// at s and SI becomes their hash.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm4MB
	INCL  DX
	JMP   search_loop_encodeBlockAsm4MB

	// Flush src[12(SP):src_len] as one final literal run. Return 0 unless
	// CX + remaining length + 4 is below the dst limit.
emit_remainder_encodeBlockAsm4MB:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

	// From here AX, BX and DX are reused as scratch: AX = source pointer,
	// SI = length, DX = length-1. Header selection is the same as for the
	// other literal runs.
emit_remainder_ok_encodeBlockAsm4MB:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

three_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

two_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

one_byte_emit_remainder_encodeBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

	// Short copy of BX bytes; DX = CX + length is the new write pointer.
	// Unlike the two earlier short copies, this one has exact-size cases
	// for 1-2, 3 and 4-7 bytes, so it never touches bytes beyond the
	// requested length.
memmove_emit_remainder_encodeBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB

	// Long copy; same structure as the long copy in the repeat path, with
	// AX as source, BX as length and R8 as the running offset.
memmove_long_emit_remainder_encodeBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

	// Return the number of bytes written: CX - dst_base.
emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int
// Requires: BMI, SSE2
//
// encodeBlockAsm12B compresses src into dst, using tmp as a hash table of
// 4096 32-bit src positions (12-bit hash index), and returns the number of
// bytes written to dst. It returns 0 as soon as the dst write pointer would
// reach the limit kept in 0(SP).
//
// Registers during the search/emit loop:
//   AX = tmp (hash table base)
//   BX = src base (set at the end of the setup below)
//   CX = current dst write pointer
//   DX = s, the current src position
//
// Stack frame (24 bytes):
//   0(SP)   8 bytes  dst limit: dst_base + len(src) - 9 - len(src)/32
//   8(SP)   4 bytes  search limit: len(src) - 8
//   12(SP)  4 bytes  nextEmit: first src position not yet written to dst
//   16(SP)  4 bytes  repeat: offset of the most recent match (starts at 1)
//   20(SP)  4 bytes  nextS: src position to continue at after a failed probe
//
// NOTE(review): len(src) - 9 wraps for very short inputs; presumably the Go
// caller guarantees a minimum src length - confirm.
TEXT ·encodeBlockAsm12B(SB), $24-64
	// AX = table, CX = dst. DX counts 0x80 iterations of 128 bytes each,
	// i.e. the whole 16384-byte table is cleared through BX.
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000080, DX
	MOVQ AX, BX
	PXOR X0, X0

	// Store 8 x 16 zero bytes per iteration.
zero_loop_encodeBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBlockAsm12B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8 (search limit).
	// 0(SP) = dst_base + (len(src) - 9 - len(src)>>5) (dst limit pointer).
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// s = 1, repeat = 1; BX becomes the src base pointer from here on.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

// Main search loop. DX = s. Computes nextS, hashes the bytes at s, s+1 and
// s+2, updates the table and first probes for a match at the current repeat
// offset (16(SP)).
search_loop_encodeBlockAsm12B:
	// nextS = s + 4 + (s - nextEmit)>>5; the step grows with the length of
	// the pending literal run. Stop searching once nextS >= len(src) - 8.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBlockAsm12B
	// DI = 8 bytes loaded from src[s]; remember nextS in 20(SP).
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// Hash: (x << 24) * R9 >> 52. The left shift discards the top 3 bytes so
	// only the low 5 bytes contribute; the result is a 12-bit table index.
	// R10 = hash of the bytes at s, R11 = hash of the bytes at s+1.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11
	// SI = candidate for s, R8 = candidate for s+1; then record s and s+1
	// in the table.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the bytes at s+2 (looked up later if needed).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// Repeat probe: compare the 4 bytes at src[s+1-repeat] with the 4 bytes
	// at src[s+1] (low 32 bits of DI >> 8).
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm12B
	// Repeat found. DI = base = s+1, R8 = nextEmit, SI = base - repeat.
	// Skip the backward extension when the match source is already at 0.
	LEAL  1(DX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm12B

	// Extend the repeat match backwards while base > nextEmit, the preceding
	// bytes are equal and the match source position has not reached 0.
repeat_extend_back_loop_encodeBlockAsm12B:
	CMPL DI, R8
	JBE  repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm12B

// Repeat match located; DI = base (start of the match in src). Check the dst
// budget, then emit the pending literals src[nextEmit:base].
repeat_extend_back_end_encodeBlockAsm12B:
	// If dst + 3 + (base - nextEmit) reaches the dst limit, return 0.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBlockAsm12B:
	// Nothing to emit when nextEmit == base. Otherwise:
	// R10 = &src[nextEmit], R9 = literal length, SI = length - 1,
	// and nextEmit is advanced to base.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Pick the literal header size from length-1: < 60 one byte,
	// < 256 two bytes, otherwise three bytes.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm12B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and is never
	// taken; execution falls through to the three-byte case.
	JB   three_bytes_repeat_emit_encodeBlockAsm12B

	// Header: 0xf4 (= 61<<2) followed by length-1 as a 16-bit value.
three_bytes_repeat_emit_encodeBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

	// Header: 0xf0 (= 60<<2) followed by length-1 as one byte. Lengths of at
	// most 64 bytes use the short copy, longer ones the long copy.
two_bytes_repeat_emit_encodeBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

	// Header: the single byte (length-1)<<2.
one_byte_repeat_emit_encodeBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

	// Short copy of R9 bytes from R10 to CX; SI = dst pointer after the copy.
memmove_repeat_emit_encodeBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Lengths <= 8 are served by one unconditional 8-byte copy, so up to 7
	// bytes beyond the literal run are read and written; CX still only
	// advances by R9.
	// NOTE(review): presumably safe because of the len(src)-8 search limit
	// and the dst margin - confirm.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

	// First 8 and last 8 bytes (the two stores may overlap).
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

	// First 16 and last 16 bytes.
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B

	// First 32 and last 32 bytes.
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B

	// Long copy of R9 bytes from R10 to CX; SI = dst pointer after the copy.
memmove_long_repeat_emit_encodeBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// X0/X1 = first 32 source bytes, X2/X3 = last 32 source bytes; they are
	// stored last with unaligned stores to cover both edges.
	// R12 = length/32, R13 = 64 - (CX & 31), so that CX + R13 - 32 is
	// 32-byte aligned and the loop below can use aligned stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ does not update CF, so JA here sees CF from the
	// SUBQ above and ZF from the DECQ. For the lengths routed to this path
	// (length-1 >= 0x40) R12-1 is non-zero, so the jump appears to always be
	// taken and big_loop_back looks unreachable - confirm in the generator.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(CX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

	// Copy 32 bytes per iteration at offset R13-32 while R9 >= R13.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance CX past the literals.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

// Literals before the repeat are written. Extend the repeat match forwards:
// s += 5 (the probe compared 4 bytes starting at s+1), then count how many
// further bytes at src[s] equal those at src[s-repeat].
emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// R9 = len(src) - s (bytes left), R10 = &src[s], SI = &src[s-repeat].
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R12 = number of matching bytes found so far.
	XORL R12, R12

	// Compare 16 bytes per iteration while at least 16 bytes remain.
matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
	CMPL R9, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm12B
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
	XORQ 8(SI)(R12*1), R13
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm12B
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm12B

	// Mismatch in the second 8 bytes: index of the lowest set bit of the XOR,
	// divided by 8, is the number of equal bytes; add 8 for the first half.
matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm12B

	// Fewer than 16 bytes left: try one 8-byte comparison.
matchlen_match8_repeat_extend_encodeBlockAsm12B:
	CMPL R9, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm12B
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm12B

	// Mismatch within an 8-byte word: lowest set bit / 8 = equal bytes.
matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm12B

	// Tail handling: 4 bytes, then 2, then 1.
matchlen_match4_repeat_extend_encodeBlockAsm12B:
	CMPL R9, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm12B
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm12B
	LEAL -4(R9), R9
	LEAL 4(R12), R12

matchlen_match2_repeat_extend_encodeBlockAsm12B:
	// Exactly 1 byte left: byte compare. 0 bytes left: done.
	CMPL R9, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm12B
	JB   repeat_extend_forward_end_encodeBlockAsm12B
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm12B
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ   repeat_extend_forward_end_encodeBlockAsm12B

matchlen_match1_repeat_extend_encodeBlockAsm12B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R12), R12

// Forward extension finished; R12 = extra matching bytes. Compute the total
// match length and emit it as a repeat, or as a regular copy when nothing had
// been emitted before this match.
repeat_extend_forward_end_encodeBlockAsm12B:
	// s += R12; SI = s - base = match length; DI = repeat offset.
	// R8 still holds the nextEmit value read before the literals were
	// emitted; when it is 0 the match is encoded as a copy instead.
	ADDL  R12, DX
	MOVL  DX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	// R8 = length, SI = length - 4.
	//   length <= 8                     -> 2-byte form
	//   length < 12 and offset < 2048   -> 2-byte form carrying the offset
	//   otherwise                       -> 3- or 4-byte form
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm12B
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	// length-4 < 260 fits the 3-byte form; otherwise write the 4-byte form:
	// bytes 0x19, 0x00, then length-4-256 as a 16-bit value.
	CMPL SI, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 3-byte form: bytes 0x15, 0x00, then length-8 as one byte.
repeat_three_match_repeat_encodeBlockAsm12B:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 2-byte form: ((length-4)<<2 | 1) written as a 16-bit value, so the
	// second byte is 0.
repeat_two_match_repeat_encodeBlockAsm12B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 2-byte form with offset: byte 1 = low 8 offset bits,
	// byte 0 = (offset>>8)<<5 | (length-4)<<2 | 1.
repeat_two_offset_match_repeat_encodeBlockAsm12B:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

// Encode the repeat match as a regular copy. SI = match length,
// DI = offset (the current repeat value).
repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
	// length <= 64 is handled by the short forms further down. Longer
	// matches emit one leading copy and encode the rest as a repeat.
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm12B
	// offset < 2048: leading 2-byte copy of length 8.
	// byte 0 = (offset>>8)<<5 | 17 (17 == (8-4)<<2 | 1), byte 1 = low offset
	// bits. The remaining length is length - 8.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI

	// emitRepeat
	// SI = remaining length - 4. The unconditional JMP goes straight to the
	// 3/4-byte forms; the eight instructions after it are unreachable.
	LEAL -4(SI), SI
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	// 4-byte form: 0x19, 0x00, then length-4-256 as 16 bits.
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 3-byte form: 0x15, 0x00, then length-8 as one byte.
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// Only targeted by the unreachable instructions above.
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// Only targeted by the unreachable instructions above.
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// offset >= 2048 and length > 64: leading 3-byte copy of length 60
	// (0xee == (60-1)<<2 | 2) followed by the 16-bit offset; the remaining
	// length - 60 is encoded as a repeat.
long_offset_short_repeat_as_copy_encodeBlockAsm12B:
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX

	// emitRepeat
	// R8 = remaining length, SI = remaining length - 4; same form selection
	// as for a plain repeat.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	// 4-byte form: 0x19, 0x00, then length-4-256 as 16 bits.
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 3-byte form: 0x15, 0x00, then length-8 as one byte.
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 2-byte form: (length-4)<<2 | 1 as a 16-bit value.
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 2-byte form with offset bits folded into byte 0.
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// length <= 64. R8 = length<<2.
	// length < 12 and offset < 2048: 2-byte copy with
	// byte 0 = (offset>>8)<<5 | (length<<2) - 15, byte 1 = low offset bits
	// ((length<<2) - 15 == (length-4)<<2 | 1).
two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// 3-byte copy: byte 0 = (length<<2) - 2 (== (length-1)<<2 | 2), followed
	// by the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

	// Match fully emitted: nextEmit = s, resume searching.
repeat_end_emit_encodeBlockAsm12B:
	MOVL DX, 12(SP)
	JMP  search_loop_encodeBlockAsm12B

// No repeat at s+1. Check the hash-table candidates in turn.
// On entry: DI = 8 bytes loaded from src[s], SI = candidate for s,
// R8 = candidate for s+1, R10 = table index for the bytes at s+2.
no_repeat_found_encodeBlockAsm12B:
	// Candidate 1: 4 bytes at src[SI] vs. the 4 bytes at src[s].
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm12B
	// Candidate 2: 4 bytes at src[R8] vs. the 4 bytes at src[s+1].
	// SI = candidate for s+2 (read before the slot is overwritten below),
	// R9 = s+2.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm12B
	// Record s+2 in the table, then candidate 3: 4 bytes at src[SI] vs. the
	// 4 bytes at src[s+2].
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm12B
	// Nothing matched: continue at nextS.
	MOVL 20(SP), DX
	JMP  search_loop_encodeBlockAsm12B

	// Match at s+2 with the candidate already in SI.
candidate3_match_encodeBlockAsm12B:
	ADDL $0x02, DX
	JMP  candidate_match_encodeBlockAsm12B

	// Match at s+1: still record s+2 in the table, advance s and make the
	// second candidate the active one.
candidate2_match_encodeBlockAsm12B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

	// DX = s (match position), SI = candidate position. DI = nextEmit.
	// Skip the backward extension if the candidate is at position 0.
candidate_match_encodeBlockAsm12B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm12B

	// Extend the match backwards while s > nextEmit, the preceding bytes are
	// equal and the candidate has not reached position 0.
match_extend_back_loop_encodeBlockAsm12B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBlockAsm12B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBlockAsm12B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm12B
	JMP  match_extend_back_loop_encodeBlockAsm12B

// Match located at s (DX) with candidate SI. Check the dst budget, then emit
// the pending literals src[nextEmit:s].
match_extend_back_end_encodeBlockAsm12B:
	// If dst + 3 + (s - nextEmit) reaches the dst limit, return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Nothing to emit when nextEmit == s. Otherwise:
	// DI = &src[nextEmit], R9 = literal length, R8 = length - 1,
	// and nextEmit is advanced to s.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Pick the literal header size from length-1: < 60 one byte,
	// < 256 two bytes, otherwise three bytes.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm12B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and is never
	// taken; execution falls through to the three-byte case.
	JB   three_bytes_match_emit_encodeBlockAsm12B

	// Header: 0xf4 (= 61<<2) followed by length-1 as a 16-bit value.
three_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBlockAsm12B

	// Header: 0xf0 (= 60<<2) followed by length-1 as one byte. Lengths of at
	// most 64 bytes use the short copy, longer ones the long copy.
two_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeBlockAsm12B
	JMP  memmove_long_match_emit_encodeBlockAsm12B

	// Header: the single byte (length-1)<<2.
one_byte_match_emit_encodeBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

	// Short copy of R9 bytes from DI to CX; R8 = dst pointer after the copy.
memmove_match_emit_encodeBlockAsm12B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Lengths <= 8 are served by one unconditional 8-byte copy, so up to 7
	// bytes beyond the literal run are read and written; CX still only
	// advances by R9.
	// NOTE(review): presumably safe because of the len(src)-8 search limit
	// and the dst margin - confirm.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

	// First 8 and last 8 bytes (the two stores may overlap).
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

	// First 16 and last 16 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B

	// First 32 and last 32 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeBlockAsm12B

	// Long copy of R9 bytes from DI to CX; R8 = dst pointer after the copy.
memmove_long_match_emit_encodeBlockAsm12B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// X0/X1 = first 32 source bytes, X2/X3 = last 32 source bytes; they are
	// stored last with unaligned stores to cover both edges.
	// R11 = length/32, R12 = 64 - (CX & 31), so that CX + R12 - 32 is
	// 32-byte aligned and the loop below can use aligned stores.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// NOTE(review): DECQ does not update CF, so JA here sees CF from the
	// SUBQ above and ZF from the DECQ. For the lengths routed to this path
	// (length-1 >= 0x40) R11-1 is non-zero, so the jump appears to always be
	// taken and big_loop_back looks unreachable - confirm in the generator.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

	// Copy 32 bytes per iteration at offset R12-32 while R9 >= R12.
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance CX past the literals.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

// A match at s (DX) against candidate SI with no pending literals. Also the
// re-entry point when a new match is found immediately after a copy.
emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// repeat = s - candidate. Skip the 4 bytes already known to match, then
	// DI = len(src) - s (bytes left), R8 = &src[s], SI = &src[candidate].
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 = number of additional matching bytes found so far.
	XORL R10, R10

	// Compare 16 bytes per iteration while at least 16 bytes remain.
matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm12B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm12B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm12B

	// Mismatch in the second 8 bytes: index of the lowest set bit of the XOR,
	// divided by 8, is the number of equal bytes; add 8 for the first half.
matchlen_bsf_16match_nolit_encodeBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeBlockAsm12B

	// Fewer than 16 bytes left: try one 8-byte comparison.
matchlen_match8_match_nolit_encodeBlockAsm12B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm12B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeBlockAsm12B

	// Mismatch within an 8-byte word: lowest set bit / 8 = equal bytes.
matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeBlockAsm12B

	// Tail handling: 4 bytes, then 2, then 1.
matchlen_match4_match_nolit_encodeBlockAsm12B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm12B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeBlockAsm12B:
	// Exactly 1 byte left: byte compare. 0 bytes left: done.
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm12B
	JB   match_nolit_end_encodeBlockAsm12B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeBlockAsm12B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBlockAsm12B

matchlen_match1_match_nolit_encodeBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm12B
	LEAL 1(R10), R10

// Match length known. Advance s past the match and emit the copy.
match_nolit_end_encodeBlockAsm12B:
	// s += extra bytes; SI = offset (repeat); R10 = total length including
	// the initial 4 bytes; nextEmit = s.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// length <= 64 is handled by the short forms further down. Longer
	// matches emit one leading copy and encode the rest as a repeat.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm12B
	// offset < 2048: leading 2-byte copy of length 8.
	// byte 0 = (offset>>8)<<5 | 17 (17 == (8-4)<<2 | 1), byte 1 = low offset
	// bits. The remaining length is length - 8.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10

	// emitRepeat
	// R10 = remaining length - 4. The unconditional JMP goes straight to the
	// 3/4-byte forms; the eight instructions after it are unreachable.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	// 4-byte form: 0x19, 0x00, then length-4-256 as 16 bits.
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// 3-byte form: 0x15, 0x00, then length-8 as one byte.
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// Only targeted by the unreachable instructions above.
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// Only targeted by the unreachable instructions above.
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// offset >= 2048 and length > 64: leading 3-byte copy of length 60
	// (0xee == (60-1)<<2 | 2) followed by the 16-bit offset; the remaining
	// length - 60 is encoded as a repeat.
long_offset_short_match_nolit_encodeBlockAsm12B:
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX

	// emitRepeat
	// DI = remaining length, R10 = remaining length - 4.
	//   length <= 8                     -> 2-byte form
	//   length < 12 and offset < 2048   -> 2-byte form carrying the offset
	//   otherwise                       -> 3- or 4-byte form
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	// 4-byte form: 0x19, 0x00, then length-4-256 as 16 bits.
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// 3-byte form: 0x15, 0x00, then length-8 as one byte.
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// 2-byte form: (length-4)<<2 | 1 as a 16-bit value.
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// 2-byte form with offset bits folded into byte 0.
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// length <= 64. DI = length<<2.
	// length < 12 and offset < 2048: 2-byte copy with
	// byte 0 = (offset>>8)<<5 | (length<<2) - 15, byte 1 = low offset bits
	// ((length<<2) - 15 == (length-4)<<2 | 1).
two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm12B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// 3-byte copy: byte 0 = (length<<2) - 2 (== (length-1)<<2 | 2), followed
	// by the 16-bit offset.
emit_copy_three_match_nolit_encodeBlockAsm12B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

// Copy emitted. Stop at the search limit, re-check the dst budget, then index
// positions s-2 and s and test for an immediate follow-up match at s.
match_nolit_emitcopy_end_encodeBlockAsm12B:
	// s >= len(src) - 8: emit the remaining bytes as literals.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm12B
	// DI = 8 bytes loaded from src[s-2]. Return 0 if dst reached its limit.
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// R8 = hash of the bytes at s-2, SI = hash of the bytes at s (same
	// (x << 24) * R9 >> 52 hash as in the search loop).
	// After the SHRQ, the low 32 bits of DI are the 4 bytes at src[s].
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	// SI = previous table entry for s (the candidate);
	// table[hash(s-2)] = s-2 and table[hash(s)] = s.
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If the candidate's 4 bytes equal the 4 bytes at s, encode another match
	// right away; otherwise advance one byte and search again.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm12B
	INCL  DX
	JMP   search_loop_encodeBlockAsm12B

// Search finished. Emit src[nextEmit:len(src)] as one literal run and return
// the number of bytes written. AX, BX and DX are repurposed from here on.
emit_remainder_encodeBlockAsm12B:
	// If dst + 3 + (len(src) - nextEmit) reaches the dst limit, return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Nothing to emit when nextEmit == len(src). Otherwise:
	// AX = &src[nextEmit], SI = literal length, DX = length - 1,
	// and nextEmit is set to len(src).
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Pick the literal header size from length-1: < 60 one byte,
	// < 256 two bytes, otherwise three bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm12B
	// This second JB tests the same flags as the one above and is never
	// taken; execution falls through to the three-byte case.
	JB   three_bytes_emit_remainder_encodeBlockAsm12B

	// Header: 0xf4 (= 61<<2) followed by length-1 as a 16-bit value.
three_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

	// Header: 0xf0 (= 60<<2) followed by length-1 as one byte. Lengths of at
	// most 64 bytes use the short copy, longer ones the long copy.
two_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

	// Header: the single byte (length-1)<<2.
one_byte_emit_remainder_encodeBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

	// Short copy of BX (= SI) bytes from AX to CX; DX = dst pointer after the
	// copy.
memmove_emit_remainder_encodeBlockAsm12B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop literal copies, this one has exact-size cases for
	// 1-2, 3 and 4-7 bytes, so no load or store touches bytes outside the
	// run.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

	// First and last byte (the same byte when the length is 1).
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

	// Two bytes plus the third byte.
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

	// First 4 and last 4 bytes (the two stores may overlap).
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

	// First 8 and last 8 bytes.
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

	// First 16 and last 16 bytes.
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B

	// First 32 and last 32 bytes.
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B

	// Long copy of BX (= SI) bytes from AX to CX; DX = dst pointer after the
	// copy.
memmove_long_emit_remainder_encodeBlockAsm12B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0/X1 = first 32 source bytes, X2/X3 = last 32 source bytes; they are
	// stored last with unaligned stores to cover both edges.
	// DI = length/32, R8 = 64 - (CX & 31), so that CX + R8 - 32 is 32-byte
	// aligned and the loop below can use aligned stores.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): DECQ does not update CF, so JA here sees CF from the
	// SUBQ above and ZF from the DECQ. For the lengths routed to this path
	// (length-1 >= 0x40) DI-1 is non-zero, so the jump appears to always be
	// taken and big_loop_back looks unreachable - confirm in the generator.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8.
emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance CX past the literals.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

	// Return the number of bytes written: CX - dst_base.
emit_literal_done_emit_remainder_encodeBlockAsm12B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int
// Requires: BMI, SSE2
TEXT ·encodeBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000020, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBlockAsm10B:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBlockAsm10B
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x36, R11
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm10B
	LEAL  1(DX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm10B

repeat_extend_back_loop_encodeBlockAsm10B:
	CMPL DI, R8
	JBE  repeat_extend_back_end_encodeBlockAsm10B
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeBlockAsm10B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm10B

repeat_extend_back_end_encodeBlockAsm10B:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeBlockAsm10B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm10B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm10B
	JB   three_bytes_repeat_emit_encodeBlockAsm10B

three_bytes_repeat_emit_encodeBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

two_bytes_repeat_emit_encodeBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeBlockAsm10B

one_byte_repeat_emit_encodeBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeBlockAsm10B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B

emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm10B:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B

memmove_long_repeat_emit_encodeBlockAsm10B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(CX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

// Pending literals are written; extend the repeat match forward.
emit_literal_done_repeat_emit_encodeBlockAsm10B:
	// DX += 5, then SI = DX - 16(SP). 16(SP) holds a match offset (it is
	// written as DX-SI at match_nolit_loop), so SI is the index that offset
	// points back to.
	// NOTE(review): the +5 presumably skips the bytes already verified by the
	// repeat check in the search loop - confirm against that code.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	// R9 = src_len - DX: number of bytes left to compare.
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	// R10 = BX+DX and SI = BX+SI: the two byte sequences to compare.
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R12 accumulates the number of equal leading bytes of (R10) and (SI).
	XORL R12, R12

// While at least 16 bytes remain, compare two quadwords per iteration.
matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
	CMPL R9, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm10B
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	// A non-zero XOR result means the quadwords differ.
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
	XORQ 8(SI)(R12*1), R13
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm10B
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm10B

// Mismatch in the second quadword: R12 += 8 + (index of lowest set bit in R13) / 8.
matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm10B

// Fewer than 16 bytes remain: compare one quadword if at least 8 are left.
matchlen_match8_repeat_extend_encodeBlockAsm10B:
	CMPL R9, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm10B
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm10B

// Mismatch inside the quadword in R11: R12 += (index of lowest set bit) / 8.
matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm10B

// Compare 4 bytes if at least 4 remain; on a mismatch fall back to the
// 2-byte and 1-byte checks.
matchlen_match4_repeat_extend_encodeBlockAsm10B:
	CMPL R9, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm10B
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm10B
	LEAL -4(R9), R9
	LEAL 4(R12), R12

// R9 == 1 goes straight to the byte compare, R9 == 0 ends the match;
// otherwise compare 2 bytes.
matchlen_match2_repeat_extend_encodeBlockAsm10B:
	CMPL R9, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm10B
	JB   repeat_extend_forward_end_encodeBlockAsm10B
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	// The words differ, but their first byte may still be equal.
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm10B
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ   repeat_extend_forward_end_encodeBlockAsm10B

// Final single byte compare.
matchlen_match1_repeat_extend_encodeBlockAsm10B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm10B
	LEAL 1(R12), R12

// Forward extension finished with R12 matched bytes.
repeat_extend_forward_end_encodeBlockAsm10B:
	// DX += R12; SI = DX - DI becomes the length to encode and DI is
	// reloaded with the offset kept in 16(SP).
	// NOTE(review): DI and R8 are set before this chunk; in the sibling
	// encodeBlockAsm8B, DI is the start index of the repeat match and R8 is
	// the 12(SP) value read at the start of the repeat path - presumably
	// the same here.
	ADDL  R12, DX
	MOVL  DX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	// R8 == 0 selects the regular copy encoding instead of a repeat.
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm10B

	// emitRepeat
	// R8 = length, SI = length - 4.
	MOVL SI, R8
	LEAL -4(SI), SI
	// length <= 8: two byte form without offset.
	CMPL R8, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm10B
	// length < 12 and offset (DI) < 2048: two byte form carrying the offset.
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
	CMPL DI, $0x00000800
	JB   repeat_two_offset_match_repeat_encodeBlockAsm10B

// length-4 < 0x104 uses the three byte form; otherwise four bytes are
// written: the word 0x0019 followed by the 16-bit value length-4-256.
cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
	CMPL SI, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm10B
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Three byte form: the word 0x0015 followed by one byte holding length-8.
repeat_three_match_repeat_encodeBlockAsm10B:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte form: 16-bit store of ((length-4) << 2) | 1.
repeat_two_match_repeat_encodeBlockAsm10B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte form with offset:
// byte 0 = ((length-4) << 2) | 1 | ((offset >> 8) << 5), byte 1 = low offset byte.
repeat_two_offset_match_repeat_encodeBlockAsm10B:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Encodes the repeat as a copy: SI = length, DI = offset, CX = dst pointer.
repeat_as_copy_encodeBlockAsm10B:
	// emitCopy
	// length <= 64 is handled by a single copy element.
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
	CMPL DI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm10B
	// length > 64 and offset < 2048: write a two byte element
	// (byte 0 = 17 | ((offset >> 8) << 5), byte 1 = low offset byte) and
	// subtract 8 from the length; the rest is emitted as a repeat.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI

	// emitRepeat
	LEAL -4(SI), SI
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	// The instructions up to the next label follow an unconditional JMP
	// without an intervening label and are therefore never executed.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b

// SI = remaining length - 4. Below 0x104 the three byte repeat is used;
// otherwise the word 0x0019 plus the 16-bit value SI-256 is written.
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Three byte repeat: the word 0x0015 followed by one byte holding SI-4.
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte repeat: 16-bit store of (SI << 2) | 1.
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte repeat with offset:
// byte 0 = (SI << 2) | 1 | ((DI >> 8) << 5), byte 1 = low byte of DI.
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// length > 64 and offset >= 2048: write the byte 0xee followed by the
// 16-bit offset, subtract 60 from the length and emit the rest as a repeat.
long_offset_short_repeat_as_copy_encodeBlockAsm10B:
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX

	// emitRepeat
	// R8 = remaining length, SI = remaining length - 4.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	CMPL DI, $0x00000800
	JB   repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short

// Three byte repeat when SI < 0x104, else the word 0x0019 plus 16-bit SI-256.
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Three byte repeat: the word 0x0015 followed by one byte holding SI-4.
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte repeat: 16-bit store of (SI << 2) | 1.
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Two byte repeat with offset:
// byte 0 = (SI << 2) | 1 | ((DI >> 8) << 5), byte 1 = low byte of DI.
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// length <= 64. R8 = length << 2. With length < 12 and offset < 2048 a two
// byte element is written: byte 0 = (R8 - 15) | ((offset >> 8) << 5), where
// R8 - 15 equals ((length-4) << 2) | 1, and byte 1 = low offset byte.
two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm10B

// Three byte element: byte 0 = R8 - 2, which equals ((length-1) << 2) | 2,
// followed by the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

// Record DX in 12(SP) and resume searching.
repeat_end_emit_encodeBlockAsm10B:
	MOVL DX, 12(SP)
	JMP  search_loop_encodeBlockAsm10B

// The repeat check failed: test the hash table candidates instead.
// NOTE(review): SI, R8, R10 and DI are set before this chunk; in the sibling
// encodeBlockAsm8B search loop they are the two table candidates, a third
// hash value and the 8 source bytes loaded at DX - presumably the same here.
no_repeat_found_encodeBlockAsm10B:
	// Candidate SI: compare the 4 bytes at BX+SI with the low 32 bits of DI.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm10B
	// Drop the lowest byte of DI, read the table entry at index R10 into SI
	// and set R9 = DX + 2.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	// Candidate R8 against the shifted bytes.
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm10B
	// Store DX+2 in table slot R10, shift once more and test the entry that
	// was read from that slot.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm10B
	// No candidate matched: continue the search at the index kept in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeBlockAsm10B

// Third candidate matched: the match position is DX + 2.
candidate3_match_encodeBlockAsm10B:
	ADDL $0x02, DX
	JMP  candidate_match_encodeBlockAsm10B

// Second candidate matched: the table slot R10 is still updated with DX+2,
// the match position is DX + 1 and the candidate is R8.
candidate2_match_encodeBlockAsm10B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

// DX = match position, SI = candidate position.
candidate_match_encodeBlockAsm10B:
	// DI = 12(SP) is the lower bound for DX while extending backwards; a
	// candidate at index 0 is not extended.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm10B

// Step DX and SI back while DX > DI and the preceding bytes are equal;
// stops as well when SI reaches 0.
match_extend_back_loop_encodeBlockAsm10B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBlockAsm10B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBlockAsm10B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm10B
	JMP  match_extend_back_loop_encodeBlockAsm10B

// Output space check: DI = CX + 3 + (DX - 12(SP)). Unless DI is below the
// limit stored at (SP), the function returns 0.
match_extend_back_end_encodeBlockAsm10B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Writes the header for the bytes between 12(SP) and DX, which are then
// copied verbatim to dst.
match_dst_size_check_encodeBlockAsm10B:
	// Nothing to emit when 12(SP) == DX.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B
	// R9 = DX - 12(SP) (byte count), 12(SP) = DX, DI = BX + old 12(SP)
	// (source address), R8 = count - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// count-1 < 60: one header byte; count-1 < 256: two header bytes;
	// otherwise three header bytes.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm10B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm10B
	// This JB tests the same flags as the one above and is never taken;
	// execution falls through to the label below.
	JB   three_bytes_match_emit_encodeBlockAsm10B

// Header: byte 0xf4 followed by the 16-bit value count-1.
three_bytes_match_emit_encodeBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBlockAsm10B

// Header: byte 0xf0 followed by one byte holding count-1. The short copy is
// used when count-1 < 64, the long copy otherwise.
two_bytes_match_emit_encodeBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeBlockAsm10B
	JMP  memmove_long_match_emit_encodeBlockAsm10B

// Header: a single byte (count-1) << 2; falls through to the short copy.
one_byte_match_emit_encodeBlockAsm10B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

// Short copy: moves R9 bytes from (DI) to (CX).
memmove_match_emit_encodeBlockAsm10B:
	// R8 = CX + R9, the dst write pointer after the copy.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on R9: <= 8, 9..16, 17..32, everything above.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64

// R9 <= 8: one 8-byte load and store regardless of the exact count, so up
// to 8 bytes are read and written even when R9 is smaller.
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

// R9 in 9..16: two 8-byte moves, the second anchored at the end of the range.
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B

// R9 in 17..32: two 16-byte moves, the second anchored at the end of the range.
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B

// R9 above 32: 32 bytes from the start and 32 bytes anchored at the end.
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

// Advance the dst write pointer past the copied bytes.
memmove_end_copy_match_emit_encodeBlockAsm10B:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeBlockAsm10B

// Long copy: moves R9 bytes from (DI) to (CX).
memmove_long_match_emit_encodeBlockAsm10B:
	// R8 = CX + R9, the dst write pointer after the copy.
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Keep the first and the last 32 source bytes in X0-X3; they are stored
	// with unaligned moves after the aligned loop below.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = R9 / 32.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	// R12 = 64 - (CX & 31), which makes CX+R12-32 a 32-byte aligned address.
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not modify CF and the SUBQ above cannot borrow, so JA is
	// taken whenever the decremented R11 is non-zero.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	// R10 / R13 = src / dst cursors at the first aligned dst position.
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

// Copies one 32-byte chunk through the R10 (src) and R13 (dst) cursors.
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	// Repeats only if DECQ left R11 at zero (ZF) or CF is set by the ADDQ above.
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back

// Copies the 32 bytes at offset R12-32 using aligned stores, advances R12
// by 32 and repeats while R9 >= R12.
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
	// Store the saved head and tail (unaligned), then advance CX to the end
	// of the copied data.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

// A match at DX against the candidate at SI with no pending bytes to emit.
emit_literal_done_match_emit_encodeBlockAsm10B:
match_nolit_loop_encodeBlockAsm10B:
	// 16(SP) = DX - SI, the match offset.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	// Advance both positions by 4 before measuring the rest of the match;
	// the 4 is added back to the length at match_nolit_end.
	ADDL $0x04, DX
	ADDL $0x04, SI
	// DI = src_len - DX: number of bytes left to compare.
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	// R8 = BX+DX and SI = BX+SI: the two byte sequences to compare.
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 accumulates the number of equal leading bytes of (R8) and (SI).
	XORL R10, R10

// While at least 16 bytes remain, compare two quadwords per iteration.
matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm10B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	// A non-zero XOR result means the quadwords differ.
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm10B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm10B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm10B

// Mismatch in the second quadword: R10 += 8 + (index of lowest set bit in R11) / 8.
matchlen_bsf_16match_nolit_encodeBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeBlockAsm10B

// Fewer than 16 bytes remain: compare one quadword if at least 8 are left.
matchlen_match8_match_nolit_encodeBlockAsm10B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm10B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm10B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeBlockAsm10B

// Mismatch inside the quadword in R9: R10 += (index of lowest set bit) / 8.
matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeBlockAsm10B

// Compare 4 bytes if at least 4 remain; on a mismatch fall back to the
// 2-byte and 1-byte checks.
matchlen_match4_match_nolit_encodeBlockAsm10B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm10B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeBlockAsm10B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

// DI == 1 goes straight to the byte compare, DI == 0 ends the match;
// otherwise compare 2 bytes.
matchlen_match2_match_nolit_encodeBlockAsm10B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm10B
	JB   match_nolit_end_encodeBlockAsm10B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	// The words differ, but their first byte may still be equal.
	JNE  matchlen_match1_match_nolit_encodeBlockAsm10B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBlockAsm10B

// Final single byte compare.
matchlen_match1_match_nolit_encodeBlockAsm10B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm10B
	LEAL 1(R10), R10

// Match measured: R10 = number of equal bytes found by matchLen.
match_nolit_end_encodeBlockAsm10B:
	// DX += R10; SI = offset from 16(SP); R10 += 4 gives the full match
	// length (DX was advanced by 4 before matchLen); 12(SP) = DX.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// R10 = length, SI = offset. length <= 64 is a single copy element.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm10B
	// length > 64 and offset < 2048: write a two byte element
	// (byte 0 = 17 | ((offset >> 8) << 5), byte 1 = low offset byte) and
	// subtract 8 from the length; the rest is emitted as a repeat.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	// The instructions up to the next label follow an unconditional JMP
	// without an intervening label and are therefore never executed.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b

// R10 = remaining length - 4. Below 0x104 the three byte repeat is used;
// otherwise the word 0x0019 plus the 16-bit value R10-256 is written.
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Three byte repeat: the word 0x0015 followed by one byte holding R10-4.
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Two byte repeat: 16-bit store of (R10 << 2) | 1.
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Two byte repeat with offset:
// byte 0 = (R10 << 2) | 1 | ((SI >> 8) << 5), byte 1 = low byte of SI.
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// length > 64 and offset >= 2048: write the byte 0xee followed by the
// 16-bit offset, subtract 60 from the length and emit the rest as a repeat.
long_offset_short_match_nolit_encodeBlockAsm10B:
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX

	// emitRepeat
	// DI = remaining length, R10 = remaining length - 4.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
	CMPL SI, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short

// Three byte repeat when R10 < 0x104, else the word 0x0019 plus 16-bit R10-256.
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Three byte repeat: the word 0x0015 followed by one byte holding R10-4.
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Two byte repeat: 16-bit store of (R10 << 2) | 1.
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Two byte repeat with offset:
// byte 0 = (R10 << 2) | 1 | ((SI >> 8) << 5), byte 1 = low byte of SI.
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// length <= 64. DI = length << 2. With length < 12 and offset < 2048 a two
// byte element is written: byte 0 = (DI - 15) | ((offset >> 8) << 5), where
// DI - 15 equals ((length-4) << 2) | 1, and byte 1 = low offset byte.
two_byte_offset_short_match_nolit_encodeBlockAsm10B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm10B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBlockAsm10B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B

// Three byte element: byte 0 = DI - 2, which equals ((length-1) << 2) | 2,
// followed by the 16-bit offset.
emit_copy_three_match_nolit_encodeBlockAsm10B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

// The copy is written. Decide whether to stop, fail, or look for an
// immediately following match.
match_nolit_emitcopy_end_encodeBlockAsm10B:
	// DX >= 8(SP): finish by emitting the remaining bytes.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm10B
	// DI = 8 source bytes starting at DX-2.
	MOVQ -2(BX)(DX*1), DI
	// Return 0 unless CX is still below the limit stored at (SP).
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Hashes the 4 bytes at DX-2 (R8) and at DX (SI) as
// ((x << 32) * 0x9e3779b1) >> 54, giving 10-bit indexes into the table of
// 4-byte entries addressed through AX.
match_nolit_dst_ok_encodeBlockAsm10B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	// DI >>= 16 leaves the bytes starting at DX in the low part of DI.
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x36, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x36, SI
	// Read the candidate for DX into SI, then record DX-2 and DX in their
	// table slots.
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If the 4 bytes at the candidate equal the 4 bytes at DX, encode
	// another match right away; otherwise resume searching at DX+1.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm10B
	INCL  DX
	JMP   search_loop_encodeBlockAsm10B

// Emits everything from 12(SP) up to src_len as one final verbatim run.
emit_remainder_encodeBlockAsm10B:
	// Return 0 unless CX + 3 + (src_len - 12(SP)) is below the limit at (SP).
	// AX is reused as scratch from here on.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBlockAsm10B:
	// Nothing to emit when 12(SP) == src_len.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B
	// SI = src_len - 12(SP) (byte count), 12(SP) = src_len,
	// AX = BX + old 12(SP) (source address), DX = count - 1.
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// count-1 < 60: one header byte; count-1 < 256: two header bytes;
	// otherwise three header bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm10B
	// This JB tests the same flags as the one above and is never taken;
	// execution falls through to the label below.
	JB   three_bytes_emit_remainder_encodeBlockAsm10B

// Header: byte 0xf4 followed by the 16-bit value count-1.
three_bytes_emit_remainder_encodeBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

// Header: byte 0xf0 followed by one byte holding count-1. The short copy is
// used when count-1 < 64, the long copy otherwise.
two_bytes_emit_remainder_encodeBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBlockAsm10B

// Header: a single byte (count-1) << 2; falls through to the short copy.
one_byte_emit_remainder_encodeBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

// Short copy of the final run: moves SI bytes from (AX) to (CX). Unlike the
// short copies used inside the main loop, the small sizes are dispatched
// individually so that only bytes inside the SI-byte range are accessed.
memmove_emit_remainder_encodeBlockAsm10B:
	// DX = CX + SI (dst write pointer after the copy), BX = byte count.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on BX: < 3, == 3, 4..7, 8..16, 17..32, everything above.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64

// First byte and last byte (the same byte when BX == 1).
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

// One word plus the third byte.
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

// Two 4-byte moves, the second anchored at the end of the range.
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

// Two 8-byte moves, the second anchored at the end of the range.
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B

// Two 16-byte moves, the second anchored at the end of the range.
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B

// 32 bytes from the start and 32 bytes anchored at the end of the range.
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

// Advance the dst write pointer past the copied bytes.
memmove_end_copy_emit_remainder_encodeBlockAsm10B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B

// Long copy of the final run: moves SI bytes from (AX) to (CX).
memmove_long_emit_remainder_encodeBlockAsm10B:
	// DX = CX + SI (dst write pointer after the copy), BX = byte count.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Keep the first and the last 32 source bytes in X0-X3; they are stored
	// with unaligned moves after the aligned loop below.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = BX / 32.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	// R8 = 64 - (CX & 31), which makes CX+R8-32 a 32-byte aligned address.
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not modify CF and the SUBQ above cannot borrow, so JA is
	// taken whenever the decremented DI is non-zero.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	// SI / R9 = src / dst cursors at the first aligned dst position.
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

// Copies one 32-byte chunk through the SI (src) and R9 (dst) cursors.
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	// Repeats only if DECQ left DI at zero (ZF) or CF is set by the ADDQ above.
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back

// Copies the 32 bytes at offset R8-32 using aligned stores, advances R8 by
// 32 and repeats while BX >= R8.
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	// Store the saved head and tail (unaligned), then advance CX to the end
	// of the copied data.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

// Successful exit: the result is CX - dst_base, the number of bytes written.
emit_literal_done_emit_remainder_encodeBlockAsm10B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int
// Requires: BMI, SSE2
//
// Returns the number of bytes written to dst, or 0 when the output would
// not stay below the computed dst limit.
//
// Registers set up here: AX = tmp (table of 4-byte entries), CX = dst write
// pointer, BX = src base, DX = current src index.
// Stack slots set up here:
//   (SP)   dst limit pointer: dst_base + src_len - 9 - src_len/32
//   8(SP)  src_len - 8, the index at which searching stops
//   12(SP) index of the first src byte not yet written to dst
//   16(SP) current match offset, initialised to 1
// 20(SP) is written by the search loop with the next index to try.
TEXT ·encodeBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 8 iterations of 128 bytes clear the whole 1024 byte table.
	MOVQ $0x00000008, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBlockAsm8B
	// 12(SP) = 0.
	MOVL  $0x00000000, 12(SP)
	// BX = src_len - 9, 8(SP) = src_len - 8.
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	// (SP) = dst_base + src_len - 9 - (src_len >> 5).
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Start at src index 1 with an offset of 1.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

// One search step at src index DX.
search_loop_encodeBlockAsm8B:
	// SI = DX + 4 + ((DX - 12(SP)) >> 4): the index to try next if nothing
	// matches here; the step grows with the distance from 12(SP).
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(DX)(SI*1), SI
	// Stop searching once that index reaches 8(SP) (src_len - 8).
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBlockAsm8B
	// DI = 8 source bytes at DX; 20(SP) = next index.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// R10 and R11 = 8-bit hashes of the 4 bytes at DX and at DX+1, computed
	// as ((x << 32) * 0x9e3779b1) >> 56.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x38, R11
	// Read both candidates (SI, R8), then record DX and DX+1 in their slots.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the 4 bytes at DX+2.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	// Repeat check: compare the 4 bytes at DX + 1 - 16(SP) with the 4 bytes
	// at DX + 1 (the low 32 bits of DI >> 8).
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm8B
	// Repeat found: DI = DX + 1 (start of the match), R8 = 12(SP),
	// SI = DI - 16(SP) (the position the match refers to). The backward
	// extension is skipped when SI is 0.
	LEAL  1(DX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm8B

// Step DI and SI back while DI > R8 and the preceding bytes are equal;
// stops as well when SI reaches 0.
repeat_extend_back_loop_encodeBlockAsm8B:
	CMPL DI, R8
	JBE  repeat_extend_back_end_encodeBlockAsm8B
	MOVB -1(BX)(SI*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeBlockAsm8B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm8B

// Output space check: SI = CX + 3 + (DI - 12(SP)). Unless SI is below the
// limit stored at (SP), the function returns 0.
repeat_extend_back_end_encodeBlockAsm8B:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Writes the header for the bytes between 12(SP) and DI, which are then
// copied verbatim to dst.
repeat_dst_size_check_encodeBlockAsm8B:
	// Nothing to emit when 12(SP) == DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
	// R9 = DI - 12(SP) (byte count), 12(SP) = DI, R10 = BX + old 12(SP)
	// (source address), SI = count - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// count-1 < 60: one header byte; count-1 < 256: two header bytes;
	// otherwise three header bytes.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeBlockAsm8B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeBlockAsm8B
	// This JB tests the same flags as the one above and is never taken;
	// execution falls through to the label below.
	JB   three_bytes_repeat_emit_encodeBlockAsm8B

// Header: byte 0xf4 followed by the 16-bit value count-1.
three_bytes_repeat_emit_encodeBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

// Header: byte 0xf0 followed by one byte holding count-1. The short copy is
// used when count-1 < 64, the long copy otherwise.
two_bytes_repeat_emit_encodeBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeBlockAsm8B

// Header: a single byte (count-1) << 2; falls through to the short copy.
one_byte_repeat_emit_encodeBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

// Short copy: moves R9 bytes from (R10) to (CX).
memmove_repeat_emit_encodeBlockAsm8B:
	// SI = CX + R9, the dst write pointer after the copy.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Dispatch on R9: <= 8, 9..16, 17..32, everything above.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64

// R9 <= 8: one 8-byte load and store regardless of the exact count, so up
// to 8 bytes are read and written even when R9 is smaller.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

// R9 in 9..16: two 8-byte moves, the second anchored at the end of the range.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B

// R9 in 17..32: two 16-byte moves, the second anchored at the end of the range.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B

// R9 above 32: 32 bytes from the start and 32 bytes anchored at the end.
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

// Advance the dst write pointer past the copied bytes.
memmove_end_copy_repeat_emit_encodeBlockAsm8B:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B

// Long copy: moves R9 bytes from (R10) to (CX).
memmove_long_repeat_emit_encodeBlockAsm8B:
	// SI = CX + R9, the dst write pointer after the copy.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Keep the first and the last 32 source bytes in X0-X3; they are stored
	// with unaligned moves after the aligned loop below.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R12 = R9 / 32.
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	// R13 = 64 - (CX & 31), which makes CX+R13-32 a 32-byte aligned address.
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// DECQ does not modify CF and the SUBQ above cannot borrow, so JA is
	// taken whenever the decremented R12 is non-zero.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	// R11 / R14 = src / dst cursors at the first aligned dst position.
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(CX)(R13*1), R14

// Copies one 32-byte chunk through the R11 (src) and R14 (dst) cursors.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	// Repeats only if DECQ left R12 at zero (ZF) or CF is set by the ADDQ above.
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back

// Copies the 32 bytes at offset R13-32 using aligned stores, advances R13
// by 32 and repeats while R9 >= R13.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail (unaligned), then advance CX to the end
	// of the copied data.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

// Pending literals are written; extend the repeat match forward.
emit_literal_done_repeat_emit_encodeBlockAsm8B:
	// The repeat check compared the 4 bytes at DX+1, so DX += 5 is the first
	// byte not yet compared; SI = DX - 16(SP) is the position the offset
	// points back to.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	// R9 = src_len - DX: number of bytes left to compare.
	MOVQ src_len+32(FP), R9
	SUBL DX, R9
	// R10 = BX+DX and SI = BX+SI: the two byte sequences to compare.
	LEAQ (BX)(DX*1), R10
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R12 accumulates the number of equal leading bytes of (R10) and (SI).
	XORL R12, R12

// While at least 16 bytes remain, compare two quadwords per iteration.
matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
	CMPL R9, $0x10
	JB   matchlen_match8_repeat_extend_encodeBlockAsm8B
	MOVQ (R10)(R12*1), R11
	MOVQ 8(R10)(R12*1), R13
	// A non-zero XOR result means the quadwords differ.
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
	XORQ 8(SI)(R12*1), R13
	JNZ  matchlen_bsf_16repeat_extend_encodeBlockAsm8B
	LEAL -16(R9), R9
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_repeat_extend_encodeBlockAsm8B

// Mismatch in the second quadword: R12 += 8 + (index of lowest set bit in R13) / 8.
matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm8B

// Fewer than 16 bytes remain: compare one quadword if at least 8 are left.
matchlen_match8_repeat_extend_encodeBlockAsm8B:
	CMPL R9, $0x08
	JB   matchlen_match4_repeat_extend_encodeBlockAsm8B
	MOVQ (R10)(R12*1), R11
	XORQ (SI)(R12*1), R11
	JNZ  matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	JMP  matchlen_match4_repeat_extend_encodeBlockAsm8B

// Mismatch inside the quadword in R11: R12 += (index of lowest set bit) / 8.
matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  repeat_extend_forward_end_encodeBlockAsm8B

// Compare 4 bytes if at least 4 remain; on a mismatch fall back to the
// 2-byte and 1-byte checks.
matchlen_match4_repeat_extend_encodeBlockAsm8B:
	CMPL R9, $0x04
	JB   matchlen_match2_repeat_extend_encodeBlockAsm8B
	MOVL (R10)(R12*1), R11
	CMPL (SI)(R12*1), R11
	JNE  matchlen_match2_repeat_extend_encodeBlockAsm8B
	LEAL -4(R9), R9
	LEAL 4(R12), R12

// R9 == 1 goes straight to the byte compare, R9 == 0 ends the match;
// otherwise compare 2 bytes.
matchlen_match2_repeat_extend_encodeBlockAsm8B:
	CMPL R9, $0x01
	JE   matchlen_match1_repeat_extend_encodeBlockAsm8B
	JB   repeat_extend_forward_end_encodeBlockAsm8B
	MOVW (R10)(R12*1), R11
	CMPW (SI)(R12*1), R11
	// The words differ, but their first byte may still be equal.
	JNE  matchlen_match1_repeat_extend_encodeBlockAsm8B
	LEAL 2(R12), R12
	SUBL $0x02, R9
	JZ   repeat_extend_forward_end_encodeBlockAsm8B

// Final single byte compare.
matchlen_match1_repeat_extend_encodeBlockAsm8B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm8B
	LEAL 1(R12), R12

// The forward scan is finished; R12 holds the number of bytes it matched.
repeat_extend_forward_end_encodeBlockAsm8B:
	// Advance the cursor past the matched bytes.
	ADDL  R12, DX
	// SI = DX - DI. DI is set before this section.
	// NOTE(review): DI presumably marks where the repeat started, making SI the
	// total repeat length - confirm against the code above this chunk.
	MOVL  DX, SI
	SUBL  DI, SI
	// DI = distance stored in 16(SP); only the repeat_as_copy path uses it,
	// because the emitRepeat path below overwrites DI immediately.
	MOVL  16(SP), DI
	// R8 == 0 selects the copy encoding instead of the repeat encoding.
	// NOTE(review): R8 is loaded before this section; its meaning is not visible here.
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm8B

	// emitRepeat: DI keeps the length, SI becomes length-4.
	MOVL SI, DI
	LEAL -4(SI), SI
	// length <= 8: two-byte form.
	CMPL DI, $0x08
	JBE  repeat_two_match_repeat_encodeBlockAsm8B
	// The JAE target is the very next label, so both outcomes of this
	// comparison continue at the same instruction.
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B

cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
	// length-4 < 0x104: three-byte form. Otherwise four-byte form:
	// tag word 0x0019 followed by the 16-bit value (length-4)-256.
	CMPL SI, $0x00000104
	JB   repeat_three_match_repeat_encodeBlockAsm8B
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

// Three-byte form: tag word 0x0015 followed by the byte (length-4)-4.
repeat_three_match_repeat_encodeBlockAsm8B:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

// Two-byte form: the 16-bit value ((length-4) << 2) | 1.
repeat_two_match_repeat_encodeBlockAsm8B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B
	// The instructions from here to the next label follow an unconditional JMP
	// and carry no label, so they cannot be reached by fallthrough.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_as_copy_encodeBlockAsm8B:
	// emitCopy
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
	CMPL DI, $0x00000800
	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm8B
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, SI

	// emitRepeat
	LEAL -4(SI), SI
	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	MOVL SI, DI
	LEAL -4(SI), SI
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

long_offset_short_repeat_as_copy_encodeBlockAsm8B:
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX

	// emitRepeat
	MOVL SI, DI
	LEAL -4(SI), SI
	CMPL DI, $0x08
	JBE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	CMPL DI, $0x0c
	JAE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	CMPL SI, $0x00000104
	JB   repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (CX)
	MOVW SI, 2(CX)
	ADDQ $0x04, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (CX)
	MOVB SI, 2(CX)
	ADDQ $0x03, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(CX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeBlockAsm8B

emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

// A repeat has been written: record the cursor in 12(SP) and resume searching.
// 12(SP) is the position up to which input has been emitted; the literal
// emitters read it as the start of the pending literal run.
repeat_end_emit_encodeBlockAsm8B:
	MOVL DX, 12(SP)
	JMP  search_loop_encodeBlockAsm8B

// No repeat at this position. Test up to three table candidates against the
// bytes held in DI, shifting DI right by one byte between tests.
// NOTE(review): SI, R8, R10 and DI are prepared before this section
// (presumably two candidates, a table index and the 8 source bytes at DX).
no_repeat_found_encodeBlockAsm8B:
	// Candidate 1: the 4 bytes at BX+SI against the low 4 bytes of DI.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm8B
	SHRQ $0x08, DI
	// Fetch the third candidate from the table slot R10; R9 = DX+2 is the
	// position that will replace it in that slot.
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	// Candidate 2: the 4 bytes at BX+R8 against DI after the first shift.
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm8B
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	// Candidate 3: the 4 bytes at BX+SI against DI after the second shift.
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm8B
	// Nothing matched: continue from the cursor value saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeBlockAsm8B

// Candidate 3 matched the bytes two positions ahead of DX.
candidate3_match_encodeBlockAsm8B:
	ADDL $0x02, DX
	JMP  candidate_match_encodeBlockAsm8B

// Candidate 2 matched the bytes one position ahead of DX. The table store that
// the fallthrough path performs is done here as well; SI takes the candidate.
candidate2_match_encodeBlockAsm8B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

// A candidate (SI) matches at the cursor (DX). Try to grow the match backwards
// over bytes that have not been emitted yet.
candidate_match_encodeBlockAsm8B:
	// DI = 12(SP), the lower bound for DX. A candidate at position 0 cannot
	// move back any further.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm8B

// While DX > DI and the bytes just before both positions are equal, step both
// positions back by one. Also stops as soon as SI reaches 0.
match_extend_back_loop_encodeBlockAsm8B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm8B
	JMP  match_extend_back_loop_encodeBlockAsm8B

// Output space check: CX + (DX - 12(SP)) + 3 must stay below the pointer kept
// in (SP); otherwise the function returns 0.
// NOTE(review): (SP) is written before this section, presumably a dst limit.
match_extend_back_end_encodeBlockAsm8B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Write the header for the literal run [12(SP), DX) that precedes the match.
// Leaves DI = source pointer of the run and R9 = its length for the copy code.
match_dst_size_check_encodeBlockAsm8B:
	MOVL DX, DI
	MOVL 12(SP), R8
	// Empty run: nothing to emit.
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B
	MOVL DI, R9
	// The run ends at DX, so that becomes the new emitted position.
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	// R8 = length-1 selects the header size.
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeBlockAsm8B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeBlockAsm8B
	// The flags still come from the CMPL above, so this JB cannot be taken once
	// the previous JB fell through; execution continues at the next label.
	JB   three_bytes_match_emit_encodeBlockAsm8B

// Three-byte header: 0xf4 followed by the 16-bit length-1.
three_bytes_match_emit_encodeBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBlockAsm8B

// Two-byte header: 0xf0 followed by the 8-bit length-1. A length-1 below 0x40
// uses the short copy routine, anything larger the long one.
two_bytes_match_emit_encodeBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeBlockAsm8B
	JMP  memmove_long_match_emit_encodeBlockAsm8B

// One-byte header: (length-1) << 2. Falls through into the short copy routine.
one_byte_match_emit_encodeBlockAsm8B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

// Short literal copy: R9 bytes from DI to CX, then CX = CX+R9.
memmove_match_emit_encodeBlockAsm8B:
	// R8 = output position after the literals.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort: dispatch on the length. Every case uses fixed-width
	// loads/stores; where two are used they overlap so that the first and the
	// last R9 bytes are both covered.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64

// R9 <= 8: one 8-byte load/store is issued regardless of R9, so up to 8 bytes
// are read and written even for shorter runs; CX still advances by R9 only.
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

// 9..16 bytes: first 8 and last 8 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B

// 17..32 bytes: first 16 and last 16 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B

// 33..64 bytes: first 32 and last 32 bytes.
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

// Advance the output pointer past the copied literals.
memmove_end_copy_match_emit_encodeBlockAsm8B:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeBlockAsm8B

// Long literal copy: R9 bytes from DI to CX, then CX = CX+R9.
memmove_long_match_emit_encodeBlockAsm8B:
	// R8 = output position after the literals.
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong: the first and last 32 source bytes are loaded into X0-X3
	// up front and stored last with unaligned moves; the loops in between copy
	// 32-byte chunks using aligned stores (MOVOA).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = R9 / 32, R10 = CX mod 32, R12 = 64 - R10. CX+R12-32 is therefore a
	// multiple of 32, which keeps the MOVOA destinations below aligned.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not modify CF; JA is taken when CF and ZF are both clear.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

// Pointer-based chunk loop: copies 32 bytes from R10 to R13 per iteration and
// advances R10, R13 and the offset R12 by 32.
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back

// Offset-based chunk loop: copies the 32 bytes ending at offset R12, then
// advances R12 by 32; repeats while R9 >= R12.
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering whatever the chunk loops did not.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

// A match between the cursor DX and the candidate SI is established (reached
// after literal emission, or directly from the table check at the end of the
// previous match). Record its distance and prepare the forward length scan.
emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
	// 16(SP) = DX - SI, the distance of this match.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	// Step both positions over 4 bytes before scanning further.
	ADDL $0x04, DX
	ADDL $0x04, SI
	// DI = src_len - DX: bytes that may still be compared.
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	// R8 = BX+DX and SI = BX+SI are the two pointers the scan compares.
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeBlockAsm8B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm8B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeBlockAsm8B

matchlen_bsf_16match_nolit_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeBlockAsm8B

matchlen_match8_match_nolit_encodeBlockAsm8B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeBlockAsm8B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeBlockAsm8B

matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeBlockAsm8B

matchlen_match4_match_nolit_encodeBlockAsm8B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeBlockAsm8B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeBlockAsm8B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeBlockAsm8B
	JB   match_nolit_end_encodeBlockAsm8B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeBlockAsm8B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeBlockAsm8B

matchlen_match1_match_nolit_encodeBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm8B
	LEAL 1(R10), R10

// Length scan done, R10 = extra matched bytes. Advance the cursor by R10,
// reload the match distance into SI, set R10 = R10+4 (the scan started 4 bytes
// into the match) and mark everything up to DX as emitted.
match_nolit_end_encodeBlockAsm8B:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
	CMPL SI, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBlockAsm8B
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	MOVL R10, SI
	LEAL -4(R10), R10
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

long_offset_short_match_nolit_encodeBlockAsm8B:
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX

	// emitRepeat
	MOVL R10, SI
	LEAL -4(R10), R10
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (CX)
	MOVW R10, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (CX)
	MOVB R10, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(CX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

two_byte_offset_short_match_nolit_encodeBlockAsm8B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

emit_copy_three_match_nolit_encodeBlockAsm8B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

// The copy for the current match has been written. Decide whether to finish,
// fail for lack of output space, or probe for an immediately following match.
match_nolit_emitcopy_end_encodeBlockAsm8B:
	// Cursor at or beyond the limit kept in 8(SP): emit the remaining literals.
	// NOTE(review): 8(SP) is written before this section, presumably src_len-8.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBlockAsm8B
	// DI = the 8 source bytes starting at DX-2.
	MOVQ -2(BX)(DX*1), DI
	// Output pointer at or beyond the limit kept in (SP): return 0.
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Hash the 4 bytes at DX-2 and the 4 bytes at DX into 8-bit table indexes:
// (value << 32) * 0x9e3779b1 >> 56.
match_nolit_dst_ok_encodeBlockAsm8B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	// After this shift DI starts with the bytes at DX.
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	// R8 = index for the bytes at DX-2.
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	// SI = index for the bytes at DX.
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	LEAL  -2(DX), R9
	// Read the previous entry for the DX hash (the new candidate), then store
	// DX-2 and DX into their slots of the table of 4-byte entries at AX.
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If the candidate's 4 bytes equal the 4 bytes at DX, a new match starts
	// right here with no literals in between.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm8B
	// Otherwise move on by one byte and search again.
	INCL  DX
	JMP   search_loop_encodeBlockAsm8B

// Emit everything from 12(SP) to the end of the input as one literal run.
// First make sure it fits: CX + (src_len - 12(SP)) + 3 must stay below the
// pointer kept in (SP); otherwise return 0.
emit_remainder_encodeBlockAsm8B:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Write the literal header. From here on AX is reused as the source pointer of
// the run and SI holds its length.
emit_remainder_ok_encodeBlockAsm8B:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	// Nothing pending: go straight to the return sequence.
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	// DX = length-1 selects the header size.
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBlockAsm8B
	// Same flags as the JB above, so this branch cannot be taken here;
	// execution continues at the next label either way.
	JB   three_bytes_emit_remainder_encodeBlockAsm8B

// Three-byte header: 0xf4 followed by the 16-bit length-1.
three_bytes_emit_remainder_encodeBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

// Two-byte header: 0xf0 followed by the 8-bit length-1. A length-1 below 0x40
// uses the short copy routine, anything larger the long one.
two_bytes_emit_remainder_encodeBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

// One-byte header: (length-1) << 2. Falls through into the short copy routine.
one_byte_emit_remainder_encodeBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

// Short copy of the final literal run: SI bytes from AX to CX.
memmove_emit_remainder_encodeBlockAsm8B:
	// DX = output position after the run. BX is reused as the length from here
	// on, so it no longer serves as the base for source loads.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort: dispatch on the exact length. Unlike the in-loop
	// variant, small sizes get their own cases here, so no case reads or
	// writes outside the BX bytes of the run.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64

// Fewer than 3 bytes: first byte and last byte (the same byte when BX is 1).
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

// Exactly 3 bytes: one 2-byte move plus one 1-byte move.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

// 4..7 bytes: first 4 and last 4 bytes, overlapping.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

// 8..16 bytes: first 8 and last 8 bytes, overlapping.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

// 17..32 bytes: first 16 and last 16 bytes, overlapping.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B

// 33..64 bytes: first 32 and last 32 bytes, overlapping.
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

// Advance the output pointer past the copied run.
memmove_end_copy_emit_remainder_encodeBlockAsm8B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B

memmove_long_emit_remainder_encodeBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

// Successful exit: the return value is the number of bytes written, computed
// as the final output pointer (CX) minus dst_base.
emit_literal_done_emit_remainder_encodeBlockAsm8B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
// Requires: BMI, SSE2
//
// Prologue: clears the tmp table and initialises the working state.
// Registers after the prologue:
//   AX = tmp, BX = src_base, CX = dst write pointer, DX = cursor (starts at 1)
// Stack slots (24-byte frame):
//   (SP)   dst_base + (src_len - 6 - src_len/32), subtraction done in 32 bits;
//          compared against the output pointer before data is written
//   8(SP)  src_len - 8; the search loop stops once its next position reaches it
//   12(SP) position up to which input has been emitted (starts at 0)
//   16(SP) last match distance (starts at 0)
//   20(SP) next search position, written by the search loop
TEXT ·encodeBetterBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x1200 iterations of 0x80 bytes each = 589824 bytes, the whole of tmp.
	MOVQ $0x00001200, DX
	MOVQ AX, BX
	PXOR X0, X0

// Zero 128 bytes of tmp per iteration with eight 16-byte stores.
zero_loop_encodeBetterBlockAsm:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBetterBlockAsm
	// Nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -6(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	// BX = src_len - 6 - (src_len >> 5), then turned into a pointer into dst.
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Searching starts at position 1 with no previous match distance.
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

// One search step at cursor DX. The next position to try grows with the
// distance since the last emitted byte: DX + 1 + ((DX - 12(SP)) >> 7), with
// the step capped at 100 once the shifted distance exceeds 99.
search_loop_encodeBetterBlockAsm:
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JBE  check_maxskip_ok_encodeBetterBlockAsm
	LEAL 100(DX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm

check_maxskip_ok_encodeBetterBlockAsm:
	LEAL 1(DX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm:
	// Next position at or beyond 8(SP): stop searching, emit the remainder.
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm
	// DI = the 8 source bytes at the cursor; 20(SP) = next position.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = 17-bit index from the low 7 bytes: (DI << 8) * R9 >> 47.
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	// R11 = 14-bit index from the low 4 bytes: (DI << 32) * SI >> 50.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	// Two tables of 4-byte positions live in tmp: the first at offset 0
	// (indexed by R10), the second at offset 524288 (indexed by R11).
	// Read both candidates (SI, R8), then store the cursor in both slots.
	MOVL  (AX)(R10*4), SI
	MOVL  524288(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 524288(AX)(R11*4)
	// Load 8 bytes at each candidate and prefer a full 8-byte agreement,
	// first-table candidate first.
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeBetterBlockAsm
	CMPQ  R11, DI
	JNE   no_short_found_encodeBetterBlockAsm
	MOVL  R8, SI
	JMP   candidate_match_encodeBetterBlockAsm

// No 8-byte agreement: accept a 4-byte agreement instead. The first-table
// candidate is taken as is; the second-table candidate gets an extra check in
// candidateS. With no agreement at all, continue at the saved next position.
no_short_found_encodeBetterBlockAsm:
	CMPL R10, DI
	JEQ  candidate_match_encodeBetterBlockAsm
	CMPL R11, DI
	JEQ  candidateS_match_encodeBetterBlockAsm
	MOVL 20(SP), DX
	JMP  search_loop_encodeBetterBlockAsm

// Only the second-table candidate (R8) agreed on 4 bytes. Before using it,
// look up the first table for the position one byte ahead: if that entry
// matches 4 bytes there, the match starts at DX+1 with that candidate.
candidateS_match_encodeBetterBlockAsm:
	// DI now starts with the bytes at DX+1; R9 still holds the multiplier
	// loaded in the search loop.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	// SI = previous table entry; the slot is overwritten with DX+1.
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm
	// No luck: restore the cursor and use the second-table candidate.
	DECL  DX
	MOVL  R8, SI

// A candidate (SI) matches at the cursor (DX). Try to grow the match backwards
// over bytes that have not been emitted yet.
candidate_match_encodeBetterBlockAsm:
	// DI = 12(SP), the lower bound for DX. A candidate at position 0 cannot
	// move back any further.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm

// While DX > DI and the bytes just before both positions are equal, step both
// positions back by one. Also stops as soon as SI reaches 0.
match_extend_back_loop_encodeBetterBlockAsm:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBetterBlockAsm
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBetterBlockAsm
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm
	JMP  match_extend_back_loop_encodeBetterBlockAsm

// Output space check: CX + (DX - 12(SP)) + 5 must stay below the limit kept in
// (SP); otherwise the function returns 0.
match_extend_back_end_encodeBetterBlockAsm:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 5(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm

matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm

matchlen_match8_match_nolit_encodeBetterBlockAsm:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm

matchlen_match4_match_nolit_encodeBetterBlockAsm:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm:
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm
	JB   match_nolit_end_encodeBetterBlockAsm
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeBetterBlockAsm

matchlen_match1_match_nolit_encodeBetterBlockAsm:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm:
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm
	CMPL R12, $0x01
	JA   match_length_ok_encodeBetterBlockAsm
	CMPL R8, $0x0000ffff
	JBE  match_length_ok_encodeBetterBlockAsm
	MOVL 20(SP), DX
	INCL DX
	JMP  search_loop_encodeBetterBlockAsm

match_length_ok_encodeBetterBlockAsm:
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_encodeBetterBlockAsm
	CMPL SI, $0x01000000
	JB   four_bytes_match_emit_encodeBetterBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

four_bytes_match_emit_encodeBetterBlockAsm:
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

three_bytes_match_emit_encodeBetterBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

two_bytes_match_emit_encodeBetterBlockAsm:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_encodeBetterBlockAsm

one_byte_match_emit_encodeBetterBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm

emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm

memmove_long_match_emit_encodeBetterBlockAsm:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	CMPL R8, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBetterBlockAsm
	CMPL R12, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
	MOVB $0xff, (CX)
	MOVL R8, 1(CX)
	LEAL -64(R12), R12
	ADDQ $0x05, CX
	CMPL R12, $0x04
	JB   four_bytes_remain_match_nolit_encodeBetterBlockAsm

	// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
	CMPL R12, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
	LEAL -16842747(R12), R12
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

four_bytes_remain_match_nolit_encodeBetterBlockAsm:
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
	XORL  SI, SI
	LEAL  -1(SI)(R12*4), R12
	MOVB  R12, (CX)
	MOVL  R8, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_match_nolit_encodeBetterBlockAsm:
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
	CMPL R8, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	MOVL R8, R9
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	LEAL -4(R12), R12
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	CMPL R12, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
	LEAL -16842747(R12), R12
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

long_offset_short_match_nolit_encodeBetterBlockAsm:
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
	CMPL R12, $0x0100ffff
	JB   repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
	LEAL -16842747(R12), R12
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

emit_copy_three_match_nolit_encodeBetterBlockAsm:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

match_is_repeat_encodeBetterBlockAsm:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x01000000
	JB   four_bytes_match_emit_repeat_encodeBetterBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

four_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

three_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

two_bytes_match_emit_repeat_encodeBetterBlockAsm:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm

one_byte_match_emit_repeat_encodeBetterBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm

memmove_long_match_emit_repeat_encodeBetterBlockAsm:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_repeat_encodeBetterBlockAsm
	CMPL R12, $0x0100ffff
	JB   repeat_five_match_nolit_repeat_encodeBetterBlockAsm
	LEAL -16842747(R12), R12
	MOVL $0xfffb001d, (CX)
	MOVB $0xff, 4(CX)
	ADDQ $0x05, CX
	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm

repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm:
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R12
	IMULQ SI, R12
	SHRQ  $0x2f, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x32, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 524288(AX)(R11*4)
	MOVL  R14, 524288(AX)(R13*4)
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeBetterBlockAsm:
	CMPQ  R8, R9
	JAE   search_loop_encodeBetterBlockAsm
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x08, R11
	IMULQ SI, R11
	SHRQ  $0x2f, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeBetterBlockAsm

emit_remainder_encodeBetterBlockAsm:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 5(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeBetterBlockAsm
	MOVB $0xfc, (CX)
	MOVL DX, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

four_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

three_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

two_bytes_emit_remainder_encodeBetterBlockAsm:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm

one_byte_emit_remainder_encodeBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm

memmove_long_emit_remainder_encodeBetterBlockAsm:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm4MB(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00001200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm4MB:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBetterBlockAsm4MB
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -6(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBetterBlockAsm4MB:
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JBE  check_maxskip_ok_encodeBetterBlockAsm4MB
	LEAL 100(DX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm4MB

check_maxskip_ok_encodeBetterBlockAsm4MB:
	LEAL 1(DX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm4MB:
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm4MB
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	MOVL  (AX)(R10*4), SI
	MOVL  524288(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 524288(AX)(R11*4)
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	CMPQ  R11, DI
	JNE   no_short_found_encodeBetterBlockAsm4MB
	MOVL  R8, SI
	JMP   candidate_match_encodeBetterBlockAsm4MB

no_short_found_encodeBetterBlockAsm4MB:
	CMPL R10, DI
	JEQ  candidate_match_encodeBetterBlockAsm4MB
	CMPL R11, DI
	JEQ  candidateS_match_encodeBetterBlockAsm4MB
	MOVL 20(SP), DX
	JMP  search_loop_encodeBetterBlockAsm4MB

candidateS_match_encodeBetterBlockAsm4MB:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm4MB:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm4MB

match_extend_back_loop_encodeBetterBlockAsm4MB:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBetterBlockAsm4MB
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm4MB
	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB

match_extend_back_end_encodeBetterBlockAsm4MB:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 4(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm4MB:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm4MB

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	JB   match_nolit_end_encodeBetterBlockAsm4MB
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeBetterBlockAsm4MB

matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm4MB
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm4MB:
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x01
	JA   match_length_ok_encodeBetterBlockAsm4MB
	CMPL R8, $0x0000ffff
	JBE  match_length_ok_encodeBetterBlockAsm4MB
	MOVL 20(SP), DX
	INCL DX
	JMP  search_loop_encodeBetterBlockAsm4MB

match_length_ok_encodeBetterBlockAsm4MB:
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

three_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

two_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

one_byte_match_emit_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB

memmove_long_match_emit_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	CMPL R8, $0x00010000
	JB   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
	CMPL R12, $0x40
	JBE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
	MOVB $0xff, (CX)
	MOVL R8, 1(CX)
	LEAL -64(R12), R12
	ADDQ $0x05, CX
	CMPL R12, $0x04
	JB   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	XORL  SI, SI
	LEAL  -1(SI)(R12*4), R12
	MOVB  R12, (CX)
	MOVL  R8, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm4MB
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	LEAL -4(R12), R12
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

match_is_repeat_encodeBetterBlockAsm4MB:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB

memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x00010100
	JB   repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (CX)
	MOVW R12, 2(CX)
	SARL $0x10, R8
	MOVB R8, 4(CX)
	ADDQ $0x05, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm4MB
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R12
	IMULQ SI, R12
	SHRQ  $0x2f, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x32, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 524288(AX)(R11*4)
	MOVL  R14, 524288(AX)(R13*4)
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeBetterBlockAsm4MB:
	CMPQ  R8, R9
	JAE   search_loop_encodeBetterBlockAsm4MB
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x08, R11
	IMULQ SI, R11
	SHRQ  $0x2f, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeBetterBlockAsm4MB

emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 4(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm4MB
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB

memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst using two hash tables stored in tmp and returns the
// number of bytes written. It returns 0 whenever an output bound check
// against the limit kept in (SP) fails.
//
// Register roles set up in the prologue and kept for the main loop:
//   AX = tmp (base of both tables)
//   BX = src base pointer
//   CX = current write pointer into dst
//   DX = current position in src
//
// Layout of tmp as used below:
//   [0, 65536)      table indexed by a 14-bit hash of 6 input bytes
//   [65536, 81920)  table indexed by a 12-bit hash of 4 input bytes
//
// Stack slots (24-byte frame):
//   (SP)   dst limit: dst_base + (len(src) - 6 - len(src)>>5)
//   8(SP)  len(src) - 8, upper bound for search positions
//   12(SP) src index of the first byte that has not been emitted yet
//   16(SP) offset of the previous match (compared to detect repeats)
//   20(SP) next search position to use when the current one yields no match
TEXT ·encodeBetterBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x280 iterations of 128 bytes each = 81920 bytes: clears all of tmp.
	MOVQ $0x00000280, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBetterBlockAsm12B
	// Nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -6(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	// BX = len(src) - 6 - len(src)>>5; added to dst_base it becomes the dst limit.
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Searching starts at src position 1 with no previous offset.
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBetterBlockAsm12B:
	// SI = DX + 1 + (DX - 12(SP))>>6: the step grows with the amount of
	// pending, not yet emitted input.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm12B
	// DI = 8 bytes of src at the current position.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10: shift out the top 2 bytes so only the low 6 bytes are hashed;
	// the final shift by 50 keeps 14 bits.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// R11: only the low 4 bytes are hashed; the final shift by 52 keeps 12 bits.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	// Fetch the previous positions (SI from the 6-byte table, R8 from the
	// 4-byte table) and record the current position in both tables.
	MOVL  (AX)(R10*4), SI
	MOVL  65536(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 65536(AX)(R11*4)
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	// Prefer a candidate whose full 8 bytes equal the current 8 bytes.
	CMPQ  R10, DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	CMPQ  R11, DI
	JNE   no_short_found_encodeBetterBlockAsm12B
	MOVL  R8, SI
	JMP   candidate_match_encodeBetterBlockAsm12B

no_short_found_encodeBetterBlockAsm12B:
	// Neither candidate matched 8 bytes; fall back to comparing the low 4 bytes.
	CMPL R10, DI
	JEQ  candidate_match_encodeBetterBlockAsm12B
	CMPL R11, DI
	JEQ  candidateS_match_encodeBetterBlockAsm12B
	// No match at this position: continue from the precomputed next position.
	MOVL 20(SP), DX
	JMP  search_loop_encodeBetterBlockAsm12B

candidateS_match_encodeBetterBlockAsm12B:
	// The 4-byte-table candidate matched 4 bytes. Before using it, look up
	// the 6-byte table for position DX+1 (DI>>8 holds the bytes starting there).
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	// That lookup did not match 4 bytes: undo the increment and use the
	// 4-byte-table candidate saved in R8.
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm12B:
	// DX = current position, SI = candidate position. Extend the match
	// backwards, but not below 12(SP) and not below candidate index 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm12B

match_extend_back_loop_encodeBetterBlockAsm12B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBetterBlockAsm12B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBetterBlockAsm12B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeBetterBlockAsm12B

match_extend_back_end_encodeBetterBlockAsm12B:
	// Bound check: CX + pending literal length + 3 must stay below the dst
	// limit, otherwise give up and return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm12B:
	// DI keeps the match start. Both positions advance by 4 before the
	// forward comparison; R8 = bytes of src left after the new DX.
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 counts how many bytes at R9 and R10 are equal, limited by R8.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
	// Compare 16 bytes per iteration while at least 16 remain.
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm12B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B

// Difference found in the second 8 bytes: index of the lowest set bit of the
// XOR, divided by 8, is the number of equal bytes within that word.
matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm12B

matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm12B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm12B

// Difference found in the first 8 bytes compared: same bit-index / 8 computation.
matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm12B

matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm12B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm12B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
	// One CMPL serves both branches: exactly 1 byte left goes to the byte
	// compare, 0 bytes left ends the match.
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm12B
	JB   match_nolit_end_encodeBetterBlockAsm12B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm12B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeBetterBlockAsm12B

matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm12B
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm12B:
	// R8 = match offset (current position minus candidate position).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm12B
	// New offset: remember it, then emit the pending literals
	// src[12(SP) : DI] (nothing to emit when both indices are equal).
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
	// R9 = literal length, R10 = literal source pointer, SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm12B
	// The flags are unchanged since the CMPL above, so this second JB is
	// never taken; execution falls through into the three-byte header.
	JB   three_bytes_match_emit_encodeBetterBlockAsm12B

three_bytes_match_emit_encodeBetterBlockAsm12B:
	// Header: tag 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

two_bytes_match_emit_encodeBetterBlockAsm12B:
	// Header: tag 0xf0 followed by length-1 as one byte.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// Up to 64 bytes use the short copy, anything longer the long copy.
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

one_byte_match_emit_encodeBetterBlockAsm12B:
	// Header: single tag byte (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literals; it becomes CX once the copy is done.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Lengths of 4 or less share the 4-byte path, which always moves a full
	// 4 bytes; CX still advances by R9 only.
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

// The following cases each copy the head and the tail of the range with two
// (or four) possibly overlapping moves, loading everything before storing.
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B

memmove_long_match_emit_encodeBetterBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// X0/X1 hold the first 32 source bytes and X2/X3 the last 32; they are
	// stored with unaligned moves after the loop, so the loop itself can use
	// aligned stores on the middle part.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = number of 32-byte chunks. R14 = 64 - (CX & 31), which makes
	// CX + R14 - 32 a 32-byte aligned address.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): JA/JNA also test CF, which DECQ leaves untouched (it
	// still holds the result of the SUBQ above) - confirm the intended loop
	// conditions against the generator.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R14-32 until R14 passes the length R9.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the write pointer.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm12B:
	// DX moves to the end of the match, R12 becomes the full match length
	// (the counted bytes plus the 4 skipped before matchLen), and everything
	// up to DX is now accounted for as emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// R12 = match length, R8 = match offset.
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm12B
	// Length > 64 and offset < 2048: emit a 2-byte copy whose tag is
	// 17 | (offset>>8)<<5 followed by the low offset byte. 8 bytes of the
	// length are consumed by it; the rest is emitted as a repeat.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	LEAL -4(R12), R12
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	// The instructions from here to the next label follow an unconditional
	// JMP and carry no label, so they are never executed.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// R12 = remaining length - 4. Below 260 use the 3-byte form, otherwise
	// the 4-byte form: word 0x0019 followed by (R12 - 256) as a 16-bit value.
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	// 3-byte form: word 0x0015 followed by (R12 - 4) as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

// This label and the next one are referenced only from the unreachable
// instructions after the JMP above.
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

long_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// Length > 64 and offset >= 2048: emit tag 0xee plus the 16-bit offset.
	// R12 drops by 60 here and by 4 more below, 64 in total; the remainder is
	// emitted as a repeat.
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2-byte form: 16-bit value (R12<<2)|1, so the second byte is zero for
	// the lengths that reach this path (at most 8).
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2-byte form carrying the offset: tag (1 + R12*4) | (offset>>8)<<5,
	// followed by the low offset byte.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// Length <= 64. SI = length<<2 is the basis for the tag byte.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	// Length < 12 and offset < 2048: 2 bytes. The tag is
	// (length<<2) - 15 = ((length-4)<<2) + 1, OR'ed with (offset>>8)<<5.
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
	// 3 bytes: tag (length<<2) - 2 = ((length-1)<<2) + 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

match_is_repeat_encodeBetterBlockAsm12B:
	// Offset equals the previous one. Emit the pending literals
	// src[12(SP) : DI] the same way as in the non-repeat path, then encode
	// the whole match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
	// R9 = literal length, R10 = literal source pointer, SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
	// Same flags as the JB above, so this branch is never taken; the
	// three-byte header is reached by falling through.
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm12B

three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literals.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// As in the non-repeat path, lengths of 4 or less move a full 4 bytes.
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B

memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Head (X0/X1) and tail (X2/X3) are saved first and stored unaligned at
	// the end; the loop in between uses aligned 32-byte stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
	// DX moves to the end of the match and R12 becomes the full match length.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4. Pick the shortest form the length and
	// offset allow.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
	// Past the search limit: flush what is left. Otherwise re-check the dst
	// bound and return 0 if the write pointer reached it.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm12B
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm12B:
	// Add positions inside the match to the tables. DI still holds the match
	// start and DX its end: index DI+1 and DX-2 in the 6-byte table, and the
	// positions one byte after each in the 4-byte table.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x32, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x34, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 65536(AX)(R11*4)
	MOVL  R14, 65536(AX)(R13*4)
	// R8 = midpoint of the two indexed positions; DI and R9 step inwards by one.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeBetterBlockAsm12B:
	// Index every second position in the 6-byte table, walking DI and R8
	// forward in parallel until R8 reaches R9, then resume searching.
	CMPQ  R8, R9
	JAE   search_loop_encodeBetterBlockAsm12B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeBetterBlockAsm12B

emit_remainder_encodeBetterBlockAsm12B:
	// Bound check for the trailing literals: CX + (len(src) - 12(SP)) + 3
	// must stay below the dst limit, otherwise return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm12B:
	// Emit src[12(SP) : len(src)] as literals (nothing when already at the end).
	// From here on AX = literal source pointer, SI = length, DX = length - 1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm12B
	// Never taken (flags unchanged since the CMPL); falls through below.
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm12B

three_bytes_emit_remainder_encodeBetterBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

two_bytes_emit_remainder_encodeBetterBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

one_byte_emit_remainder_encodeBetterBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm12B:
	// DX = dst pointer after the literals, BX = length. BX no longer holds
	// the src base from here on.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, lengths 1-3 have exact-size paths here, so
	// no bytes beyond the requested length are read or written.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
	// First and last byte; for length 1 both moves touch the same byte.
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B

memmove_long_emit_remainder_encodeBetterBlockAsm12B:
	// DX = dst pointer after the literals, BX = length (src base is dropped).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the in-loop long copy: save head and tail, run the
	// aligned-store loop over the middle, then store head and tail unaligned.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
	// Return the number of bytes written: final write pointer minus dst base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and stores in ret the number of bytes written to dst.
// ret is set to 0 instead whenever the output pointer would reach the dst
// limit computed in the prologue.
//
// tmp is cleared on entry and then used as two tables of 4-byte src indices:
//   tmp+0      indexed by a 12-bit hash of 6 input bytes
//   tmp+16384  indexed by a 10-bit hash of 4 input bytes
//
// Register roles in the main loop:
//   AX = tmp, BX = src base, CX = dst write pointer, DX = current src index
//
// Stack slots (24-byte frame):
//   0(SP)   dst limit (pointer, 8 bytes)
//   8(SP)   src_len-8, the limit for search positions
//   12(SP)  src index of the first byte not yet emitted
//   16(SP)  offset of the previous match (initialised to 0)
//   20(SP)  src index to continue from when the current position gives no match
TEXT ·encodeBetterBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX

	// Clear tmp: 0xa0 (160) iterations of 128 bytes = 20480 bytes.
	MOVQ $0x000000a0, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBetterBlockAsm10B

	// Nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX

	// 8(SP) = src_len-8; (SP) = dst + (src_len - 6 - src_len>>5).
	LEAQ  -6(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)

	// Start searching at src index 1 with no previous offset.
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBetterBlockAsm10B:
	// SI = DX + 1 + (DX - 12(SP))>>5: the step grows with the distance from
	// the last emitted byte. Stop searching once SI reaches 8(SP).
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm10B

	// DI = 8 bytes at src[DX]; remember the next position in 20(SP).
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11

	// R10 = 12-bit hash: the shift by 16 drops the two highest bytes, so only
	// the low 6 bytes of DI contribute.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10

	// R11 = 10-bit hash of the low 4 bytes of DI.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11

	// Fetch the previous index from each table (SI, R8) and store DX in both.
	MOVL  (AX)(R10*4), SI
	MOVL  16384(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 16384(AX)(R11*4)

	// Compare all 8 bytes at each candidate with the current 8 bytes.
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	CMPQ  R11, DI
	JNE   no_short_found_encodeBetterBlockAsm10B
	MOVL  R8, SI
	JMP   candidate_match_encodeBetterBlockAsm10B

no_short_found_encodeBetterBlockAsm10B:
	// No 8-byte match: fall back to comparing only the low 4 bytes.
	CMPL R10, DI
	JEQ  candidate_match_encodeBetterBlockAsm10B
	CMPL R11, DI
	JEQ  candidateS_match_encodeBetterBlockAsm10B

	// Neither candidate matches; continue from the saved next position.
	MOVL 20(SP), DX
	JMP  search_loop_encodeBetterBlockAsm10B

candidateS_match_encodeBetterBlockAsm10B:
	// Only the tmp+16384 candidate (R8) matched 4 bytes. Shift DI down one
	// byte so it holds the bytes at src[DX+1], look that up in the first
	// table, and store DX+1 there.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm10B

	// That lookup did not match 4 bytes: restore DX and use candidate R8.
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm10B:
	// SI = candidate index, DX = current index. Extend the match backwards
	// while the preceding bytes are equal, DX stays above 12(SP) and SI is
	// non-zero.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm10B

match_extend_back_loop_encodeBetterBlockAsm10B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBetterBlockAsm10B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBetterBlockAsm10B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeBetterBlockAsm10B

match_extend_back_end_encodeBetterBlockAsm10B:
	// Return 0 unless CX + 3 + (DX - 12(SP)) is below the dst limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm10B:
	// DI = match start. Skip 4 bytes on both sides, then count further equal
	// bytes: R8 = bytes left in src after DX, R9 = &src[DX], R10 = &src[SI].
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 accumulates the number of equal bytes.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:
	// Compare 16 bytes per iteration while at least 16 remain.
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:
	// Mismatch in the second 8 bytes: index of the lowest differing bit, /8,
	// gives the number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm10B

matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm10B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:
	// Mismatch inside an 8-byte word: add the count of equal low bytes.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm10B

matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
	// Try a 4-byte compare.
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
	// Exactly 1 byte left goes to the byte compare; 0 left finishes; otherwise
	// try a 2-byte compare.
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	JB   match_nolit_end_encodeBetterBlockAsm10B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeBetterBlockAsm10B

matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm10B
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm10B:
	// R8 = match offset (DX - SI).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// Same offset as the previous match takes the repeat path; otherwise
	// record the new offset.
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm10B
	MOVL R8, 16(SP)

	// Emit src[12(SP):DI] as a literal run, if non-empty.
	// R9 = run length, R10 = &src[run start], SI = length-1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI

	// Header size depends on length-1: below 60 takes one byte, below 256
	// takes two, anything larger takes three.
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm10B

	// NOTE(review): this JB tests the same flags as the JB above, which was
	// not taken, so it is never taken either; execution falls through to the
	// label it names.
	JB   three_bytes_match_emit_encodeBetterBlockAsm10B

three_bytes_match_emit_encodeBetterBlockAsm10B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

two_bytes_match_emit_encodeBetterBlockAsm10B:
	// Header: byte 0xf0 followed by length-1 as one byte. Runs with
	// length-1 below 64 use the short copy, longer ones the long copy.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

one_byte_match_emit_encodeBetterBlockAsm10B:
	// Header: single byte (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from R10 to CX, dispatching on size. The ranged cases
	// copy the head and the tail of the run with two possibly overlapping
	// moves.
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
	// Lengths up to 4 always move 4 bytes; CX still advances by R9 only.
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B

memmove_long_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Keep the first and last 32 bytes of the run in X0..X3; they are stored
	// last with unaligned moves. R13 = R9/32. R14 = 64 - (CX & 31), so
	// CX+R14 is a multiple of 32 and the MOVOA stores below are aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14

	// DECQ leaves CF unchanged, so JA combines ZF from DECQ with CF from the
	// SUBQ above.
	// NOTE(review): R11 is at most 31, so that SUBQ cannot borrow and the jump
	// looks to be taken whenever R13-1 is non-zero - confirm.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
	// Copy 32 bytes from R11 to R15 and advance both pointers and R14.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 per iteration while R9 >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32

	// Store the saved head and tail, then advance CX past the run.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// DX = src index after the match; R12 = total match length (the 4 bytes
	// skipped before matchLen plus the counted bytes). Everything before DX
	// is now emitted.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// R12 = length, R8 = offset. Lengths up to 64 go to the short forms;
	// longer ones pick a first copy by whether the offset is below 2048.
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm10B

	// Offset below 2048: write a 2-byte element, first byte
	// 17 | (offset>>8)<<5, second byte the low 8 offset bits. R12 drops by 8.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	// The remaining length goes out through the 3- or 4-byte repeat forms.
	LEAL -4(R12), R12
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b

	// NOTE(review): the instructions from here to the next label follow an
	// unconditional JMP and carry no label, so they are unreachable.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// R12 below 260 uses the 3-byte form; otherwise write the 16-bit value
	// 0x0019 followed by R12-256 as 16 bits.
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 3-byte form: 16-bit value 0x0015 followed by R12-4 as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): the only jump to this label is in the unreachable
	// instructions noted above.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// NOTE(review): the only jump to this label is in the unreachable
	// instructions noted above.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

long_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Offset of 2048 or more: write byte 0xee and the 16-bit offset, and
	// reduce R12 by 60.
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4. Choose the form by
	// length and offset.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// R12 below 260 uses the 3-byte form; otherwise write the 16-bit value
	// 0x0019 followed by R12-256 as 16 bits.
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 3-byte form: 16-bit value 0x0015 followed by R12-4 as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2-byte form: the 16-bit value (R12<<2)|1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2-byte form carrying the offset: first byte
	// (1 + R12*4) | (offset>>8)<<5, second byte the low 8 offset bits.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length up to 64. SI = length<<2. The 2-byte form needs length below 12
	// and offset below 2048; anything else takes the 3-byte form.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B

	// First byte (length<<2 - 15) | (offset>>8)<<5, second byte the low 8
	// offset bits.
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	// First byte length<<2 - 2, followed by the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

match_is_repeat_encodeBetterBlockAsm10B:
	// The offset equals the previous one. Emit src[12(SP):DI] as a literal
	// run, if non-empty, with the same header layout as the non-repeat path.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm10B

	// NOTE(review): this JB tests the same flags as the JB above, which was
	// not taken, so it is never taken either; execution falls through.
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm10B

three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	// Header: byte 0xf0 followed by length-1 as one byte.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	// Header: single byte (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from R10 to CX, dispatching on size.
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
	// Lengths up to 4 always move 4 bytes; CX still advances by R9 only.
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B

memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Keep the first and last 32 bytes of the run in X0..X3; they are stored
	// last with unaligned moves. R14 = 64 - (CX & 31), so CX+R14 is a
	// multiple of 32 and the MOVOA stores below are aligned.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14

	// DECQ leaves CF unchanged, so JA combines ZF from DECQ with CF from the
	// SUBQ above.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
	// Copy 32 bytes from R11 to R15 and advance both pointers and R14.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 per iteration while R9 >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32

	// Store the saved head and tail, then advance CX past the run.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// DX = src index after the match; R12 = total match length.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4. Choose the form by length and offset.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JB   repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	// R12 below 260 uses the 3-byte form; otherwise write the 16-bit value
	// 0x0019 followed by R12-256 as 16 bits.
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 3-byte form: 16-bit value 0x0015 followed by R12-4 as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2-byte form: the 16-bit value (R12<<2)|1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2-byte form carrying the offset: first byte
	// (1 + R12*4) | (offset>>8)<<5, second byte the low 8 offset bits.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// Finish with the remainder if DX reached the search limit; return 0 if
	// CX reached the dst limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm10B
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Add table entries for positions inside the match just emitted.
	// DI = match start + 1, R9 = DX - 2. Both get a 6-byte-hash entry in the
	// first table, and DI+1 and R9+1 get a 4-byte-hash entry in the table at
	// tmp+16384.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x34, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x36, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 16384(AX)(R11*4)
	MOVL  R14, 16384(AX)(R13*4)

	// R8 = (DI + R9 + 1) / 2. The loop below walks two cursors, DI and R8,
	// in steps of 2 until R8 reaches R9.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeBetterBlockAsm10B:
	// Store 6-byte-hash entries for DI and R8 in the first table, then
	// resume searching once R8 >= R9.
	CMPQ  R8, R9
	JAE   search_loop_encodeBetterBlockAsm10B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
	// AX is reused from here on and no longer holds tmp. Return 0 unless
	// CX + 3 + (src_len - 12(SP)) is below the dst limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm10B:
	// Emit src[12(SP):src_len] as a final literal run, if non-empty.
	// SI = run length, AX = &src[run start], DX = length-1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm10B

	// NOTE(review): this JB tests the same flags as the JB above, which was
	// not taken, so it is never taken either; execution falls through.
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm10B

three_bytes_emit_remainder_encodeBetterBlockAsm10B:
	// Header: byte 0xf4 followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	// Header: byte 0xf0 followed by length-1 as one byte.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

one_byte_emit_remainder_encodeBetterBlockAsm10B:
	// Header: single byte (length-1)<<2.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes. BX is reused as the length
	// and no longer holds the src base.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX bytes from AX to CX. Unlike the in-loop copies, this one has
	// exact cases for lengths 1-2 and 3 instead of always moving 4 bytes.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
	// First and last byte; for length 1 they are the same byte.
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B

memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes; BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Keep the first and last 32 bytes of the run in X0..X3; they are stored
	// last with unaligned moves. R8 = 64 - (CX & 31), so CX+R8 is a multiple
	// of 32 and the MOVOA stores below are aligned.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8

	// DECQ leaves CF unchanged, so JA combines ZF from DECQ with CF from the
	// SUBQ above.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	// Copy 32 bytes from SI to R9 and advance both pointers and R8.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R8 per iteration while BX >= R8.
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32

	// Store the saved head and tail, then advance CX past the run.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// ret = number of bytes written (CX - dst_base).
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000028, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeBetterBlockAsm8B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -6(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeBetterBlockAsm8B:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeBetterBlockAsm8B
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x38, R11
	MOVL  (AX)(R10*4), SI
	MOVL  4096(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 4096(AX)(R11*4)
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	CMPQ  R11, DI
	JNE   no_short_found_encodeBetterBlockAsm8B
	MOVL  R8, SI
	JMP   candidate_match_encodeBetterBlockAsm8B

no_short_found_encodeBetterBlockAsm8B:
	CMPL R10, DI
	JEQ  candidate_match_encodeBetterBlockAsm8B
	CMPL R11, DI
	JEQ  candidateS_match_encodeBetterBlockAsm8B
	MOVL 20(SP), DX
	JMP  search_loop_encodeBetterBlockAsm8B

candidateS_match_encodeBetterBlockAsm8B:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm8B
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm8B

match_extend_back_loop_encodeBetterBlockAsm8B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeBetterBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeBetterBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeBetterBlockAsm8B

match_extend_back_end_encodeBetterBlockAsm8B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm8B:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm8B

matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeBetterBlockAsm8B

matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeBetterBlockAsm8B

matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeBetterBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	JB   match_nolit_end_encodeBetterBlockAsm8B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeBetterBlockAsm8B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeBetterBlockAsm8B

matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm8B
	LEAL 1(R12), R12

match_nolit_end_encodeBetterBlockAsm8B:
	// R8 = DX - SI: the distance between the current position and the
	// candidate, i.e. the copy offset.
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset equals the one remembered in 16(SP): take the repeat path.
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm8B

	// New offset: remember it for the next comparison.
	MOVL R8, 16(SP)

	// Bytes in [12(SP), DI) are emitted as a literal run; skip if empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B

	// R9 = run length (DI - SI), R10 = BX + SI (first byte to copy),
	// SI = length - 1 (the value encoded in the header). 12(SP) advances to DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI

	// Header size by length-1: below 60 -> one byte, below 256 -> two bytes,
	// otherwise three bytes.
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeBetterBlockAsm8B

	// The flags are unchanged since the JB above, so this branch is never
	// taken; execution falls through to the label it names.
	JB   three_bytes_match_emit_encodeBetterBlockAsm8B

three_bytes_match_emit_encodeBetterBlockAsm8B:
	// Tag 0xf4 followed by the 16-bit length-1; always uses the long copy.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	// Tag 0xf0 followed by the 8-bit length-1. The short copy is used when
	// length-1 is below 64, the long copy otherwise.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	// Single tag byte (length-1)<<2, then fall through to the short copy that
	// follows.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeBetterBlockAsm8B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (CX)
	MOVL R10, -4(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B

memmove_long_match_emit_encodeBetterBlockAsm8B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	// R12 holds the byte count produced by the matchLen code. Advance DX past
	// those bytes, add 4 to R12 to get the full match length, and record DX
	// in 12(SP).
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// R12 = length, R8 = offset.
	// Lengths up to 64 are handled by the short two-byte-offset path.
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B

	// Offsets of 2048 or more are handled by the long-offset path.
	CMPL R8, $0x00000800
	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm8B

	// Length > 64 and offset < 2048: write a 2-byte op. First byte is
	// 0x11 | (offset>>8)<<5, second byte is the low offset byte. This op
	// accounts for 8 bytes of the length.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	SUBL $0x08, R12

	// emitRepeat
	// The remaining length (minus 4) is always sent to the 3/4-byte repeat
	// encodings below.
	LEAL -4(R12), R12
	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b

	// The next six instructions follow an unconditional JMP and carry no
	// label, so no branch can reach them.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// R12 below 0x104 fits the 3-byte form; otherwise use the 4-byte form:
	// 16-bit word 0x0019 followed by the 16-bit value R12-256.
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// 3-byte form: 16-bit word 0x0015 followed by the byte R12-4.
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
	// 2-byte form: 16-bit word (R12<<2)|1.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

	// The instructions below follow an unconditional JMP and carry no label,
	// so no branch can reach them.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

long_offset_short_match_nolit_encodeBetterBlockAsm8B:
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

match_is_repeat_encodeBetterBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	JB   three_bytes_match_emit_repeat_encodeBetterBlockAsm8B

three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x04
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (CX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (CX)
	MOVL R9, -4(CX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B

memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(CX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(CX)(R13*1)
	MOVOA X5, -16(CX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JBE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
	CMPL R12, $0x00000104
	JB   repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL -256(R12), R12
	MOVW $0x0019, (CX)
	MOVW R12, 2(CX)
	ADDQ $0x04, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	LEAL -4(R12), R12
	MOVW $0x0015, (CX)
	MOVB R12, 2(CX)
	ADDQ $0x03, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(CX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (CX)
	ADDQ $0x02, CX

match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	// DX has reached the position limit in 8(SP): only the tail is left.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeBetterBlockAsm8B

	// The output cursor CX has reached the limit in (SP): store 0 in the
	// return slot and return.
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm8B:
	// SI and R8 are the multipliers of the two hash functions used below.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8

	// Two positions are indexed: DI+1 and DX-2 (kept in DI and R9).
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9

	// Load 8 bytes at each position (R10, R12) and at the byte after each
	// position (R11, R13).
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13

	// R10/R12: ((v << 16) * SI) >> 54 gives a 10-bit index from the low six bytes.
	// R11/R13: ((v << 32) * R8) >> 56 gives an 8-bit index from the low four bytes.
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x36, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x38, R13

	// The table at AX records DI and R9 under the 10-bit indexes. The table
	// at AX+4096 records DI+1 and R9+1 (the positions whose bytes were
	// hashed) under the 8-bit indexes.
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 4096(AX)(R11*4)
	MOVL  R14, 4096(AX)(R13*4)

	// R8 = (DI + R9 + 1) / 2, the midpoint; then DI += 1 and R9 -= 1.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeBetterBlockAsm8B:
	// While R8 < R9: hash the bytes at DI and at R8 with the six-byte hash,
	// store both positions in the table at AX, and advance both by 2.
	// When R8 >= R9, resume the search loop.
	CMPQ  R8, R9
	JAE   search_loop_encodeBetterBlockAsm8B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeBetterBlockAsm8B

emit_remainder_encodeBetterBlockAsm8B:
	// AX = CX + (src_len - 12(SP)) + 3: the output position after writing the
	// remaining bytes plus 3 bytes of header. If that is not below the limit
	// in (SP), store 0 in the return slot and return.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm8B:
	// Emit the bytes in [12(SP), src_len) as one literal run; skip if empty.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

	// SI = run length, AX = BX + DX (first byte to copy), DX = length - 1.
	// 12(SP) is set to src_len. AX no longer holds its previous value.
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX

	// Header size by length-1: below 60 -> one byte, below 256 -> two bytes,
	// otherwise three bytes.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeBetterBlockAsm8B

	// The flags are unchanged since the JB above, so this branch is never
	// taken; execution falls through to the label it names.
	JB   three_bytes_emit_remainder_encodeBetterBlockAsm8B

three_bytes_emit_remainder_encodeBetterBlockAsm8B:
	// Tag 0xf4 followed by the 16-bit length-1; always uses the long copy.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	// Tag 0xf0 followed by the 8-bit length-1. The short copy is used when
	// length-1 is below 64, the long copy otherwise.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	// Single tag byte (length-1)<<2, then fall through to the short copy that
	// follows.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B

memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	// Return the number of bytes written: output cursor CX minus dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
//
// Frame $24-64: 24 bytes of locals and 64 bytes of arguments plus result.
// Registers set up here: AX = tmp (hash table), CX = output cursor starting
// at dst_base, BX = src_base, DX = current source position.
// Stack slots initialised here:
//   (SP)   8 bytes  output limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)  4 bytes  position limit: src_len - 8
//   12(SP) 4 bytes  start of the bytes not yet emitted (0)
//   16(SP) 4 bytes  last copy offset (1)
// 20(SP) is written later by the search loop.
TEXT ·encodeSnappyBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX

	// Zero the table: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	// Eight unaligned 16-byte stores per iteration.
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBlockAsm

	// 12(SP) = 0.
	MOVL  $0x00000000, 12(SP)

	// 8(SP) = src_len - 8.
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)

	// (SP) = dst_base + (src_len - 9) - (src_len >> 5).
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)

	// Start at position 1 with 16(SP) = 1; BX becomes the source base.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm:
	// SI = DX + 4 + ((DX - 12(SP)) >> 6): the next position to try. The step
	// grows with the distance from the last emitted byte.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(DX)(SI*1), SI

	// Stop searching once the next position reaches the limit in 8(SP).
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm

	// DI = 8 source bytes at DX; the next position is saved in 20(SP).
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)

	// Hash: ((v << 16) * R9) >> 50, a 14-bit index built from six bytes.
	// R10 = hash of the bytes at DX, R11 = hash of the bytes at DX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11

	// Read the previous table entries (candidates SI and R8), then store DX
	// and DX+1 in their place.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)

	// R10 = hash of the bytes at DX+2.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes at
	// DX+1-16(SP).
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm

	// Repeat found at DI = DX+1. SI = 12(SP) is the lower bound for the
	// backwards extension and R8 = DI - 16(SP) is the matching position.
	// Skip the extension when R8 is 0.
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm

repeat_extend_back_loop_encodeSnappyBlockAsm:
	// Step DI and R8 back while the preceding bytes are equal, DI stays above
	// SI and R8 stays above 0.
	CMPL DI, SI
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm

repeat_extend_back_end_encodeSnappyBlockAsm:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 5(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00010000
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x01000000
	JB   four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVL SI, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm

memmove_long_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm:
	// R11 is the byte count from the matchLen code. Advance DX past those
	// bytes; SI = DX - DI is the copy length and DI is reloaded with the
	// offset from 16(SP).
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// SI = length, DI = offset. Offsets below 65536 use the two-byte-offset
	// encodings; larger offsets use 5-byte ops (tag + 32-bit offset).
	CMPL DI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	// While more than 64 bytes remain, emit tag 0xff plus the 32-bit offset
	// and take 64 off the length.
	CMPL SI, $0x40
	JBE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xff, (CX)
	MOVL DI, 1(CX)
	LEAL -64(SI), SI
	ADDQ $0x05, CX
	CMPL SI, $0x04
	JB   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	// Nothing left: done. Otherwise emit a final op with tag 4*SI - 1, which
	// equals ((SI-1) << 2) | 3, plus the 32-bit offset.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeSnappyBlockAsm
	XORL  R8, R8
	LEAL  -1(R8)(SI*4), SI
	MOVB  SI, (CX)
	MOVL  DI, 1(CX)
	ADDQ  $0x05, CX
	JMP   repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	// While more than 64 bytes remain, emit tag 0xee plus the 16-bit offset
	// (3 bytes) and take 60 off the length.
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	// R8 = SI << 2. Lengths of 12 or more, or offsets of 2048 or more, take
	// the 3-byte form below.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm

	// 2-byte form: first byte (4*SI - 15) | (offset>>8)<<5, where
	// 4*SI - 15 equals ((SI-4) << 2) | 1; second byte is the low offset byte.
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	// 3-byte form: tag 4*SI - 2, which equals ((SI-1) << 2) | 2, followed by
	// the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm:
	// Everything up to DX has been emitted; record that and keep searching.
	MOVL DX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm

no_repeat_found_encodeSnappyBlockAsm:
	// DI holds the 8 bytes loaded at DX by the search loop; SI and R8 are the
	// table candidates for positions DX and DX+1; R10 is the hash index for
	// position DX+2.
	// Candidate 1: compare 4 bytes at SI with the 4 bytes at DX.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm

	// Shift DI so its low bytes are those at DX+1, fetch the candidate for
	// DX+2 into SI, and set R9 = DX+2.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9

	// Candidate 2: compare 4 bytes at R8 with the 4 bytes at DX+1.
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm

	// Record DX+2 in the table, then candidate 3: compare 4 bytes at SI with
	// the 4 bytes at DX+2.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm

	// No candidate matched: continue at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	// The match starts at DX+2; SI already holds its candidate.
	ADDL $0x02, DX
	JMP  candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	// Record DX+2 in the table (skipped on this path above); the match starts
	// at DX+1 with candidate R8.
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm:
	// DI = 12(SP), the lower bound for the backwards extension. A candidate
	// at offset 0 has nothing before it, so skip the extension.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	// Step DX and SI back while the preceding bytes are equal, DX stays above
	// DI and SI stays above 0.
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBlockAsm

match_extend_back_end_encodeSnappyBlockAsm:
	// DI = CX + (DX - 12(SP)) + 5. If that is not below the limit stored at
	// (SP), store 0 in the return slot and return.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 5(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm:
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00010000
	JB   three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x01000000
	JB   four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (CX)
	MOVL R8, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (CX)
	MOVW R8, 1(CX)
	MOVB R10, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm:
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm

memmove_long_match_emit_encodeSnappyBlockAsm:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm

matchlen_match8_match_nolit_encodeSnappyBlockAsm:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm

matchlen_match4_match_nolit_encodeSnappyBlockAsm:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm
	JB   match_nolit_end_encodeSnappyBlockAsm
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBlockAsm

matchlen_match1_match_nolit_encodeSnappyBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	CMPL SI, $0x00010000
	JB   two_byte_offset_match_nolit_encodeSnappyBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JBE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	MOVB $0xff, (CX)
	MOVL SI, 1(CX)
	LEAL -64(R10), R10
	ADDQ $0x05, CX
	CMPL R10, $0x04
	JB   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
	XORL  DI, DI
	LEAL  -1(DI)(R10*4), R10
	MOVB  R10, (CX)
	MOVL  SI, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm

two_byte_offset_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm

emit_copy_three_match_nolit_encodeSnappyBlockAsm:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm:
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm
	INCL  DX
	JMP   search_loop_encodeSnappyBlockAsm

emit_remainder_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 5(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeSnappyBlockAsm
	MOVB $0xfc, (CX)
	MOVL DX, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

four_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

three_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

two_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

one_byte_emit_remainder_encodeSnappyBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm

memmove_long_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements, using tmp
// as a table of 16384 32-bit source positions indexed by a 14-bit hash.
// The result is the number of bytes written to dst, or 0 when one of the
// output-limit checks below fails.
//
// Register roles in the main loop:
//   AX = table base (tmp)      BX = src base
//   CX = dst write cursor      DX = current source position s
// Stack slots:
//   0(SP)  (8 bytes) dst limit = dst_base + (src_len - 9 - src_len>>5)
//   8(SP)  (4 bytes) search limit = src_len - 8
//   12(SP) (4 bytes) nextEmit: start of the not-yet-emitted literal run
//   16(SP) (4 bytes) offset of the most recent match (starts at 1)
//   20(SP) (4 bytes) position to resume searching at if nothing matches
// In the remainder path AX, BX, DX and SI are reused for other values.
TEXT ·encodeSnappyBlockAsm64K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Zero the table: 0x200 iterations of 128 bytes = 65536 bytes.
	MOVQ $0x00000200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm64K:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBlockAsm64K
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// Search limit = src_len - 8; dst limit = dst_base + src_len - 9 - (src_len >> 5).
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// s = 1 and initial match offset = 1.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm64K:
	// SI = s + 4 + ((s - nextEmit) >> 6): the step grows with the length of
	// the pending literal run. Stop searching once SI reaches the search limit.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm64K
	// DI = 8 source bytes at s; remember SI as the resume position.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// Hash = ((x << 16) * 0xcf1bbcdcbf9b) >> 50: the shift left discards the
	// two high bytes, the shift right keeps the top 14 bits of the product.
	// R10 = hash of the bytes at s, R11 = hash of the bytes at s+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	// SI / R8 = previous positions stored for those two hashes; then record
	// s and s+1 in their place.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the bytes at s+2 (used further down).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// Repeat check: compare the 4 bytes at s+1 (low 32 bits of DI>>8) with
	// the 4 bytes at s+1-offset, where offset is the value in 16(SP).
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm64K
	// Repeat found. DI = s+1 (match start), SI = nextEmit,
	// R8 = s+1-offset (start of the earlier occurrence).
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm64K

repeat_extend_back_loop_encodeSnappyBlockAsm64K:
	// Move both positions back one byte at a time while DI > nextEmit, the
	// preceding bytes are equal and R8 has not reached 0.
	CMPL DI, SI
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm64K

repeat_extend_back_end_encodeSnappyBlockAsm64K:
	// Return 0 unless CX + (DI - nextEmit) + 3 is below the dst limit.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[nextEmit:DI] as a literal (skipped when empty).
	// R8 = length, R9 = source pointer, SI = length-1; nextEmit = DI.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// Header size is chosen from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm64K
	// The flags are unchanged since the CMPL above and the previous JB was
	// not taken, so this JB is never taken; execution falls through to the
	// label below, which is also its target.
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm64K

three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	// Tag 0xf4 (61<<2) followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	// Tag 0xf0 (60<<2) followed by length-1 as 8 bits. Lengths up to 64 use
	// the short copy, longer ones the long copy.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K

one_byte_repeat_emit_encodeSnappyBlockAsm64K:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm64K:
	// SI = dst cursor after the literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	// Dispatch on length R8: <=8, 9..16, 17..32, 33..64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves 8 bytes, even for shorter literals; the cursor is later
	// advanced by the real length only.
	// NOTE(review): presumably safe because of the src_len-8 search limit and
	// the dst limit margin - confirm.
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K

memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
	// SI = dst cursor after the literal bytes.
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	// Keep the first and last 32 source bytes in X0..X3; they are stored
	// last, with unaligned moves, at the start and end of the destination.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32; R12 = 64 - (CX & 31), so CX + R12 - 32 is a
	// 32-byte aligned address for the MOVOA stores below.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ leaves CF untouched, so JA here tests CF from the SUBQ above
	// together with ZF from the DECQ.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	// 32 bytes per pass through running pointers R10 (src) and R13 (dst).
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// 32 bytes per pass at offset R12-32, repeated while length >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
	// s += 5: the 4 bytes compared at s+1 are known to match. Set up the
	// forward extension: R9 = src+s, SI = src+(s-offset),
	// R8 = src_len - s (bytes left to compare).
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes found so far.
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
	// Compare 16 bytes per pass as two 8-byte XORs; a non-zero XOR marks
	// the word containing the first difference.
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
	// Difference in the second word: index of the lowest set bit / 8 is the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
	// Difference in the first word: lowest set bit / 8 equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
	// Tail handling: 4 bytes, then 2, then 1.
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm64K
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm64K
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm64K:
	// s += extension; SI = s - DI = total match length measured from the
	// backward-extended start; DI = match offset.
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
	// While length > 64: write tag 0xee ((59<<2)|2) plus the 16-bit offset
	// and reduce the length by 60.
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
	// R8 = length*4. Length >= 12 or offset >= 2048 needs the 3-byte form.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	// 2-byte form: tag = (length*4 - 15) | ((offset >> 8) << 5), i.e.
	// ((length-4)<<2)|1 plus the high offset bits; then the low offset byte.
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeSnappyBlockAsm64K

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
	// 3-byte form: tag = length*4 - 2, i.e. ((length-1)<<2)|2, followed by
	// the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm64K:
	// nextEmit = s; continue searching.
	MOVL DX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm64K

no_repeat_found_encodeSnappyBlockAsm64K:
	// Candidate 1: the 4 bytes at table position SI against the bytes at s.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm64K
	// Candidate 2: the 4 bytes at table position R8 against the bytes at
	// s+1. SI is reloaded with the table entry for the s+2 hash, R9 = s+2.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm64K
	// Record s+2 in the table, then candidate 3: bytes at SI against s+2.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm64K
	// Nothing matched: resume at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBlockAsm64K

candidate3_match_encodeSnappyBlockAsm64K:
	// Match starts at s+2; SI already holds its candidate.
	ADDL $0x02, DX
	JMP  candidate_match_encodeSnappyBlockAsm64K

candidate2_match_encodeSnappyBlockAsm64K:
	// Match starts at s+1 with candidate R8; still record s+2 in the table.
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm64K:
	// DX = match start, SI = candidate position, DI = nextEmit.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm64K

match_extend_back_loop_encodeSnappyBlockAsm64K:
	// Move both positions back while DX > nextEmit, the preceding bytes are
	// equal and the candidate has not reached 0.
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBlockAsm64K

match_extend_back_end_encodeSnappyBlockAsm64K:
	// Return 0 unless CX + (s - nextEmit) + 3 is below the dst limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[nextEmit:s] as a literal (skipped when empty).
	// R9 = length, DI = source pointer, R8 = length-1; nextEmit = s.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Header size from length-1: < 60 one byte, < 256 two, else three.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm64K
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm64K
	// Never taken (same flags as the JB above); falls through to its target.
	JB   three_bytes_match_emit_encodeSnappyBlockAsm64K

three_bytes_match_emit_encodeSnappyBlockAsm64K:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

two_bytes_match_emit_encodeSnappyBlockAsm64K:
	// Tag 0xf0 followed by length-1 as 8 bits; short copy up to 64 bytes.
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K

one_byte_match_emit_encodeSnappyBlockAsm64K:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm64K:
	// R8 = dst cursor after the literal bytes.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length R9: <=8, 9..16, 17..32, 33..64.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	// Always moves 8 bytes, even for shorter literals; the cursor is later
	// advanced by the real length only.
	// NOTE(review): presumably safe because of the src_len-8 search limit and
	// the dst limit margin - confirm.
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm64K

memmove_long_match_emit_encodeSnappyBlockAsm64K:
	// R8 = dst cursor after the literal bytes.
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Same scheme as the long copy in the repeat path: head and tail are
	// kept in X0..X3, the middle is moved 32 bytes at a time with aligned
	// stores, and the head and tail are stored last.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = length / 32; R12 = 64 - (CX & 31).
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
	// Match at s (DX) against candidate SI with no pending literals.
	// Save offset = s - candidate in 16(SP), skip the 4 bytes already
	// compared, and set up R8 = src+s, SI = src+candidate,
	// DI = src_len - s for the forward extension.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes found so far.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
	// 16 bytes per pass as two 8-byte XORs.
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
	// Difference in the second word: lowest set bit / 8, plus 8.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm64K

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
	// Difference in the first word: lowest set bit / 8 equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
	// Tail handling: 4 bytes, then 2, then 1.
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	JB   match_nolit_end_encodeSnappyBlockAsm64K
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm64K
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm64K:
	// s += extension; SI = offset; R10 = total length (extension + 4);
	// nextEmit = s.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
	// While length > 64: write tag 0xee ((59<<2)|2) plus the 16-bit offset
	// and reduce the length by 60.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
	// DI = length*4. Length >= 12 or offset >= 2048 needs the 3-byte form.
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	// 2-byte form: tag = (length*4 - 15) | ((offset >> 8) << 5), then the
	// low offset byte.
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
	// 3-byte form: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
	// Past the search limit: flush the tail. Otherwise load the 8 bytes at
	// s-2 and return 0 if the dst cursor has reached the dst limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm64K
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm64K:
	// R8 = hash of the bytes at s-2; DI = bytes at s; SI = hash of the
	// bytes at s (same hash function as in the search loop).
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	// Read the candidate for s first, then record s-2 and s in the table.
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If the candidate's 4 bytes equal those at s, emit another match right
	// away; otherwise advance s by one and go back to searching.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm64K
	INCL  DX
	JMP   search_loop_encodeSnappyBlockAsm64K

emit_remainder_encodeSnappyBlockAsm64K:
	// Return 0 unless CX + (src_len - nextEmit) + 3 is below the dst limit.
	// AX is overwritten here; the table pointer is not used after this point.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm64K:
	// Emit src[nextEmit:src_len] as the final literal (skipped when empty).
	// SI = length, AX = source pointer, DX = length-1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Header size from length-1: < 60 one byte, < 256 two, else three.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm64K
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm64K
	// Never taken (same flags as the JB above); falls through to its target.
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	// Tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	// Tag 0xf0 followed by length-1 as 8 bits; short copy up to 64 bytes.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K

one_byte_emit_remainder_encodeSnappyBlockAsm64K:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst cursor after the literal; BX now holds the length (the src
	// base is not used after this point).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, this variant has dedicated cases for 1-2,
	// 3 and 4-7 bytes, so every load and store stays within the length BX.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
	// Two bytes plus one byte.
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
	// First 4 and last 4 bytes; the two moves may overlap.
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves may overlap.
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes; the two moves may overlap.
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes; the two halves may overlap.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst cursor after the literal; BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Head and tail are kept in X0..X3, the middle is moved 32 bytes at a
	// time with aligned stores, and the head and tail are stored last.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = length / 32; R8 = 64 - (CX & 31).
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
	// Result = dst cursor - dst_base = number of bytes written.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written (final dst
// pointer minus dst_base). Returns 0 as soon as the output pointer would reach
// the limit dst_base + len(src) - 9 - len(src)>>5.
//
// tmp is cleared on entry and then used as a table of 4096 32-bit source
// positions, indexed by a 12-bit hash of the source bytes.
//
// Register roles in the main loop:
//   AX = tmp (table base)      BX = src base
//   CX = dst write pointer     DX = current source position (s)
// Stack slots:
//   0(SP)  output limit pointer
//   8(SP)  len(src) - 8, the position at which searching stops
//   12(SP) start of the literal bytes that have not been emitted yet
//   16(SP) offset of the most recent match (starts at 1)
//   20(SP) next position to try when no candidate matches
TEXT ·encodeSnappyBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000080, DX
	MOVQ AX, BX
	PXOR X0, X0

	// Clear tmp: 0x80 iterations of 128 bytes each = 16384 bytes.
zero_loop_encodeSnappyBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBlockAsm12B
	// Set up the stack slots described in the header.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm12B:
	// SI = s + 4 + (s - 12(SP))>>5: the step grows with the length of the
	// pending literal run. Stop searching once it reaches 8(SP).
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm12B
	// DI = 8 source bytes at s; remember the next position in 20(SP).
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// Hash = ((x << 24) * 0xcf1bbcdcbb) >> 52, i.e. a 12-bit index computed
	// from the low 5 bytes of x. R10 = hash(bytes at s), R11 = hash(bytes at s+1).
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11
	// SI, R8 = previous table entries (candidates); table now records s and s+1.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash(bytes at s+2), used in no_repeat_found.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// Repeat check: do the 4 bytes at s+1 equal the 4 bytes at s+1-16(SP)?
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm12B
	// Repeat found. DI = s+1 (match start), R8 = matching earlier position.
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B

	// Walk the match start backwards while the preceding bytes are equal,
	// DI stays above 12(SP) and R8 stays above 0.
repeat_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL DI, SI
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B

repeat_extend_back_end_encodeSnappyBlockAsm12B:
	// Return 0 unless CX + 3 + (pending literal length) is below the limit.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):DI] as a literal (skipped when empty).
	// R9 = source pointer, R8 = length, SI = length - 1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// Pick the literal header size from length-1. The second JB below tests
	// the same flags as the first and is never taken; the three-byte case is
	// reached by fall-through.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm12B
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm12B

	// Header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

	// Header: 0xf0 followed by length-1 as one byte. Lengths up to 64 use
	// the short copy, longer ones the long copy.
two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B

	// Header: single byte (length-1) << 2.
one_byte_repeat_emit_encodeSnappyBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm12B:
	// SI = dst pointer after the literal; CX is set to it once the copy is done.
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	// Lengths of 8 or less always copy a full 8 bytes; the larger cases copy
	// the head and the tail of the range with overlapping moves.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B

memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	// Copy R8 bytes from (R9) to (CX). The first and last 32 bytes are loaded
	// into X0-X3 up front and stored last with unaligned stores. The middle is
	// copied 32 bytes at a time with MOVOA stores starting at offset
	// R12 = 64 - (CX & 31), so that CX + R12 - 32 is 32-byte aligned.
	// NOTE(review): DECQ leaves CF unchanged, so the JA/JNA below also depend
	// on CF from the preceding SUBQ/ADDQ - confirm intent against the generator.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

	// Copy 32-byte blocks while R8 >= R12, then store the saved head and tail.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
	// Skip the 4 bytes already compared (s+1..s+4): DX = s + 5.
	// R9 = pointer at DX, SI = pointer at DX - 16(SP), R8 = bytes left in src.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 = number of equal bytes at (R9) and (SI), limited to R8 bytes.
	XORL R11, R11

	// Compare 16 bytes per iteration while at least 16 remain.
matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B

	// Mismatch in the second 8 bytes: index of the lowest set bit of the XOR,
	// divided by 8, is the number of equal bytes in that word; add 8 for the first.
matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B

	// Mismatch in the first 8 bytes: add (lowest set bit index) / 8.
matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm12B

	// Fewer than 8 bytes left: try 4, then 2, then 1.
matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm12B
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm12B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm12B:
	// DX = end of match, SI = match length (end - start), DI = offset.
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// While the length exceeds 64, emit 0xee + 16-bit offset and subtract 60.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B

	// Remaining length: lengths below 12 with offsets below 2048 take the
	// two-byte form (tag = length*4 - 15 | (offset>>8)<<5, then the low offset
	// byte); everything else takes the three-byte form.
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeSnappyBlockAsm12B

	// Three-byte form: tag = length*4 - 2, then the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm12B:
	// Everything up to DX has been emitted; continue searching from DX.
	MOVL DX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm12B

no_repeat_found_encodeSnappyBlockAsm12B:
	// On entry: DI = bytes at s, SI = candidate for s, R8 = candidate for s+1,
	// R10 = hash of the bytes at s+2. Try the candidates for s, s+1 and s+2 in
	// turn by comparing 4 bytes; the table entry for s+2 is updated either way.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm12B
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm12B
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm12B
	// No match: advance to the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBlockAsm12B

	// Match at s+2.
candidate3_match_encodeSnappyBlockAsm12B:
	ADDL $0x02, DX
	JMP  candidate_match_encodeSnappyBlockAsm12B

	// Match at s+1 with candidate R8.
candidate2_match_encodeSnappyBlockAsm12B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm12B:
	// DX = match position, SI = candidate position. Extend both backwards
	// while the preceding bytes are equal, DX stays above 12(SP) and SI above 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm12B

match_extend_back_loop_encodeSnappyBlockAsm12B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBlockAsm12B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B

match_extend_back_end_encodeSnappyBlockAsm12B:
	// Return 0 unless CX + 3 + (pending literal length) is below the limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):DX] as a literal (skipped when empty).
	// DI = source pointer, R9 = length, R8 = length - 1.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Header selection as in the repeat path; the second JB is never taken
	// and the three-byte case is reached by fall-through.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm12B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm12B
	JB   three_bytes_match_emit_encodeSnappyBlockAsm12B

	// Header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_match_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

	// Header: 0xf0 followed by length-1 as one byte.
two_bytes_match_emit_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B

	// Header: single byte (length-1) << 2.
one_byte_match_emit_encodeSnappyBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm12B:
	// R8 = dst pointer after the literal; CX is set to it once the copy is done.
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Lengths of 8 or less always copy a full 8 bytes.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B

memmove_long_match_emit_encodeSnappyBlockAsm12B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	// Copy R9 bytes from (DI) to (CX): head and tail are held in X0-X3 and
	// stored last; the middle is copied in 32-byte blocks with MOVOA stores
	// starting at offset R12 = 64 - (CX & 31).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
	// A 4-byte match at DX against candidate SI is known. Record the offset
	// DX - SI in 16(SP), skip the 4 bytes on both sides and measure how much
	// further the match extends (DI = bytes left in src).
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes at (R8) and (SI), limited to DI bytes.
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B

	// Mismatch in the second 8 bytes: add 8 + (lowest set bit index) / 8.
matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm12B

	// Mismatch in the first 8 bytes: add (lowest set bit index) / 8.
matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm12B

	// Fewer than 8 bytes left: try 4, then 2, then 1.
matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	JB   match_nolit_end_encodeSnappyBlockAsm12B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBlockAsm12B

matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm12B
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm12B:
	// DX = end of match, SI = offset, R10 = total match length (extension + 4).
	// The match end becomes the new start of pending literals.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// While the length exceeds 64, emit 0xee + 16-bit offset and subtract 60.
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B

	// Remaining length: two-byte form when length < 12 and offset < 2048,
	// otherwise the three-byte form.
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B

	// Three-byte form: tag = length*4 - 2, then the 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
	// Stop searching at 8(SP); otherwise load 8 bytes at DX-2 and return 0 if
	// the output pointer has reached the limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm12B
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm12B:
	// R8 = hash(bytes at DX-2), SI = hash(bytes at DX); DI now holds the bytes
	// at DX. Record DX-2 and DX in the table and fetch the previous entry for
	// DX as the next candidate.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	// If 4 bytes at the candidate equal those at DX, emit another copy with no
	// literal in between; otherwise resume the search at DX+1.
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
	INCL  DX
	JMP   search_loop_encodeSnappyBlockAsm12B

emit_remainder_encodeSnappyBlockAsm12B:
	// Return 0 unless CX + 3 + (len(src) - 12(SP)) is below the limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm12B:
	// Emit src[12(SP):len(src)] as a final literal (skipped when empty).
	// AX = source pointer, SI = length, DX = length - 1. AX, BX and DX are
	// reused as scratch from here on.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Header selection; the second JB is never taken and the three-byte case
	// is reached by fall-through.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm12B
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm12B

	// Header: 0xf4 followed by length-1 as a 16-bit value.
three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

	// Header: 0xf0 followed by length-1 as one byte.
two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B

	// Header: single byte (length-1) << 2.
one_byte_emit_remainder_encodeSnappyBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm12B:
	// DX = dst pointer after the literal, BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copies in the main loop, this one has dedicated cases for
	// lengths below 8 and moves only bytes inside the source range.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B

memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Copy BX bytes from (AX) to (CX): head and tail are held in X0-X3 and
	// stored last; the middle is copied in 32-byte blocks with MOVOA stores
	// starting at offset R8 = 64 - (CX & 31).
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000020, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBlockAsm10B:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm10B
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x36, R11
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm10B
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B

repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL DI, SI
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B

repeat_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

repeat_dst_size_check_encodeSnappyBlockAsm10B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm10B

three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

one_byte_repeat_emit_encodeSnappyBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B

memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11

matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B

matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B

matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm10B
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeSnappyBlockAsm10B:
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeSnappyBlockAsm10B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm10B:
	MOVL DX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm10B

no_repeat_found_encodeSnappyBlockAsm10B:
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm10B
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm10B
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm10B
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBlockAsm10B

candidate3_match_encodeSnappyBlockAsm10B:
	ADDL $0x02, DX
	JMP  candidate_match_encodeSnappyBlockAsm10B

candidate2_match_encodeSnappyBlockAsm10B:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_encodeSnappyBlockAsm10B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm10B

match_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B

match_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm10B:
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm10B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm10B
	JB   three_bytes_match_emit_encodeSnappyBlockAsm10B

three_bytes_match_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

two_bytes_match_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

one_byte_match_emit_encodeSnappyBlockAsm10B:
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBlockAsm10B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B

memmove_long_match_emit_encodeSnappyBlockAsm10B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R10, R10

matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B

matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm10B

matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	JB   match_nolit_end_encodeSnappyBlockAsm10B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm10B
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm10B:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm10B
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm10B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x36, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x36, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B
	INCL  DX
	JMP   search_loop_encodeSnappyBlockAsm10B

emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm10B:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm10B
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

one_byte_emit_remainder_encodeSnappyBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm10B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int
// Requires: BMI, SSE2
//
// Register roles established by this prologue and kept for the code below:
//   AX = tmp, used as a table of 32-bit src positions (indexed as (AX)(h*4))
//   BX = src base pointer
//   CX = dst write pointer (starts at dst base)
//   DX = current src position
//
// Frame slots ($24 bytes of locals):
//    0(SP) qword  dst limit: dst + src_len - 9 - src_len>>5; emit paths store 0
//                 in the return slot and return once the write pointer reaches it
//    8(SP) dword  src_len - 8; searching stops once the position reaches it
//   12(SP) dword  src position where the not-yet-emitted literals start
//   16(SP) dword  current match offset (position - candidate), initially 1
//   20(SP) dword  position to continue searching from after a miss
TEXT ·encodeSnappyBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 8 iterations of 128 bytes each clear the whole 1024-byte table.
	MOVQ $0x00000008, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBlockAsm8B
	// No literals pending yet.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	// dst limit = dst + (src_len - 9) - (src_len >> 5)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Start searching at position 1 with an initial offset of 1.
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+24(FP), BX

// Main search step for position s = DX. Hashes the bytes at s, s+1 and s+2,
// fetches the table candidates for s and s+1, records s and s+1 in the table,
// and first tests whether the bytes at s+1 repeat the current offset.
search_loop_encodeSnappyBlockAsm8B:
	// Next position = s + 4 + ((s - literal start) >> 4); leave the loop when
	// it reaches the limit stored at 8(SP).
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBlockAsm8B
	// DI = 8 source bytes starting at s.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// Hash: shift left by 32 so only four bytes contribute, multiply by the
	// constant, keep the top 8 bits of the product (index into 256 entries).
	// R10 = hash of bytes s..s+3, R11 = hash of bytes s+1..s+4.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x38, R11
	// SI = candidate for s, R8 = candidate for s+1; then store s and s+1.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of bytes s+2..s+5, consumed at no_repeat_found.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	// Repeat test: compare the 4 bytes at s+1 (low 32 bits of DI>>8) with the
	// 4 bytes at s+1-offset.
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm8B
	// Repeat found: DI = match start (s+1), SI = literal start,
	// R8 = candidate position (s+1-offset). Skip the backward extension when
	// the candidate is already at position 0.
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B

// Extend the repeat match backwards: while the match start DI is above the
// literal start SI and the bytes just before DI and before the candidate R8 are
// equal, move both back by one. Stops when the candidate reaches position 0.
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DI, SI
	JBE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B

repeat_extend_back_end_encodeSnappyBlockAsm8B:
	// Output space check: write pointer + 3 + pending literal length must stay
	// below the dst limit in (SP); otherwise store 0 as the result and return.
	// NOTE(review): presumably 0 tells the Go caller the block did not fit — confirm.
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Emit the header for the literals pending before the repeat match.
// On entry DI = match start. Afterwards R9 = literal source pointer,
// R8 = literal length, SI = length - 1, and 12(SP) is advanced to DI.
repeat_dst_size_check_encodeSnappyBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	// Nothing pending: skip straight to the match handling.
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// Pick the header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	// NOTE(review): this second JB tests the same flags as the JB above, so it
	// is never taken; execution falls through to the label it names anyway.
	JB   three_bytes_repeat_emit_encodeSnappyBlockAsm8B

three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 (61<<2) followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 (60<<2) followed by length-1 as a single byte.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JB   memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	// Single tag byte: (length-1) << 2; falls through into the short copy.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

// Short literal copy (length R8 <= 64) from R9 to CX. SI is set to the write
// pointer after the copy and becomes the new CX at the end.
memmove_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise 33..64.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only the write
	// pointer (SI) reflects the real length.
	// NOTE(review): relies on at least 8 readable/writable bytes at both
	// pointers — presumably guaranteed by the src/dst limits; confirm.
	MOVQ (R9), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves overlap for lengths below 16.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (CX)
	MOVQ R9, -8(CX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes, overlapping as needed.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping as needed.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	// Advance the write pointer past the copied literals.
	MOVQ SI, CX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

// Long literal copy of R8 bytes from R9 to CX. The first and last 32 bytes are
// loaded into X0..X3 up front and stored last with unaligned moves; the middle
// is copied 32 bytes at a time with aligned stores. SI = write pointer after
// the copy.
memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32. R12 = 64 - (CX & 31), so CX + R12 - 32 is 32-byte
	// aligned, which is what the MOVOA stores below rely on.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ leaves CF as set by the SUBQ above (no borrow), so JA is taken
	// unless the decrement produced zero.
	// NOTE(review): the two jumps to this label visible in this function pass a
	// length of at least 65, which makes R11 >= 1 here, so the JA looks always
	// taken and the big_loop_back code skipped — confirm against the generator.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	// 32 bytes per iteration via the R10 (source) / R13 (destination) cursors.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R12, then advance; repeat while the
	// length R8 is >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	// Head and tail stores cover what the aligned loop did not write.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R8*1)
	MOVOU X3, -16(CX)(R8*1)
	MOVQ  SI, CX

// Extend the repeat match forwards. The 4 bytes at s+1 were already compared in
// the search loop, so counting starts at s+5.
//   R9 = src pointer at the current position, SI = src pointer at the candidate
//   R8 = bytes left in src, R11 = number of additional matching bytes found
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11

	// 16 bytes per iteration while at least 16 remain: XOR both 8-byte halves
	// and leave on the first non-zero result.
matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B

	// Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
	// number of equal bytes in that half; add it plus 8 to the count.
matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm8B

	// Fewer than 16 bytes left: try one 8-byte compare.
matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B

	// Mismatch in an 8-byte compare: add the number of equal leading bytes.
matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_encodeSnappyBlockAsm8B

	// 4-byte step; on mismatch the 2- and 1-byte steps still run.
matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R11), R11

	// 2-byte step: exactly one byte left goes to the 1-byte step, zero bytes
	// left ends the count.
matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	JB   repeat_extend_forward_end_encodeSnappyBlockAsm8B
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_encodeSnappyBlockAsm8B

	// Final single-byte compare.
matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R11), R11

// Emit the repeat match as copy operations.
// DX becomes the end of the match; SI = match length (end - start in DI);
// DI is then reloaded with the offset from 16(SP).
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// While more than 64 bytes remain, emit a 3-byte copy of length 60:
	// tag 0xee ((60-1)<<2 | 2) followed by the 16-bit offset.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (CX)
	MOVW DI, 1(CX)
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	// R8 = length * 4. Lengths of 12 or more use the 3-byte form.
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	// 2-byte form: tag = (length*4 - 15) | (offset>>8)<<5, i.e.
	// ((length-4)<<2 | 1) plus the high offset bits; low offset byte follows.
	// NOTE(review): no offset range test precedes this form here — presumably
	// inputs to this variant are small enough that offsets fit; confirm
	// against the Go caller.
	LEAL -15(R8), R8
	MOVB DI, 1(CX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, R8
	MOVB R8, (CX)
	ADDQ $0x02, CX
	JMP  repeat_end_emit_encodeSnappyBlockAsm8B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	// 3-byte form: tag = length*4 - 2 ((length-1)<<2 | 2), then 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (CX)
	MOVW DI, 1(CX)
	ADDQ $0x03, CX

repeat_end_emit_encodeSnappyBlockAsm8B:
	// Everything up to DX is emitted; resume searching from there.
	MOVL DX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm8B

// No repeat at s+1: test the table candidates for s, s+1 and s+2 in turn.
// On entry DX = s, DI = 8 bytes at s, SI = candidate for s,
// R8 = candidate for s+1, R10 = hash for s+2.
no_repeat_found_encodeSnappyBlockAsm8B:
	// Candidate for s: compare 4 bytes.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm8B
	// Shift to the bytes at s+1; fetch the s+2 candidate and prepare s+2.
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm8B
	// Record s+2 in the table, then test its previous candidate.
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm8B
	// All three missed: continue at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBlockAsm8B

candidate3_match_encodeSnappyBlockAsm8B:
	// Match is at s+2; SI already holds its candidate.
	ADDL $0x02, DX
	JMP  candidate_match_encodeSnappyBlockAsm8B

candidate2_match_encodeSnappyBlockAsm8B:
	// Match is at s+1: still record s+2 in the table, then use R8 as candidate.
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

// A 4-byte match between position DX and candidate SI was found. Extend it
// backwards while DX is above the literal start (DI = 12(SP)), the preceding
// bytes are equal, and the candidate has not reached position 0.
candidate_match_encodeSnappyBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B

match_extend_back_end_encodeSnappyBlockAsm8B:
	// Output space check: write pointer + 3 + pending literal length must stay
	// below the dst limit in (SP); otherwise store 0 as the result and return.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

// Emit the header for the literals pending before the match at DX.
// Afterwards DI = literal source pointer, R9 = literal length,
// R8 = length - 1, and 12(SP) is advanced to the match start.
match_dst_size_check_encodeSnappyBlockAsm8B:
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	// Nothing pending: go straight to the match handling.
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Pick the header size from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
	CMPL R8, $0x3c
	JB   one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL R8, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBlockAsm8B
	// NOTE(review): this second JB tests the same flags as the JB above, so it
	// is never taken; execution falls through to the label it names anyway.
	JB   three_bytes_match_emit_encodeSnappyBlockAsm8B

three_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf4 (61<<2) followed by length-1 as a 16-bit value.
	MOVB $0xf4, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	// Tag 0xf0 (60<<2) followed by length-1 as a single byte.
	MOVB $0xf0, (CX)
	MOVB R8, 1(CX)
	ADDQ $0x02, CX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL R8, $0x40
	JB   memmove_match_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	// Single tag byte: (length-1) << 2; falls through into the short copy.
	SHLB $0x02, R8
	MOVB R8, (CX)
	ADDQ $0x01, CX

// Short literal copy (length R9 <= 64) from DI to CX. R8 is set to the write
// pointer after the copy and becomes the new CX at the end.
memmove_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise 33..64.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes, even for shorter lengths; only the write
	// pointer (R8) reflects the real length.
	// NOTE(review): relies on at least 8 readable/writable bytes at both
	// pointers — presumably guaranteed by the src/dst limits; confirm.
	MOVQ (DI), R10
	MOVQ R10, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes; the two moves overlap for lengths below 16.
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (CX)
	MOVQ DI, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes, overlapping as needed.
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes, overlapping as needed.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	// Advance the write pointer past the copied literals.
	MOVQ R8, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B

// Long literal copy of R9 bytes from DI to CX. The first and last 32 bytes are
// loaded into X0..X3 up front and stored last with unaligned moves; the middle
// is copied 32 bytes at a time with aligned stores. R8 = write pointer after
// the copy.
memmove_long_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (CX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = length / 32. R12 = 64 - (CX & 31), so CX + R12 - 32 is 32-byte
	// aligned, which is what the MOVOA stores below rely on.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  CX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ leaves CF as set by the SUBQ above (no borrow), so JA is taken
	// unless the decrement produced zero.
	// NOTE(review): the two jumps to this label visible in this function pass a
	// length of at least 65, which makes R11 >= 1 here, so the JA looks always
	// taken and the big_loop_back code skipped — confirm against the generator.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(CX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	// 32 bytes per iteration via the R10 (source) / R13 (destination) cursors.
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R12, then advance; repeat while the
	// length R9 is >= R12.
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(CX)(R12*1)
	MOVOA X5, -16(CX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	// Head and tail stores cover what the aligned loop did not write.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  R8, CX

// Handle a match at position DX against candidate SI with no literals pending.
// Stores the new offset (DX - SI) in 16(SP), skips the 4 bytes already known to
// match, then counts further matching bytes.
//   R8 = src pointer at the current position, SI = src pointer at the candidate
//   DI = bytes left in src, R10 = number of additional matching bytes found
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R10, R10

	// 16 bytes per iteration while at least 16 remain: XOR both 8-byte halves
	// and leave on the first non-zero result.
matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B

	// Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
	// number of equal bytes in that half; add it plus 8 to the count.
matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm8B

	// Fewer than 16 bytes left: try one 8-byte compare.
matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_encodeSnappyBlockAsm8B

	// Mismatch in an 8-byte compare: add the number of equal leading bytes.
matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_encodeSnappyBlockAsm8B

	// 4-byte step; on mismatch the 2- and 1-byte steps still run.
matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	LEAL -4(DI), DI
	LEAL 4(R10), R10

	// 2-byte step: exactly one byte left goes to the 1-byte step, zero bytes
	// left ends the count.
matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	JB   match_nolit_end_encodeSnappyBlockAsm8B
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_encodeSnappyBlockAsm8B

	// Final single-byte compare.
matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10

match_nolit_end_encodeSnappyBlockAsm8B:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (CX)
	MOVW SI, 1(CX)
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(CX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, DI
	MOVB DI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -2(DI), DI
	MOVB DI, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
	INCL  DX
	JMP   search_loop_encodeSnappyBlockAsm8B

emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	JB   three_bytes_emit_remainder_encodeSnappyBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst using two hash tables held in tmp and returns the
// number of bytes written to dst, or 0 when the output would reach the
// destination limit computed below.
//
// tmp layout (4-byte entries holding source positions):
//   tmp[0:524288]       "long" table,  17-bit index, hashed from 7 source bytes
//   tmp[524288:589824]  "short" table, 14-bit index, hashed from 4 source bytes
//
// Register roles in the main loop:
//   AX = tmp (table base)      BX = src base
//   CX = dst write pointer     DX = s, current source position
//   SI = candidate position    DI = loaded source bytes / match start
//
// Stack frame ($24):
//   0(SP)  (8 bytes) dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)  (4 bytes) search limit: src_len - 8
//   12(SP) (4 bytes) nextEmit: start of the literals not yet written
//   16(SP) (4 bytes) offset of the last emitted match (written, never read here)
//   20(SP) (4 bytes) nextS: position to continue at if no match is found
TEXT ·encodeSnappyBetterBlockAsm(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x1200 iterations * 128 bytes = 589824 bytes: clear the whole of tmp.
	MOVQ $0x00001200, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm
	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	// search limit = src_len - 8
	MOVL  SI, 8(SP)
	// dst limit = dst_base + (src_len - 9) - (src_len >> 5)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// s = 1, last offset = 0
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm:
	// skip = (s - nextEmit) >> 7, capped so that nextS never advances
	// by more than 100: nextS = s + 100 when skip > 99, else s + 1 + skip.
	MOVL DX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JBE  check_maxskip_ok_encodeSnappyBetterBlockAsm
	LEAL 100(DX), SI
	JMP  check_maxskip_cont_encodeSnappyBetterBlockAsm

check_maxskip_ok_encodeSnappyBetterBlockAsm:
	LEAL 1(DX)(SI*1), SI

check_maxskip_cont_encodeSnappyBetterBlockAsm:
	// Stop searching once nextS reaches the search limit.
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm
	// DI = 8 source bytes at s; remember nextS.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = long index: ((v << 8) * R9) >> 47, i.e. low 7 bytes -> 17 bits.
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	// R11 = short index: ((v << 32) * SI) >> 50, i.e. low 4 bytes -> 14 bits.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	// SI = long candidate, R8 = short candidate; both slots now point at s.
	MOVL  (AX)(R10*4), SI
	MOVL  524288(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 524288(AX)(R11*4)
	// Prefer a full 8-byte agreement: long candidate first, then short.
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	CMPQ  R11, DI
	JNE   no_short_found_encodeSnappyBetterBlockAsm
	MOVL  R8, SI
	JMP   candidate_match_encodeSnappyBetterBlockAsm

no_short_found_encodeSnappyBetterBlockAsm:
	// Fall back to 4-byte agreement: long candidate first, then short.
	CMPL R10, DI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm
	CMPL R11, DI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm
	// No match at s: continue at nextS.
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBetterBlockAsm

candidateS_match_encodeSnappyBetterBlockAsm:
	// Only the short candidate agreed on 4 bytes. Before accepting it,
	// look up the long table for position s+1 (v >> 8 holds the bytes at
	// s+1) and take that candidate if its first 4 bytes agree.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x2f, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm
	// s+1 did not match: undo the increment and use the short candidate.
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm:
	// Extend the match backwards while s > nextEmit, candidate > 0 and the
	// bytes immediately before both positions are equal.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm

match_extend_back_loop_encodeSnappyBetterBlockAsm:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm

match_extend_back_end_encodeSnappyBetterBlockAsm:
	// Return 0 unless dst + (s - nextEmit) + 5 is below the dst limit.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 5(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm:
	// DI = match start. Step s and the candidate past the first 4 bytes and
	// measure how much further they agree; R8 = bytes left in src after s.
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 accumulates the number of equal bytes at R9 / R10.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
	// Compare 16 bytes per iteration while at least 16 remain.
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
	// Mismatch in the second quadword: index of lowest differing bit / 8
	// gives the equal bytes within it; add the 8 from the first quadword.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
	// Mismatch in the quadword at R12: add its count of equal low bytes.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
	// Exactly 1 byte left -> byte compare; 0 left -> done; else try 2 bytes.
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	JB   match_nolit_end_encodeSnappyBetterBlockAsm
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm:
	// R8 = match offset (s - candidate; both were advanced by 4).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// Reject the match when it extended by at most 1 byte beyond the first
	// 4 and the offset exceeds 65535; resume searching at nextS + 1.
	CMPL R12, $0x01
	JA   match_length_ok_encodeSnappyBetterBlockAsm
	CMPL R8, $0x0000ffff
	JBE  match_length_ok_encodeSnappyBetterBlockAsm
	MOVL 20(SP), DX
	INCL DX
	JMP  search_loop_encodeSnappyBetterBlockAsm

match_length_ok_encodeSnappyBetterBlockAsm:
	// Record the offset, then emit src[nextEmit:matchStart] as a literal
	// run unless it is empty.
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
	// R9 = literal length, R10 = literal source, SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Pick the literal header size from length - 1:
	//   < 60    : 1 byte,  (length-1) << 2
	//   < 2^8   : 0xf0 + 1 length byte
	//   < 2^16  : 0xf4 + 2 length bytes
	//   < 2^24  : 0xf8 + 3 length bytes
	//   else    : 0xfc + 4 length bytes
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x00010000
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL SI, $0x01000000
	JB   four_bytes_match_emit_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (CX)
	MOVL SI, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

four_bytes_match_emit_encodeSnappyBetterBlockAsm:
	// 24-bit length: low 16 bits via MOVW, bits 16..23 via MOVB.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (CX)
	MOVW SI, 1(CX)
	MOVB R11, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

three_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

two_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// Lengths up to 64 still use the short copy; longer ones the long copy.
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm

one_byte_match_emit_encodeSnappyBetterBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm:
	// SI = dst pointer after the literals; copy R9 (<= 64) bytes R10 -> CX.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes even when R9 < 8; the dst pointer still
	// advances by R9 only.
	// NOTE(review): presumably relies on slack after both buffers' current
	// positions — confirm against the dst limit / search limit margins.
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	// First and last 8 bytes (overlapping when R9 < 16).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	// First and last 16 bytes (overlapping when R9 < 32).
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	// First and last 32 bytes (overlapping when R9 < 64).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm

memmove_long_match_emit_encodeSnappyBetterBlockAsm:
	// SI = dst pointer after the literals; copy R9 (>= 65) bytes R10 -> CX.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// Preload the first and last 32 bytes; they are stored unaligned after
	// the loop so the aligned middle stores may start/stop short of the ends.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = number of 32-byte chunks; R14 = 64 - (CX & 31), chosen so that
	// CX + R14 - 32 is 32-byte aligned for the MOVOA stores below.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	// NOTE(review): every path into this block has R9 >= 65, so R13 is
	// non-zero after DECQ and this JA looks always taken — confirm whether
	// the big_loop_back section is reachable.
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R14-32 while R9 >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	// Store the preloaded head and tail over the unaligned ends.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
	// s moves to the end of the match, R12 becomes the full match length
	// (extension + the initial 4), and nextEmit = s.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// Offsets below 65536 use the 2-byte-offset forms; larger ones the
	// 4-byte-offset form.
	CMPL R8, $0x00010000
	JB   two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
	// While length > 64: write 0xff + 32-bit offset (5 bytes) and subtract
	// 64 from the length.
	CMPL R12, $0x40
	JBE  four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xff, (CX)
	MOVL R8, 1(CX)
	LEAL -64(R12), R12
	ADDQ $0x05, CX
	CMPL R12, $0x04
	JB   four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
	// Remaining length (if non-zero): tag = length*4 - 1, i.e.
	// ((length-1) << 2) | 3, followed by the 32-bit offset.
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
	XORL  SI, SI
	LEAL  -1(SI)(R12*4), R12
	MOVB  R12, (CX)
	MOVL  R8, 1(CX)
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
	// While length > 64: write 0xee + 16-bit offset (3 bytes) and subtract
	// 60 from the length.
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
	// SI = length * 4. The 2-byte form is used only when length < 12 and
	// offset < 2048; otherwise the 3-byte form below.
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	// 2-byte form: byte 0 = (length*4 - 15) | ((offset >> 8) << 5),
	// byte 1 = low 8 bits of the offset.
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
	// 3-byte form: byte 0 = length*4 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
	// Past the search limit: flush the tail. Otherwise return 0 if the dst
	// pointer has reached the dst limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
	// Index positions inside the match just emitted. DI is still the match
	// start and DX the match end:
	//   long table : start+1 and end-2
	//   short table: start+2 and end-1
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R12
	IMULQ SI, R12
	SHRQ  $0x2f, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x32, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 524288(AX)(R11*4)
	MOVL  R14, 524288(AX)(R13*4)
	// R8 = midpoint of the two indexed positions; DI = start+2; R9 = end-3.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeSnappyBetterBlockAsm:
	// Add long-table entries at DI and R8, stepping both by 2, until R8
	// reaches R9; then resume the search at s (DX).
	CMPQ  R8, R9
	JAE   search_loop_encodeSnappyBetterBlockAsm
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x2f, R10
	SHLQ  $0x08, R11
	IMULQ SI, R11
	SHRQ  $0x2f, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeSnappyBetterBlockAsm

emit_remainder_encodeSnappyBetterBlockAsm:
	// From here on AX no longer holds tmp. Return 0 unless
	// dst + (src_len - nextEmit) + 5 is below the dst limit.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 5(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm:
	// Emit src[nextEmit:src_len] as a final literal run, unless empty.
	// AX = literal source, SI = length, DX = length - 1.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Same header selection as for the literals emitted before a match.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00010000
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x01000000
	JB   four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (CX)
	MOVL DX, 1(CX)
	ADDQ $0x05, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	// BX (src base) is overwritten here; AX already points at the literals.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (CX)
	MOVW DX, 1(CX)
	MOVB BL, 3(CX)
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literals; BX = length (<= 64).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copy before a match, this variant never writes past the
	// requested length: sizes 1-2, 3 and 4-7 have their own cases.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
	// First and last 4 bytes (overlapping when BX < 8).
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	// First and last 8 bytes (overlapping when BX < 16).
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	// First and last 16 bytes (overlapping when BX < 32).
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	// First and last 32 bytes (overlapping when BX < 64).
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = dst pointer after the literals; BX = length (>= 65).
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy before a match: preload head and tail,
	// copy the middle with 32-byte-aligned stores, then store head and tail.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = number of 32-byte chunks; R8 = 64 - (CX & 31).
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8.
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
	// Return the number of bytes written: CX - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x00000900, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm64K
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm64K:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x07, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x33, R11
	MOVL  (AX)(R10*4), SI
	MOVL  262144(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 262144(AX)(R11*4)
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	CMPQ  R11, DI
	JNE   no_short_found_encodeSnappyBetterBlockAsm64K
	MOVL  R8, SI
	JMP   candidate_match_encodeSnappyBetterBlockAsm64K

no_short_found_encodeSnappyBetterBlockAsm64K:
	CMPL R10, DI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL R11, DI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm64K
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm64K:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	JB   match_nolit_end_encodeSnappyBetterBlockAsm64K
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm64K

three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x33, R11
	SHLQ  $0x08, R12
	IMULQ SI, R12
	SHRQ  $0x30, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x33, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 262144(AX)(R11*4)
	MOVL  R14, 262144(AX)(R13*4)
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeSnappyBetterBlockAsm64K:
	CMPQ  R8, R9
	JAE   search_loop_encodeSnappyBetterBlockAsm64K
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R11
	IMULQ SI, R11
	SHRQ  $0x30, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal runs and back-reference copies
// and returns the number of bytes written to dst. Returns 0 as soon as the
// output would reach the dst limit computed at entry (see 0(SP) below).
//
// Arguments and result:
//   dst_base+0(FP)   start of the output buffer
//   src_base+24(FP)  start of the input, src_len+32(FP) its length
//   tmp+48(FP)       scratch memory for the two hash tables (zeroed here)
//   ret+56(FP)       result
//
// tmp layout (entries are 32-bit positions in src):
//   0(AX)       long table, 14-bit index, hash of 6 input bytes
//   65536(AX)   short table, 12-bit index, hash of 4 input bytes
//
// Frame locals:
//   0(SP)   dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   src_len - 8, the last position the search loop may reach
//   12(SP)  position of the first src byte not yet emitted
//   16(SP)  offset of the most recent match (written, never read here)
//   20(SP)  position the search loop continues from when nothing matches
//
// Registers in the main loop: AX = tmp, BX = src base, CX = dst write
// pointer, DX = current src position. The remainder path reuses AX and BX.
//
// NOTE(review): no lower bound on src_len and no check of the dst capacity is
// visible in this routine; presumably the Go caller guarantees both - confirm.
TEXT ·encodeSnappyBetterBlockAsm12B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// Zero tmp: 0x280 iterations of 128 bytes = 81920 bytes. BX is the cursor.
	MOVQ $0x00000280, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm12B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm12B
	// Nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8; 0(SP) = dst_base + (src_len - 9) - (src_len >> 5).
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// Searching starts at position 1; from here on BX is the src base.
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm12B:
	// SI = DX + 1 + ((DX - 12(SP)) >> 6): where to continue if nothing matches
	// at DX. The step grows with the number of pending unemitted bytes.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm12B
	// DI = the 8 input bytes at DX.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	// R10 = long index: low 6 bytes of DI (SHLQ 16) * R9, top 14 bits.
	// R11 = short index: low 4 bytes of DI (SHLQ 32) * SI, top 12 bits.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	// SI = long candidate, R8 = short candidate; both slots now point at DX.
	MOVL  (AX)(R10*4), SI
	MOVL  65536(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 65536(AX)(R11*4)
	// Compare all 8 bytes first: long candidate, then short candidate.
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	CMPQ  R11, DI
	JNE   no_short_found_encodeSnappyBetterBlockAsm12B
	MOVL  R8, SI
	JMP   candidate_match_encodeSnappyBetterBlockAsm12B

	// No 8-byte match: accept a 4-byte match (CMPL compares the low 32 bits).
no_short_found_encodeSnappyBetterBlockAsm12B:
	CMPL R10, DI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm12B
	CMPL R11, DI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm12B
	// Nothing matched: continue at the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBetterBlockAsm12B

	// Only the short candidate matched 4 bytes. Before using it, probe the long
	// table for position DX+1 (DI >> 8 holds the bytes starting at DX+1) and
	// prefer that candidate if its first 4 bytes match.
candidateS_match_encodeSnappyBetterBlockAsm12B:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
	// Probe failed: go back to DX and use the short candidate.
	DECL  DX
	MOVL  R8, SI

	// Match found: DX = current position, SI = candidate position.
	// DI = 12(SP) bounds the backwards extension; skip it if SI is 0.
candidate_match_encodeSnappyBetterBlockAsm12B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm12B

	// Step both positions back while the preceding bytes are equal, DX stays
	// above DI and SI stays above 0.
match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm12B

	// Return 0 if CX + 3 + (pending byte count) reaches the dst limit.
match_extend_back_end_encodeSnappyBetterBlockAsm12B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

	// DI = match start. Skip the first 4 bytes on both sides, then count further
	// equal bytes: R8 = bytes left in src after DX, R9 = &src[DX],
	// R10 = &src[SI], R12 = running count.
match_dst_size_check_encodeSnappyBetterBlockAsm12B:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

	// 16 bytes per iteration; XOR is non-zero at the first differing qword.
matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B

	// Difference in the second qword: add 8 plus (index of lowest set bit) / 8.
matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm12B

	// Fewer than 16 bytes left: try one 8-byte step.
matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B

	// Difference in this qword: add (index of lowest set bit) / 8.
matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm12B

	// Tail: 4-byte, then 2-byte, then 1-byte comparisons.
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
	// One byte left goes to the 1-byte compare; zero bytes left ends the count.
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm12B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm12B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm12B
	LEAL 1(R12), R12

	// R8 = DX - SI, the match offset. R12 = equal bytes beyond the first 4.
match_nolit_end_encodeSnappyBetterBlockAsm12B:
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	MOVL R8, 16(SP)
	// Emit the pending bytes [12(SP), DI) as a literal run, unless empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
	// R9 = run length, R10 = &src[run start], SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: below 60, below 256, otherwise 16-bit.
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm12B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
	// This JB tests the same flags as the one above, so it is never taken;
	// execution falls through into the three-byte case either way.
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm12B

	// Tag 0xf4 (= 61<<2) followed by length-1 as a 16-bit value.
three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

	// Tag 0xf0 (= 60<<2) followed by length-1 as one byte. Runs with length-1
	// below 64 use the short copy, longer ones the long copy.
two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B

	// Single tag byte: (length-1) << 2.
one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

	// Short copy of R9 bytes (at most 64) from R10 to CX. SI = CX + R9 is the
	// dst pointer after the run.
memmove_match_emit_encodeSnappyBetterBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

	// Length <= 8: always moves a full 8 bytes, which can be more than R9.
	// NOTE(review): relies on 8 readable src bytes and 8 writable dst bytes
	// here; presumably ensured by the 8(SP) and 0(SP) margins - confirm.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

	// First and last 8 bytes; the two stores overlap when R9 < 16.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

	// First and last 16 bytes; the two stores overlap when R9 < 32.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B

	// First 32 and last 32 bytes; the stores overlap when R9 < 64.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B

	// Long copy of R9 bytes from R10 to CX; SI = dst pointer after the run.
memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// X0..X3 keep the first 32 and last 32 source bytes; they are stored last,
	// with unaligned stores, to cover both ends of the run.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32. R14 = 64 - (CX & 31), so CX + R14 - 32 is 32-byte aligned
	// and the MOVOA stores below hit aligned dst addresses.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ does not modify CF, so this JA (and the JNA closing
	// big_loop_back) combines ZF from DECQ with CF left by an earlier
	// instruction. Kept exactly as generated.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

	// 32 bytes per iteration through running pointers R11 (src) and R15 (dst).
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

	// 32 bytes per iteration at offset R14 - 32, repeated while R9 >= R14.
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance CX past the run.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

	// DX moves to the end of the match, R12 becomes the full match length
	// (including the first 4 bytes), and 12(SP) is set to the match end.
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// While more than 64 bytes remain, emit tag 0xee (= (59<<2)|2) plus the
	// 16-bit offset and subtract 60 from the remaining length.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B

	// Remaining length <= 64. SI = length << 2. Length >= 12 or offset >= 2048
	// needs the three-byte form.
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
	// Two-byte form: tag = ((length-4)<<2 | 1) | (offset>>8)<<5, followed by
	// the low 8 bits of the offset (4*length - 15 == (length-4)<<2 | 1).
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B

	// Three-byte form: tag = (length-1)<<2 | 2 (== 4*length - 2), then the
	// 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

	// Past 8(SP): flush the remainder. dst limit reached: return 0.
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

	// Add positions inside the match to the tables. DI = match start + 1,
	// R9 = DX - 2. Long hashes of src at DI and R9 go to the long table; short
	// hashes of src at DI+1 and R9+1 go to the short table.
match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x32, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x34, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 65536(AX)(R11*4)
	MOVL  R14, 65536(AX)(R13*4)
	// R8 = (DI + R9 + 1) / 2, the midpoint; then DI += 1 and R9 -= 1.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

	// While R8 < R9: store DI and R8 in the long table under the hashes of
	// their bytes, then advance both by 2. Afterwards resume searching at DX.
index_loop_encodeSnappyBetterBlockAsm12B:
	CMPQ  R8, R9
	JAE   search_loop_encodeSnappyBetterBlockAsm12B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeSnappyBetterBlockAsm12B

	// Flush everything from 12(SP) to the end of src as one literal run. AX is
	// reused from here on (the hash tables are no longer accessed). Return 0
	// if CX + 3 + (remaining byte count) reaches the dst limit.
emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm12B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
	// Nothing pending if 12(SP) already equals src_len.
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
	// SI = run length, AX = &src[run start], DX = length - 1.
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Same header selection as for the literal run emitted before a match.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
	// Never taken (same flags as the JB above); falls through to three bytes.
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B

	// Tag 0xf4 followed by length-1 as a 16-bit value.
three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

	// Tag 0xf0 followed by length-1 as one byte.
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B

	// Single tag byte: (length-1) << 2.
one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

	// Short copy: DX = dst pointer after the run, BX = length (BX stops being
	// the src base here). Every size class copies exactly BX bytes.
memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64

	// First and last byte (the same byte when BX is 1).
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

	// Two bytes plus the third byte.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

	// First and last 4 bytes, overlapping.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

	// First and last 8 bytes, overlapping when BX < 16.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

	// First and last 16 bytes, overlapping when BX < 32.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B

	// First 32 and last 32 bytes, overlapping when BX < 64.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B

	// Long copy of BX bytes from AX to CX; DX = dst pointer after the run.
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// X0..X3 keep the first 32 and last 32 source bytes for the final stores.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = BX / 32. R8 = 64 - (CX & 31), so CX + R8 - 32 is 32-byte aligned.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): same DECQ / JA and DECQ / JNA flag pattern as in the long
	// copy of the match path. Kept exactly as generated.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

	// 32 bytes per iteration through running pointers SI (src) and R9 (dst).
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back

	// 32 bytes per iteration at offset R8 - 32, repeated while BX >= R8.
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
	// Write the saved head and tail, then advance CX past the run.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

	// Result = CX - dst_base, the number of bytes written.
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	MOVQ $0x000000a0, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm10B
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm10B:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	MOVL  (AX)(R10*4), SI
	MOVL  16384(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 16384(AX)(R11*4)
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	CMPQ  R11, DI
	JNE   no_short_found_encodeSnappyBetterBlockAsm10B
	MOVL  R8, SI
	JMP   candidate_match_encodeSnappyBetterBlockAsm10B

no_short_found_encodeSnappyBetterBlockAsm10B:
	CMPL R10, DI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL R11, DI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm10B
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBetterBlockAsm10B

candidateS_match_encodeSnappyBetterBlockAsm10B:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  (AX)(R10*4), SI
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm10B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm10B

match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm10B

match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm10B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm10B

three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVL R12, SI
	SHLL $0x02, SI
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL R8, $0x00000800
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x34, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x36, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 16384(AX)(R11*4)
	MOVL  R14, 16384(AX)(R13*4)
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeSnappyBetterBlockAsm10B:
	CMPQ  R8, R9
	JAE   search_loop_encodeSnappyBetterBlockAsm10B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeSnappyBetterBlockAsm10B

emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy operations, using tmp
// as a two-level hash table, and returns the number of bytes written to dst.
// Returns 0 if the output would reach the limit computed at entry
// (dst_base + src_len - 9 - src_len>>5).
// NOTE(review): presumably 0 tells the Go caller to fall back to storing the
// block uncompressed — confirm against the caller.
//
// tmp layout (5120 bytes, zeroed on entry):
//   [0, 4096)    1024 x uint32, indexed by a 10-bit hash of 6 input bytes
//   [4096, 5120)  256 x uint32, indexed by an 8-bit hash of 4 input bytes
// Table entries are positions (offsets into src).
//
// Register roles in the main loop:
//   AX = tmp (hash table base)      BX = src base pointer
//   CX = dst write pointer          DX = s, current position in src
//
// Stack slots:
//   0(SP)  dst limit pointer (8 bytes)
//   8(SP)  src_len - 8; searching stops once a position reaches this
//   12(SP) nextEmit: position of the first src byte not yet emitted
//   16(SP) offset of the most recent match (written, never read, in this function)
//   20(SP) nextS: position to try next if the current one yields no match
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $24-64
	MOVQ tmp+48(FP), AX
	MOVQ dst_base+0(FP), CX
	// 0x28 = 40 iterations x 128 bytes per iteration = 5120 bytes, the whole table.
	MOVQ $0x00000028, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm8B:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm8B
	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	// 8(SP) = src_len - 8
	MOVL  SI, 8(SP)
	// 0(SP) = dst_base + (src_len - 9 - src_len>>5)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	// s = 1, last match offset = 0, BX = src base from here on.
	MOVL  $0x00000001, DX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), BX

search_loop_encodeSnappyBetterBlockAsm8B:
	// nextS = s + 1 + (s - nextEmit)>>4: the step grows as more bytes go unmatched.
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  1(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_encodeSnappyBetterBlockAsm8B
	// DI = 8 bytes of src at s.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = long hash: <<16 discards the top two bytes (6 bytes remain),
	// multiply, keep the top 10 bits.
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	// R11 = short hash: <<32 keeps 4 bytes, multiply, keep the top 8 bits.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x38, R11
	// SI = long-table candidate, R8 = short-table candidate; then record s in both tables.
	MOVL  (AX)(R10*4), SI
	MOVL  4096(AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	MOVL  DX, 4096(AX)(R11*4)
	// Load 8 bytes at each candidate and look for a full 8-byte match first.
	MOVQ  (BX)(SI*1), R10
	MOVQ  (BX)(R8*1), R11
	CMPQ  R10, DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	CMPQ  R11, DI
	JNE   no_short_found_encodeSnappyBetterBlockAsm8B
	MOVL  R8, SI
	JMP   candidate_match_encodeSnappyBetterBlockAsm8B

no_short_found_encodeSnappyBetterBlockAsm8B:
	// No 8-byte match: fall back to comparing only the low 4 bytes.
	CMPL R10, DI
	JEQ  candidate_match_encodeSnappyBetterBlockAsm8B
	CMPL R11, DI
	JEQ  candidateS_match_encodeSnappyBetterBlockAsm8B
	// Nothing matched: s = nextS and try again.
	MOVL 20(SP), DX
	JMP  search_loop_encodeSnappyBetterBlockAsm8B

candidateS_match_encodeSnappyBetterBlockAsm8B:
	// The short candidate matched 4 bytes at s. Before accepting it, probe the
	// long table at s+1: DI>>8 is the data at s+1, hashed with the long hash.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	MOVL  (AX)(R10*4), SI
	// s++ and record it in the long table.
	INCL  DX
	MOVL  DX, (AX)(R10*4)
	// If the long candidate matches 4 bytes at s+1, use it (s stays incremented).
	CMPL  (BX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
	// Otherwise restore s and use the short candidate.
	DECL  DX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm8B:
	// SI = candidate position, DX = s, DI = nextEmit.
	// A candidate at position 0 cannot be extended backwards.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm8B

match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
	// Extend the match backwards while s > nextEmit, the preceding bytes are
	// equal, and the candidate has not reached position 0.
	CMPL DX, DI
	JBE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm8B

match_extend_back_end_encodeSnappyBetterBlockAsm8B:
	// Room check: dst + pending literal bytes (s - nextEmit) + 3 must stay below
	// the limit in 0(SP); otherwise return 0.
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm8B:
	// DI = match start (base). Skip the 4 bytes already compared, then measure
	// how far the match extends:
	//   R8 = bytes left in src after s, R9 = &src[s], R10 = &src[candidate]
	MOVL DX, DI
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), R10

	// matchLen
	// R12 = number of equal bytes found so far.
	XORL R12, R12

matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Compare 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
	// marks the word containing the first difference.
	CMPL R8, $0x10
	JB   matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	MOVQ 8(R9)(R12*1), R13
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
	XORQ 8(R10)(R12*1), R13
	JNZ  matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -16(R8), R8
	LEAL 16(R12), R12
	JMP  matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B:
	// Difference in the second word: index of the lowest set bit / 8 gives the
	// number of equal bytes in that word; add 8 for the first word.
#ifdef GOAMD64_v3
	TZCNTQ R13, R13

#else
	BSFQ R13, R13

#endif
	SARQ $0x03, R13
	LEAL 8(R12)(R13*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Fewer than 16 bytes left: try a single 8-byte compare.
	CMPL R8, $0x08
	JB   matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVQ (R9)(R12*1), R11
	XORQ (R10)(R12*1), R11
	JNZ  matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	JMP  matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Difference in the first word: add (index of lowest set bit) / 8.
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL (R12)(R11*1), R12
	JMP  match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Tail handling: 4-byte, then 2-byte, then 1-byte compares.
	CMPL R8, $0x04
	JB   matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVL (R9)(R12*1), R11
	CMPL (R10)(R12*1), R11
	JNE  matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -4(R8), R8
	LEAL 4(R12), R12

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Exactly 1 byte left -> byte compare; 0 left -> done; otherwise compare 2 bytes.
	CMPL R8, $0x01
	JE   matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	JB   match_nolit_end_encodeSnappyBetterBlockAsm8B
	MOVW (R9)(R12*1), R11
	CMPW (R10)(R12*1), R11
	JNE  matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL 2(R12), R12
	SUBL $0x02, R8
	JZ   match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm8B
	LEAL 1(R12), R12

match_nolit_end_encodeSnappyBetterBlockAsm8B:
	// R8 = match offset. Both DX and SI were advanced by 4 above, so the
	// difference is still (match start - candidate start).
	MOVL DX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is stored in 16(SP); no comparison against a previous offset
	// happens in this function.
	MOVL R8, 16(SP)
	// Emit pending literals src[nextEmit:base], if any.
	//   R9 = literal length, R10 = &src[nextEmit], SI = length - 1
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Choose the literal header size from length-1: <60 -> 1 byte, <256 -> 2 bytes,
	// otherwise 3 bytes.
	CMPL SI, $0x3c
	JB   one_byte_match_emit_encodeSnappyBetterBlockAsm8B
	CMPL SI, $0x00000100
	JB   two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
	// Flags are unchanged since the CMPL above, so this JB is never taken here;
	// control falls through to the three-byte header.
	JB   three_bytes_match_emit_encodeSnappyBetterBlockAsm8B

three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: tag 0xf4 (61<<2), then length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW SI, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: tag 0xf0 (60<<2), then length-1 as 8 bits.
	MOVB $0xf0, (CX)
	MOVB SI, 1(CX)
	ADDQ $0x02, CX
	// length <= 64 can use the short copy; longer goes to the long copy.
	CMPL SI, $0x40
	JB   memmove_match_emit_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: single tag byte (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (CX)
	ADDQ $0x01, CX

memmove_match_emit_encodeSnappyBetterBlockAsm8B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 (<= 64) bytes from R10 to CX using overlapping fixed-size moves.
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
	// Lengths up to 8 always move a full 8 bytes; CX is set to the exact end (SI) afterwards.
	// NOTE(review): this reads and writes up to 7 bytes past the literal —
	// presumably covered by the slack in the src/dst limits; confirm.
	MOVQ (R10), R11
	MOVQ R11, (CX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes (the two moves may overlap).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (CX)
	MOVQ R10, -8(CX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
	MOVQ SI, CX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B

memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
	// SI = dst pointer after the literal bytes.
	LEAQ (CX)(R9*1), SI

	// genMemMoveLong
	// X0/X1 = first 32 source bytes, X2/X3 = last 32; they are stored last, with
	// unaligned moves, to cover both ends of the destination.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = number of 32-byte chunks.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (CX & 31), so that CX + R14 - 32 is 32-byte aligned; the MOVOA
	// stores below need an aligned destination.
	MOVQ  CX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): DECQ leaves CF untouched, so JA here (and JNA below) tests
	// ZF from DECQ combined with CF left by the preceding SUBQ/ADDQ — confirm
	// this is the intended loop condition in the generator.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(CX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	// Copy 32 bytes per pass: unaligned loads, aligned stores.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32-byte chunk ending at offset R14, then advance R14 by 32 while
	// length >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(CX)(R14*1)
	MOVOA X5, -16(CX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail.
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(R9*1)
	MOVOU X3, -16(CX)(R9*1)
	MOVQ  SI, CX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
	// s += extra matched bytes; R12 = total match length (extra + the initial 4);
	// nextEmit = s.
	ADDL R12, DX
	ADDL $0x04, R12
	MOVL DX, 12(SP)

	// emitCopy
	// R12 = remaining copy length, R8 = offset.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
	// While more than 64 bytes remain, emit a 3-byte copy: tag 0xee
	// ((60-1)<<2 | 2) plus the 16-bit offset, covering 60 bytes.
	CMPL R12, $0x40
	JBE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVB $0xee, (CX)
	MOVW R8, 1(CX)
	LEAL -60(R12), R12
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
	// SI = length<<2, the starting point for the tag byte.
	MOVL R12, SI
	SHLL $0x02, SI
	// Lengths of 12 or more need the 3-byte form.
	CMPL R12, $0x0c
	JAE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
	// 2-byte form: tag = (length-4)<<2 | 1 | (offset>>8)<<5, then the low offset byte.
	// NOTE(review): there is no offset < 2048 check on this path — presumably
	// inputs to this variant are short enough that every offset fits in 11 bits;
	// confirm against the Go caller.
	LEAL -15(SI), SI
	MOVB R8, 1(CX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, SI
	MOVB SI, (CX)
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
	// 3-byte form: tag = (length-1)<<2 | 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (CX)
	MOVW R8, 1(CX)
	ADDQ $0x03, CX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
	// Stop searching once s reaches src_len - 8; return 0 if dst reached its limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
	// Index positions inside the match just emitted. DI still holds the match
	// start (base); DX = s is the position just past the match.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	// DI = base + 1, R9 = s - 2.
	LEAQ  1(DI), DI
	LEAQ  -2(DX), R9
	MOVQ  (BX)(DI*1), R10
	MOVQ  1(BX)(DI*1), R11
	MOVQ  (BX)(R9*1), R12
	MOVQ  1(BX)(R9*1), R13
	// Long hashes for positions DI and R9; short hashes for DI+1 and R9+1.
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R12
	IMULQ SI, R12
	SHRQ  $0x36, R12
	SHLQ  $0x20, R13
	IMULQ R8, R13
	SHRQ  $0x38, R13
	LEAQ  1(DI), R8
	LEAQ  1(R9), R14
	MOVL  DI, (AX)(R10*4)
	MOVL  R9, (AX)(R12*4)
	MOVL  R8, 4096(AX)(R11*4)
	MOVL  R14, 4096(AX)(R13*4)
	// R8 = (DI + R9 + 1) >> 1, roughly the midpoint; then step DI forward by one
	// and R9 back by one.
	LEAQ  1(R9)(DI*1), R8
	SHRQ  $0x01, R8
	ADDQ  $0x01, DI
	SUBQ  $0x01, R9

index_loop_encodeSnappyBetterBlockAsm8B:
	// Add long-table entries in two interleaved streams, one starting near the
	// match start (DI) and one from the midpoint (R8), each advancing by 2,
	// until R8 reaches R9.
	CMPQ  R8, R9
	JAE   search_loop_encodeSnappyBetterBlockAsm8B
	MOVQ  (BX)(DI*1), R10
	MOVQ  (BX)(R8*1), R11
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	MOVL  DI, (AX)(R10*4)
	MOVL  R8, (AX)(R11*4)
	ADDQ  $0x02, DI
	ADDQ  $0x02, R8
	JMP   index_loop_encodeSnappyBetterBlockAsm8B

emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Searching is finished; AX (the table pointer) is reused as scratch.
	// Room check for the trailing literal: dst + (src_len - nextEmit) + 3 must
	// stay below the limit, otherwise return 0.
	MOVQ src_len+32(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+56(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
	// Emit src[nextEmit:src_len] as one literal, unless it is empty.
	//   AX = &src[nextEmit], SI = literal length, DX = length - 1
	MOVQ src_len+32(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), DX
	// Same header selection as for literals before a match.
	CMPL DX, $0x3c
	JB   one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB   two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
	// Never taken here (flags unchanged since the CMPL above); falls through.
	JB   three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: tag 0xf4, then length-1 as 16 bits.
	MOVB $0xf4, (CX)
	MOVW DX, 1(CX)
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: tag 0xf0, then length-1 as 8 bits.
	MOVB $0xf0, (CX)
	MOVB DL, 1(CX)
	ADDQ $0x02, CX
	CMPL DX, $0x40
	JB   memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Header: single tag byte (length-1)<<2.
	SHLB $0x02, DL
	MOVB DL, (CX)
	ADDQ $0x01, CX

memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = dst pointer after the literal; BX (src base, no longer needed) = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Exact-size copy of BX (<= 64) bytes from AX to CX, dispatched by length
	// class: 1-2, 3, 4-7, 8-16, 17-32, 33-64.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
	// First byte and last byte (the same byte when length is 1).
	MOVB (AX), SI
	MOVB -1(AX)(BX*1), AL
	MOVB SI, (CX)
	MOVB AL, -1(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
	// Two bytes plus one byte.
	MOVW (AX), SI
	MOVB 2(AX), AL
	MOVW SI, (CX)
	MOVB AL, 2(CX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
	// First 4 and last 4 bytes (overlapping).
	MOVL (AX), SI
	MOVL -4(AX)(BX*1), AX
	MOVL SI, (CX)
	MOVL AX, -4(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	// First 8 and last 8 bytes (overlapping).
	MOVQ (AX), SI
	MOVQ -8(AX)(BX*1), AX
	MOVQ SI, (CX)
	MOVQ AX, -8(CX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (AX), X0
	MOVOU -16(AX)(BX*1), X1
	MOVOU X0, (CX)
	MOVOU X1, -16(CX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVQ DX, CX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = dst pointer after the literal; BX = length.
	LEAQ (CX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long literal copy before a match: save head and tail in
	// X0-X3, copy the middle with aligned stores, then store head and tail.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU -32(AX)(BX*1), X2
	MOVOU -16(AX)(BX*1), X3
	// DI = number of 32-byte chunks; R8 = 64 - (CX & 31).
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  CX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): as above, DECQ does not modify CF, so JA/JNA depend on ZF
	// from DECQ and CF from the preceding SUBQ/ADDQ — confirm intended.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(AX)(R8*1), SI
	LEAQ  -32(CX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(AX)(R8*1), X4
	MOVOU -16(AX)(R8*1), X5
	MOVOA X4, -32(CX)(R8*1)
	MOVOA X5, -16(CX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (CX)
	MOVOU X1, 16(CX)
	MOVOU X2, -32(CX)(BX*1)
	MOVOU X3, -16(CX)(BX*1)
	MOVQ  DX, CX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Return the number of bytes written: dst write pointer - dst_base.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, CX
	MOVQ CX, ret+56(FP)
	RET

// func calcBlockSize(src []byte, tmp *[32768]byte) int
// Requires: BMI, SSE2
TEXT ·calcBlockSize(SB), $24-40
	MOVQ tmp+24(FP), AX
	XORQ CX, CX
	MOVQ $0x00000100, DX
	MOVQ AX, BX
	PXOR X0, X0

zero_loop_calcBlockSize:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_calcBlockSize
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+8(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+0(FP), BX

search_loop_calcBlockSize:
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_calcBlockSize
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x33, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x33, R11
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x33, R10
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_calcBlockSize
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_calcBlockSize

repeat_extend_back_loop_calcBlockSize:
	CMPL DI, SI
	JBE  repeat_extend_back_end_calcBlockSize
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_calcBlockSize
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_calcBlockSize

repeat_extend_back_end_calcBlockSize:
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 5(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_calcBlockSize
	MOVQ $0x00000000, ret+32(FP)
	RET

repeat_dst_size_check_calcBlockSize:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_calcBlockSize
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_calcBlockSize
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_calcBlockSize
	CMPL SI, $0x00010000
	JB   three_bytes_repeat_emit_calcBlockSize
	CMPL SI, $0x01000000
	JB   four_bytes_repeat_emit_calcBlockSize
	ADDQ $0x05, CX
	JMP  memmove_long_repeat_emit_calcBlockSize

four_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x04, CX
	JMP  memmove_long_repeat_emit_calcBlockSize

three_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_calcBlockSize

two_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_calcBlockSize
	JMP  memmove_long_repeat_emit_calcBlockSize

one_byte_repeat_emit_calcBlockSize:
	ADDQ $0x01, CX

memmove_repeat_emit_calcBlockSize:
	LEAQ (CX)(R8*1), CX
	JMP  emit_literal_done_repeat_emit_calcBlockSize

memmove_long_repeat_emit_calcBlockSize:
	LEAQ (CX)(R8*1), CX

emit_literal_done_repeat_emit_calcBlockSize:
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+8(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R11, R11

matchlen_loopback_16_repeat_extend_calcBlockSize:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_calcBlockSize
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSize
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_calcBlockSize
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_calcBlockSize

matchlen_bsf_16repeat_extend_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_calcBlockSize

matchlen_match8_repeat_extend_calcBlockSize:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_calcBlockSize
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSize
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_calcBlockSize

matchlen_bsf_8_repeat_extend_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_calcBlockSize

matchlen_match4_repeat_extend_calcBlockSize:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_calcBlockSize
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_calcBlockSize
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_calcBlockSize:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_calcBlockSize
	JB   repeat_extend_forward_end_calcBlockSize
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_calcBlockSize
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_calcBlockSize

matchlen_match1_repeat_extend_calcBlockSize:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_calcBlockSize
	LEAL 1(R11), R11

repeat_extend_forward_end_calcBlockSize:
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	CMPL DI, $0x00010000
	JB   two_byte_offset_repeat_as_copy_calcBlockSize

four_bytes_loop_back_repeat_as_copy_calcBlockSize:
	CMPL SI, $0x40
	JBE  four_bytes_remain_repeat_as_copy_calcBlockSize
	LEAL -64(SI), SI
	ADDQ $0x05, CX
	CMPL SI, $0x04
	JB   four_bytes_remain_repeat_as_copy_calcBlockSize
	JMP  four_bytes_loop_back_repeat_as_copy_calcBlockSize

four_bytes_remain_repeat_as_copy_calcBlockSize:
	TESTL SI, SI
	JZ    repeat_end_emit_calcBlockSize
	XORL  SI, SI
	ADDQ  $0x05, CX
	JMP   repeat_end_emit_calcBlockSize

two_byte_offset_repeat_as_copy_calcBlockSize:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_calcBlockSize
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_calcBlockSize

two_byte_offset_short_repeat_as_copy_calcBlockSize:
	MOVL SI, R8
	SHLL $0x02, R8
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_calcBlockSize
	CMPL DI, $0x00000800
	JAE  emit_copy_three_repeat_as_copy_calcBlockSize
	ADDQ $0x02, CX
	JMP  repeat_end_emit_calcBlockSize

emit_copy_three_repeat_as_copy_calcBlockSize:
	ADDQ $0x03, CX

repeat_end_emit_calcBlockSize:
	MOVL DX, 12(SP)
	JMP  search_loop_calcBlockSize

no_repeat_found_calcBlockSize:
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_calcBlockSize
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_calcBlockSize
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_calcBlockSize
	MOVL 20(SP), DX
	JMP  search_loop_calcBlockSize

candidate3_match_calcBlockSize:
	ADDL $0x02, DX
	JMP  candidate_match_calcBlockSize

candidate2_match_calcBlockSize:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_calcBlockSize:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_calcBlockSize

match_extend_back_loop_calcBlockSize:
	CMPL DX, DI
	JBE  match_extend_back_end_calcBlockSize
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_calcBlockSize
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_calcBlockSize
	JMP  match_extend_back_loop_calcBlockSize

match_extend_back_end_calcBlockSize:
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 5(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_calcBlockSize
	MOVQ $0x00000000, ret+32(FP)
	RET

match_dst_size_check_calcBlockSize:
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_calcBlockSize
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), DI
	CMPL DI, $0x3c
	JB   one_byte_match_emit_calcBlockSize
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_calcBlockSize
	CMPL DI, $0x00010000
	JB   three_bytes_match_emit_calcBlockSize
	CMPL DI, $0x01000000
	JB   four_bytes_match_emit_calcBlockSize
	ADDQ $0x05, CX
	JMP  memmove_long_match_emit_calcBlockSize

four_bytes_match_emit_calcBlockSize:
	ADDQ $0x04, CX
	JMP  memmove_long_match_emit_calcBlockSize

three_bytes_match_emit_calcBlockSize:
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_calcBlockSize

two_bytes_match_emit_calcBlockSize:
	ADDQ $0x02, CX
	CMPL DI, $0x40
	JB   memmove_match_emit_calcBlockSize
	JMP  memmove_long_match_emit_calcBlockSize

one_byte_match_emit_calcBlockSize:
	ADDQ $0x01, CX

memmove_match_emit_calcBlockSize:
	LEAQ (CX)(R9*1), CX
	JMP  emit_literal_done_match_emit_calcBlockSize

memmove_long_match_emit_calcBlockSize:
	LEAQ (CX)(R9*1), CX

emit_literal_done_match_emit_calcBlockSize:
match_nolit_loop_calcBlockSize:
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+8(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	XORL R10, R10

matchlen_loopback_16_match_nolit_calcBlockSize:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_calcBlockSize
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSize
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_calcBlockSize
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_calcBlockSize

matchlen_bsf_16match_nolit_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_calcBlockSize

matchlen_match8_match_nolit_calcBlockSize:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_calcBlockSize
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSize
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_calcBlockSize

matchlen_bsf_8_match_nolit_calcBlockSize:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_calcBlockSize

matchlen_match4_match_nolit_calcBlockSize:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_calcBlockSize
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_calcBlockSize
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_calcBlockSize:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_calcBlockSize
	JB   match_nolit_end_calcBlockSize
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_calcBlockSize
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_calcBlockSize

matchlen_match1_match_nolit_calcBlockSize:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_calcBlockSize
	LEAL 1(R10), R10

match_nolit_end_calcBlockSize:
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	CMPL SI, $0x00010000
	JB   two_byte_offset_match_nolit_calcBlockSize

four_bytes_loop_back_match_nolit_calcBlockSize:
	CMPL R10, $0x40
	JBE  four_bytes_remain_match_nolit_calcBlockSize
	LEAL -64(R10), R10
	ADDQ $0x05, CX
	CMPL R10, $0x04
	JB   four_bytes_remain_match_nolit_calcBlockSize
	JMP  four_bytes_loop_back_match_nolit_calcBlockSize

four_bytes_remain_match_nolit_calcBlockSize:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_calcBlockSize
	XORL  SI, SI
	ADDQ  $0x05, CX
	JMP   match_nolit_emitcopy_end_calcBlockSize

two_byte_offset_match_nolit_calcBlockSize:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_calcBlockSize
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_calcBlockSize

two_byte_offset_short_match_nolit_calcBlockSize:
	MOVL R10, DI
	SHLL $0x02, DI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_calcBlockSize
	CMPL SI, $0x00000800
	JAE  emit_copy_three_match_nolit_calcBlockSize
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_calcBlockSize

emit_copy_three_match_nolit_calcBlockSize:
	ADDQ $0x03, CX

match_nolit_emitcopy_end_calcBlockSize:
	CMPL DX, 8(SP)
	JAE  emit_remainder_calcBlockSize
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_calcBlockSize
	MOVQ $0x00000000, ret+32(FP)
	RET

match_nolit_dst_ok_calcBlockSize:
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x33, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x33, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_calcBlockSize
	INCL  DX
	JMP   search_loop_calcBlockSize

emit_remainder_calcBlockSize:
	MOVQ src_len+8(FP), AX
	SUBL 12(SP), AX
	LEAQ 5(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_calcBlockSize
	MOVQ $0x00000000, ret+32(FP)
	RET

emit_remainder_ok_calcBlockSize:
	MOVQ src_len+8(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_calcBlockSize
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), AX
	CMPL AX, $0x3c
	JB   one_byte_emit_remainder_calcBlockSize
	CMPL AX, $0x00000100
	JB   two_bytes_emit_remainder_calcBlockSize
	CMPL AX, $0x00010000
	JB   three_bytes_emit_remainder_calcBlockSize
	CMPL AX, $0x01000000
	JB   four_bytes_emit_remainder_calcBlockSize
	ADDQ $0x05, CX
	JMP  memmove_long_emit_remainder_calcBlockSize

four_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x04, CX
	JMP  memmove_long_emit_remainder_calcBlockSize

three_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_calcBlockSize

two_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x02, CX
	CMPL AX, $0x40
	JB   memmove_emit_remainder_calcBlockSize
	JMP  memmove_long_emit_remainder_calcBlockSize

one_byte_emit_remainder_calcBlockSize:
	ADDQ $0x01, CX

memmove_emit_remainder_calcBlockSize:
	LEAQ (CX)(SI*1), AX
	MOVQ AX, CX
	JMP  emit_literal_done_emit_remainder_calcBlockSize

memmove_long_emit_remainder_calcBlockSize:
	LEAQ (CX)(SI*1), AX
	MOVQ AX, CX

emit_literal_done_emit_remainder_calcBlockSize:
	MOVQ CX, ret+32(FP)
	RET

// func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int
// Requires: BMI, SSE2
//
// Runs a hash-table match search over src and, instead of writing encoded
// output, adds the byte cost of each literal run and copy to a counter.
// The only stores are to tmp, the local stack frame and the return slot.
// Returns the counter, or 0 as soon as the counter (plus pending literals and
// a small margin) reaches the limit held in 0(SP).
//
// Registers live across the whole function:
//   AX  tmp, used as a table of 32-bit src positions indexed by hash
//   BX  src base pointer (once setup is done)
//   CX  accumulated output size
//   DX  current position in src
// Stack slots:
//   0(SP)   64-bit size limit: src_len - 9 - (src_len >> 5)
//   8(SP)   src_len - 8; searching stops when the position reaches it
//   12(SP)  position of the first src byte not yet accounted for
//   16(SP)  offset of the most recent match (initialised to 1)
//   20(SP)  position to continue from when no candidate matches
TEXT ·calcBlockSizeSmall(SB), $24-40
	MOVQ tmp+24(FP), AX
	XORQ CX, CX
	MOVQ $0x00000010, DX
	MOVQ AX, BX
	PXOR X0, X0

	// Zero tmp: 16 iterations x 128 bytes = 2048 bytes.
zero_loop_calcBlockSizeSmall:
	MOVOU X0, (BX)
	MOVOU X0, 16(BX)
	MOVOU X0, 32(BX)
	MOVOU X0, 48(BX)
	MOVOU X0, 64(BX)
	MOVOU X0, 80(BX)
	MOVOU X0, 96(BX)
	MOVOU X0, 112(BX)
	ADDQ  $0x80, BX
	DECQ  DX
	JNZ   zero_loop_calcBlockSizeSmall
	// Fill in the stack slots listed in the header; CX is still 0 here, so
	// the limit stored at 0(SP) is src_len - 9 - (src_len >> 5).
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+8(FP), DX
	LEAQ  -9(DX), BX
	LEAQ  -8(DX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, DX
	SUBL  DX, BX
	LEAQ  (CX)(BX*1), BX
	MOVQ  BX, (SP)
	MOVL  $0x00000001, DX
	MOVL  DX, 16(SP)
	MOVQ  src_base+0(FP), BX

search_loop_calcBlockSizeSmall:
	// SI = DX + 4 + ((DX - 12(SP)) >> 4): the step grows with the number of
	// bytes scanned since the last accounted position. Leave the search once
	// SI reaches 8(SP).
	MOVL  DX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(DX)(SI*1), SI
	CMPL  SI, 8(SP)
	JAE   emit_remainder_calcBlockSizeSmall
	// DI = 8 src bytes at DX. R10/R11 = hashes of the 4-byte windows at DX
	// and DX+1: SHLQ 32 keeps the low 4 bytes, multiply, keep the top 9 bits.
	MOVQ  (BX)(DX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x37, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x37, R11
	// SI, R8 = previous table entries (match candidates); then record DX and
	// DX+1 in their slots.
	MOVL  (AX)(R10*4), SI
	MOVL  (AX)(R11*4), R8
	MOVL  DX, (AX)(R10*4)
	LEAL  1(DX), R10
	MOVL  R10, (AX)(R11*4)
	// R10 = hash of the 4-byte window at DX+2.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x37, R10
	// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes located
	// 16(SP) bytes earlier.
	MOVL  DX, R9
	SUBL  16(SP), R9
	MOVL  1(BX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_calcBlockSizeSmall
	// Repeat found. DI = DX+1 (match start), R8 = DI - offset. Step both back
	// while the preceding bytes are equal, DI stays above 12(SP) and R8 has
	// not reached 0.
	LEAL  1(DX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_calcBlockSizeSmall

repeat_extend_back_loop_calcBlockSizeSmall:
	CMPL DI, SI
	JBE  repeat_extend_back_end_calcBlockSizeSmall
	MOVB -1(BX)(R8*1), R9
	MOVB -1(BX)(DI*1), R10
	CMPB R9, R10
	JNE  repeat_extend_back_end_calcBlockSizeSmall
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_calcBlockSizeSmall

repeat_extend_back_end_calcBlockSizeSmall:
	// Return 0 unless CX + (DI - 12(SP)) + 3 is below the limit at 0(SP).
	MOVL DI, SI
	SUBL 12(SP), SI
	LEAQ 3(CX)(SI*1), SI
	CMPQ SI, (SP)
	JB   repeat_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+32(FP)
	RET

repeat_dst_size_check_calcBlockSizeSmall:
	// Account for the literal run [12(SP), DI): R8 = its length, SI = R8 - 1.
	// Header cost is 1 byte when SI < 60, 2 bytes when SI < 256, else 3.
	// Nothing is done when the run is empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_calcBlockSizeSmall
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (BX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB   one_byte_repeat_emit_calcBlockSizeSmall
	CMPL SI, $0x00000100
	JB   two_bytes_repeat_emit_calcBlockSizeSmall
	// The next JB targets the label that immediately follows, so execution
	// reaches three_bytes_* whether or not it is taken.
	JB   three_bytes_repeat_emit_calcBlockSizeSmall

three_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x03, CX
	JMP  memmove_long_repeat_emit_calcBlockSizeSmall

two_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x02, CX
	CMPL SI, $0x40
	JB   memmove_repeat_emit_calcBlockSizeSmall
	JMP  memmove_long_repeat_emit_calcBlockSizeSmall

one_byte_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x01, CX

	// Both "memmove" labels only add the literal length R8 to CX.
memmove_repeat_emit_calcBlockSizeSmall:
	LEAQ (CX)(R8*1), CX
	JMP  emit_literal_done_repeat_emit_calcBlockSizeSmall

memmove_long_repeat_emit_calcBlockSizeSmall:
	LEAQ (CX)(R8*1), CX

emit_literal_done_repeat_emit_calcBlockSizeSmall:
	// Step past the 4 bytes already compared (DX+1..DX+4) and measure how far
	// the repeat continues. R8 = bytes left in src, R9 = src+DX,
	// SI = src+DX-offset.
	ADDL $0x05, DX
	MOVL DX, SI
	SUBL 16(SP), SI
	MOVQ src_len+8(FP), R8
	SUBL DX, R8
	LEAQ (BX)(DX*1), R9
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R11 counts equal bytes between (R9) and (SI), at most R8 of them.
	XORL R11, R11

	// 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result marks
	// the first difference.
matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
	CMPL R8, $0x10
	JB   matchlen_match8_repeat_extend_calcBlockSizeSmall
	MOVQ (R9)(R11*1), R10
	MOVQ 8(R9)(R11*1), R12
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	XORQ 8(SI)(R11*1), R12
	JNZ  matchlen_bsf_16repeat_extend_calcBlockSizeSmall
	LEAL -16(R8), R8
	LEAL 16(R11), R11
	JMP  matchlen_loopback_16_repeat_extend_calcBlockSizeSmall

	// Difference in the second 8 bytes: index of lowest set bit / 8 gives the
	// number of equal bytes, plus the 8 that matched before it.
matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R12, R12

#else
	BSFQ R12, R12

#endif
	SARQ $0x03, R12
	LEAL 8(R11)(R12*1), R11
	JMP  repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match8_repeat_extend_calcBlockSizeSmall:
	CMPL R8, $0x08
	JB   matchlen_match4_repeat_extend_calcBlockSizeSmall
	MOVQ (R9)(R11*1), R10
	XORQ (SI)(R11*1), R10
	JNZ  matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	JMP  matchlen_match4_repeat_extend_calcBlockSizeSmall

	// Difference in an 8-byte word: lowest set bit index / 8 equal bytes.
matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP  repeat_extend_forward_end_calcBlockSizeSmall

	// Tail: try 4, then 2, then 1 byte.
matchlen_match4_repeat_extend_calcBlockSizeSmall:
	CMPL R8, $0x04
	JB   matchlen_match2_repeat_extend_calcBlockSizeSmall
	MOVL (R9)(R11*1), R10
	CMPL (SI)(R11*1), R10
	JNE  matchlen_match2_repeat_extend_calcBlockSizeSmall
	LEAL -4(R8), R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_calcBlockSizeSmall:
	CMPL R8, $0x01
	JE   matchlen_match1_repeat_extend_calcBlockSizeSmall
	JB   repeat_extend_forward_end_calcBlockSizeSmall
	MOVW (R9)(R11*1), R10
	CMPW (SI)(R11*1), R10
	JNE  matchlen_match1_repeat_extend_calcBlockSizeSmall
	LEAL 2(R11), R11
	SUBL $0x02, R8
	JZ   repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match1_repeat_extend_calcBlockSizeSmall:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_calcBlockSizeSmall
	LEAL 1(R11), R11

repeat_extend_forward_end_calcBlockSizeSmall:
	// DX = end of the match, SI = match length (DX - match start in DI),
	// DI = offset reloaded from 16(SP).
	ADDL R11, DX
	MOVL DX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// Cost only: 3 bytes for every 60 bytes of length while length > 64, then
	// 2 bytes if the remainder is below 12 and 3 bytes otherwise.
two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
	CMPL SI, $0x40
	JBE  two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
	LEAL -60(SI), SI
	ADDQ $0x03, CX
	JMP  two_byte_offset_repeat_as_copy_calcBlockSizeSmall

two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
	MOVL SI, DI
	SHLL $0x02, DI
	CMPL SI, $0x0c
	JAE  emit_copy_three_repeat_as_copy_calcBlockSizeSmall
	ADDQ $0x02, CX
	JMP  repeat_end_emit_calcBlockSizeSmall

emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
	ADDQ $0x03, CX

repeat_end_emit_calcBlockSizeSmall:
	// Everything up to DX is now accounted for.
	MOVL DX, 12(SP)
	JMP  search_loop_calcBlockSizeSmall

no_repeat_found_calcBlockSizeSmall:
	// Test three candidates in turn: SI against the bytes at DX, R8 against
	// DX+1, and the table entry for hash R10 against DX+2. DI is shifted right
	// one byte per attempt so its low 4 bytes are the window being tested.
	// DX+2 (R9) is written to the table on both the hit and the miss path.
	CMPL (BX)(SI*1), DI
	JEQ  candidate_match_calcBlockSizeSmall
	SHRQ $0x08, DI
	MOVL (AX)(R10*4), SI
	LEAL 2(DX), R9
	CMPL (BX)(R8*1), DI
	JEQ  candidate2_match_calcBlockSizeSmall
	MOVL R9, (AX)(R10*4)
	SHRQ $0x08, DI
	CMPL (BX)(SI*1), DI
	JEQ  candidate3_match_calcBlockSizeSmall
	// No match: continue from the position saved in 20(SP).
	MOVL 20(SP), DX
	JMP  search_loop_calcBlockSizeSmall

candidate3_match_calcBlockSizeSmall:
	ADDL $0x02, DX
	JMP  candidate_match_calcBlockSizeSmall

candidate2_match_calcBlockSizeSmall:
	MOVL R9, (AX)(R10*4)
	INCL DX
	MOVL R8, SI

candidate_match_calcBlockSizeSmall:
	// DX = match position, SI = candidate position. Step both back while the
	// preceding bytes are equal, DX stays above 12(SP) and SI is non-zero.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_calcBlockSizeSmall

match_extend_back_loop_calcBlockSizeSmall:
	CMPL DX, DI
	JBE  match_extend_back_end_calcBlockSizeSmall
	MOVB -1(BX)(SI*1), R8
	MOVB -1(BX)(DX*1), R9
	CMPB R8, R9
	JNE  match_extend_back_end_calcBlockSizeSmall
	LEAL -1(DX), DX
	DECL SI
	JZ   match_extend_back_end_calcBlockSizeSmall
	JMP  match_extend_back_loop_calcBlockSizeSmall

match_extend_back_end_calcBlockSizeSmall:
	// Return 0 unless CX + (DX - 12(SP)) + 3 is below the limit at 0(SP).
	MOVL DX, DI
	SUBL 12(SP), DI
	LEAQ 3(CX)(DI*1), DI
	CMPQ DI, (SP)
	JB   match_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+32(FP)
	RET

match_dst_size_check_calcBlockSizeSmall:
	// Account for the literal run [12(SP), DX): R9 = its length, DI = R9 - 1.
	// Header cost is 1 byte when DI < 60, 2 bytes when DI < 256, else 3.
	MOVL DX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_calcBlockSizeSmall
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (BX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), DI
	CMPL DI, $0x3c
	JB   one_byte_match_emit_calcBlockSizeSmall
	CMPL DI, $0x00000100
	JB   two_bytes_match_emit_calcBlockSizeSmall
	// The next JB targets the label that immediately follows, so execution
	// reaches three_bytes_* whether or not it is taken.
	JB   three_bytes_match_emit_calcBlockSizeSmall

three_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x03, CX
	JMP  memmove_long_match_emit_calcBlockSizeSmall

two_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x02, CX
	CMPL DI, $0x40
	JB   memmove_match_emit_calcBlockSizeSmall
	JMP  memmove_long_match_emit_calcBlockSizeSmall

one_byte_match_emit_calcBlockSizeSmall:
	ADDQ $0x01, CX

	// Both "memmove" labels only add the literal length R9 to CX.
memmove_match_emit_calcBlockSizeSmall:
	LEAQ (CX)(R9*1), CX
	JMP  emit_literal_done_match_emit_calcBlockSizeSmall

memmove_long_match_emit_calcBlockSizeSmall:
	LEAQ (CX)(R9*1), CX

emit_literal_done_match_emit_calcBlockSizeSmall:
match_nolit_loop_calcBlockSizeSmall:
	// Record the offset DX - SI in 16(SP), step past the 4 bytes known to
	// match, and measure the rest. DI = bytes left in src, R8 = src+DX,
	// SI = src+candidate.
	MOVL DX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, DX
	ADDL $0x04, SI
	MOVQ src_len+8(FP), DI
	SUBL DX, DI
	LEAQ (BX)(DX*1), R8
	LEAQ (BX)(SI*1), SI

	// matchLen
	// R10 counts equal bytes between (R8) and (SI), at most DI of them.
	XORL R10, R10

	// 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result marks
	// the first difference.
matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
	CMPL DI, $0x10
	JB   matchlen_match8_match_nolit_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	XORQ 8(SI)(R10*1), R11
	JNZ  matchlen_bsf_16match_nolit_calcBlockSizeSmall
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP  matchlen_loopback_16_match_nolit_calcBlockSizeSmall

	// Difference in the second 8 bytes: lowest set bit index / 8 equal bytes,
	// plus the 8 that matched before it.
matchlen_bsf_16match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP  match_nolit_end_calcBlockSizeSmall

matchlen_match8_match_nolit_calcBlockSizeSmall:
	CMPL DI, $0x08
	JB   matchlen_match4_match_nolit_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	XORQ (SI)(R10*1), R9
	JNZ  matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP  matchlen_match4_match_nolit_calcBlockSizeSmall

	// Difference in an 8-byte word: lowest set bit index / 8 equal bytes.
matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP  match_nolit_end_calcBlockSizeSmall

	// Tail: try 4, then 2, then 1 byte.
matchlen_match4_match_nolit_calcBlockSizeSmall:
	CMPL DI, $0x04
	JB   matchlen_match2_match_nolit_calcBlockSizeSmall
	MOVL (R8)(R10*1), R9
	CMPL (SI)(R10*1), R9
	JNE  matchlen_match2_match_nolit_calcBlockSizeSmall
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_match_nolit_calcBlockSizeSmall:
	CMPL DI, $0x01
	JE   matchlen_match1_match_nolit_calcBlockSizeSmall
	JB   match_nolit_end_calcBlockSizeSmall
	MOVW (R8)(R10*1), R9
	CMPW (SI)(R10*1), R9
	JNE  matchlen_match1_match_nolit_calcBlockSizeSmall
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ   match_nolit_end_calcBlockSizeSmall

matchlen_match1_match_nolit_calcBlockSizeSmall:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_calcBlockSizeSmall
	LEAL 1(R10), R10

match_nolit_end_calcBlockSizeSmall:
	// DX = end of the match, SI = offset, R10 = total match length (the
	// extension plus the initial 4 bytes); 12(SP) advances to DX.
	ADDL R10, DX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL DX, 12(SP)

	// emitCopy
	// Cost only: 3 bytes for every 60 bytes of length while length > 64, then
	// 2 bytes if the remainder is below 12 and 3 bytes otherwise.
two_byte_offset_match_nolit_calcBlockSizeSmall:
	CMPL R10, $0x40
	JBE  two_byte_offset_short_match_nolit_calcBlockSizeSmall
	LEAL -60(R10), R10
	ADDQ $0x03, CX
	JMP  two_byte_offset_match_nolit_calcBlockSizeSmall

two_byte_offset_short_match_nolit_calcBlockSizeSmall:
	MOVL R10, SI
	SHLL $0x02, SI
	CMPL R10, $0x0c
	JAE  emit_copy_three_match_nolit_calcBlockSizeSmall
	ADDQ $0x02, CX
	JMP  match_nolit_emitcopy_end_calcBlockSizeSmall

emit_copy_three_match_nolit_calcBlockSizeSmall:
	ADDQ $0x03, CX

match_nolit_emitcopy_end_calcBlockSizeSmall:
	// Finish if DX reached 8(SP). Otherwise load 8 bytes at DX-2 and return 0
	// if CX is no longer below the limit.
	CMPL DX, 8(SP)
	JAE  emit_remainder_calcBlockSizeSmall
	MOVQ -2(BX)(DX*1), DI
	CMPQ CX, (SP)
	JB   match_nolit_dst_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+32(FP)
	RET

match_nolit_dst_ok_calcBlockSizeSmall:
	// R8 = hash of the window at DX-2, SI = hash of the window at DX (DI is
	// shifted so its low bytes are the bytes at DX). Store DX-2 and DX in the
	// table; SI becomes the previous entry for DX's hash. If its 4 bytes equal
	// those at DX, go straight to another match with no literals in between;
	// otherwise advance one byte and search.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x37, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x37, SI
	LEAL  -2(DX), R9
	LEAQ  (AX)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, (AX)(R8*4)
	MOVL  DX, (R10)
	CMPL  (BX)(SI*1), DI
	JEQ   match_nolit_loop_calcBlockSizeSmall
	INCL  DX
	JMP   search_loop_calcBlockSizeSmall

emit_remainder_calcBlockSizeSmall:
	// Return 0 unless CX + (src_len - 12(SP)) + 3 is below the limit.
	MOVQ src_len+8(FP), AX
	SUBL 12(SP), AX
	LEAQ 3(CX)(AX*1), AX
	CMPQ AX, (SP)
	JB   emit_remainder_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+32(FP)
	RET

emit_remainder_ok_calcBlockSizeSmall:
	// Account for the trailing literal run [12(SP), src_len): SI = its length,
	// AX = SI - 1. Header cost is 1 byte when AX < 60, 2 bytes when AX < 256,
	// else 3. AX no longer holds tmp from here on.
	MOVQ src_len+8(FP), AX
	MOVL 12(SP), DX
	CMPL DX, AX
	JEQ  emit_literal_done_emit_remainder_calcBlockSizeSmall
	MOVL AX, SI
	MOVL AX, 12(SP)
	LEAQ (BX)(DX*1), AX
	SUBL DX, SI
	LEAL -1(SI), AX
	CMPL AX, $0x3c
	JB   one_byte_emit_remainder_calcBlockSizeSmall
	CMPL AX, $0x00000100
	JB   two_bytes_emit_remainder_calcBlockSizeSmall
	// The next JB targets the label that immediately follows, so execution
	// reaches three_bytes_* whether or not it is taken.
	JB   three_bytes_emit_remainder_calcBlockSizeSmall

three_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x03, CX
	JMP  memmove_long_emit_remainder_calcBlockSizeSmall

two_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x02, CX
	CMPL AX, $0x40
	JB   memmove_emit_remainder_calcBlockSizeSmall
	JMP  memmove_long_emit_remainder_calcBlockSizeSmall

one_byte_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x01, CX

	// Both "memmove" labels only add the literal length SI to CX.
memmove_emit_remainder_calcBlockSizeSmall:
	LEAQ (CX)(SI*1), AX
	MOVQ AX, CX
	JMP  emit_literal_done_emit_remainder_calcBlockSizeSmall

memmove_long_emit_remainder_calcBlockSizeSmall:
	LEAQ (CX)(SI*1), AX
	MOVQ AX, CX

emit_literal_done_emit_remainder_calcBlockSizeSmall:
	// Result: the accumulated size.
	MOVQ CX, ret+32(FP)
	RET

// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// Writes a literal header for len(lit) to dst, copies lit after it, and
// returns the number of bytes written (header + len(lit)); returns 0 when lit
// is empty. dst's length is never read here, so the caller is presumably
// responsible for providing enough room - NOTE(review): confirm at call sites.
//
// Registers: AX = write pointer into dst, CX = lit pointer, DX = len(lit),
// BX = return value (starts at len(lit), header size is added), SI = len-1.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip
	// Pick the header size from SI = len-1:
	//   < 60      1 byte:  SI<<2
	//   < 1<<8    2 bytes: 0xf0, then 1 byte of SI
	//   < 1<<16   3 bytes: 0xf4, then 2 bytes of SI
	//   < 1<<24   4 bytes: 0xf8, then 3 bytes of SI
	//   otherwise 5 bytes: 0xfc, then 4 bytes of SI
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JB    one_byte_standalone
	CMPL  SI, $0x00000100
	JB    two_bytes_standalone
	CMPL  SI, $0x00010000
	JB    three_bytes_standalone
	CMPL  SI, $0x01000000
	JB    four_bytes_standalone
	MOVB  $0xfc, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   memmove_long_standalone

four_bytes_standalone:
	// DI = bits 16..23 of SI, stored as the third length byte.
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	// len-1 < 64 means len <= 64, which the short copy below can handle.
	CMPL SI, $0x40
	JB   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copy 1..64 bytes. Each size class loads a head and a tail piece (which
	// may overlap) before storing either, so every length in the class is
	// covered without a loop.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	// First byte and last byte (the same byte when len == 1).
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// Reached only with len > 64. X0-X3 keep the first and last 32 bytes of
	// lit; they are stored unaligned at the very end and cover the bytes at
	// either edge that the aligned chunk copy does not reach.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	// DI = len / 32. R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte
	// aligned; the MOVOA stores below depend on that alignment.
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8, then advance R8 by 32; repeat
	// while R8 <= len.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty lit: nothing written.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET

// func emitRepeat(dst []byte, offset int, length int) int
//
// Writes one or more repeat ops for `length` to dst and returns the number of
// bytes written. `offset` is only consulted for the 2-byte form that embeds
// it. dst's length is never read here.
//
// Registers: AX = write pointer, BX = bytes written, CX = offset,
// DX = length (then length-4), SI = scratch.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitRepeat
	// SI = length, DX = length-4. Form selection:
	//   length <= 8                      2 bytes, no offset
	//   length < 12 and offset < 2048    2 bytes, offset embedded
	//   otherwise by DX: < 260 -> 3 bytes, < 65792 -> 4 bytes,
	//   < 0x0100ffff -> 5 bytes, else emit a maximal 5-byte op and loop.
emit_repeat_again_standalone:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone

cant_repeat_two_offset_standalone:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone
	CMPL DX, $0x00010100
	JB   repeat_four_standalone
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone
	// Too long for one op: write bytes 1d 00 fb ff ff, take 16842747 off DX
	// and start over with DX as the remaining length.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone

repeat_five_standalone:
	// 5 bytes: 1d 00, then the low 24 bits of DX-65536 (16 bits via MOVW, the
	// next 8 via CL). CX (offset) is reused as scratch here.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_repeat_end

repeat_four_standalone:
	// 4 bytes: 19 00, then 16 bits of DX-256.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_repeat_end

repeat_three_standalone:
	// 3 bytes: 15 00, then 8 bits of DX-4.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_repeat_end

repeat_two_standalone:
	// 2 bytes: 16-bit value (DX<<2)|1; DX <= 4 here, so the high byte is 0.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_repeat_end

repeat_two_offset_standalone:
	// 2 bytes: byte0 = (DX<<2 | 1) | ((offset>>8)<<5), byte1 = low byte of
	// offset. SI is zeroed only to serve as the LEAL base.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// Writes a copy op for (offset, length) to dst and returns the number of
// bytes written. When length exceeds what the first copy op can carry, the
// rest is written with the repeat-op ladder (three inlined instances below).
// dst's length is never read here.
//
// Registers: AX = write pointer, BX = bytes written, CX = offset,
// DX = remaining length, SI/DI = scratch.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// offset >= 65536 needs the 5-byte form (tag + 32-bit offset). With
	// length > 64, write tag 0xff + offset, take 64 off the length, and
	// continue with repeats unless fewer than 4 bytes remain.
	CMPL CX, $0x00010000
	JB   two_byte_offset_standalone
	CMPL DX, $0x40
	JBE  four_bytes_remain_standalone
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB   four_bytes_remain_standalone

	// emitRepeat
	// SI = length, DX = length-4. Form selection:
	//   length <= 8                      2 bytes, no offset
	//   length < 12 and offset < 2048    2 bytes, offset embedded
	//   otherwise by DX: < 260 -> 3 bytes, < 65792 -> 4 bytes,
	//   < 0x0100ffff -> 5 bytes, else emit a maximal 5-byte op and loop.
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy
	// Too long for one op: write bytes 1d 00 fb ff ff, take 16842747 off DX
	// and start over with DX as the remaining length.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	// 5 bytes: 1d 00, then the low 24 bits of DX-65536.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy:
	// 4 bytes: 19 00, then 16 bits of DX-256.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy:
	// 3 bytes: 15 00, then 8 bits of DX-4.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy:
	// 2 bytes: 16-bit value (DX<<2)|1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	// 2 bytes: byte0 = (DX<<2 | 1) | ((offset>>8)<<5), byte1 = low byte of
	// offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

four_bytes_remain_standalone:
	// Final 5-byte copy for the remaining length (nothing if it is 0):
	// tag = 4*DX - 1, followed by the 32-bit offset.
	TESTL DX, DX
	JZ    gen_emit_copy_end
	XORL  SI, SI
	LEAL  -1(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end

two_byte_offset_standalone:
	// offset < 65536. length <= 64 is handled by a single short copy below.
	// Otherwise, with offset < 2048: write a 2-byte copy with
	// byte0 = 17 | ((offset>>8)<<5) and byte1 = low byte of offset, take 8
	// off the length, and continue with repeats.
	CMPL DX, $0x40
	JBE  two_byte_offset_short_standalone
	CMPL CX, $0x00000800
	JAE  long_offset_short_standalone
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	// Enters the ladder past the two 2-byte checks: DX is reduced by 4 here
	// and control goes straight to the 3/4/5-byte selection. The
	// emit_repeat_again_* label is only reached from the loop-back jump.
	LEAL -4(DX), DX
	JMP  cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy_short_2b
	// Maximal 5-byte repeat, then loop with the reduced DX.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	// 5 bytes: 1d 00, then the low 24 bits of DX-65536.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	// 4 bytes: 19 00, then 16 bits of DX-256.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	// 3 bytes: 15 00, then 8 bits of DX-4.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	// 2 bytes: 16-bit value (DX<<2)|1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	// 2 bytes: byte0 = (DX<<2 | 1) | ((offset>>8)<<5), byte1 = low byte of
	// offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

long_offset_short_standalone:
	// 2048 <= offset < 65536 and length > 64: write a 3-byte copy (tag 0xee
	// + 16-bit offset), take 60 off the length, and continue with repeats.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
	// Same form selection as the first ladder in this function.
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE  repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JAE  cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JB   repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JB   repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JB   repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JB   repeat_five_standalone_emit_copy_short
	// Maximal 5-byte repeat, then loop with the reduced DX.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	// 5 bytes: 1d 00, then the low 24 bits of DX-65536.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	// 4 bytes: 19 00, then 16 bits of DX-256.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	// 3 bytes: 15 00, then 8 bits of DX-4.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	// 2 bytes: 16-bit value (DX<<2)|1.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	// 2 bytes: byte0 = (DX<<2 | 1) | ((offset>>8)<<5), byte1 = low byte of
	// offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64, offset < 65536. SI = 4*length.
	//   length < 12 and offset < 2048: 2 bytes,
	//     byte0 = (4*length - 15) | ((offset>>8)<<5), byte1 = low offset byte
	//   otherwise: 3 bytes, byte0 = 4*length - 2, then the 16-bit offset
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE  emit_copy_three_standalone
	CMPL CX, $0x00000800
	JAE  emit_copy_three_standalone
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

emit_copy_three_standalone:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// Emits copy operations for (offset, length) starting at dst_base and returns
// the number of bytes written. dst_len is never read, so no bounds checking
// happens in this routine.
//
// Register use: AX = write cursor, BX = bytes written so far, CX = offset,
// DX = length still to encode, SI = scratch for the tag byte.
// NOTE(review): the tag layout looks like the Snappy copy elements (no repeat
// codes are produced) - confirm against the Go reference encoder.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// Offsets below 65536 use the 2-/3-byte forms; larger ones need the
	// 5-byte form that carries a 32-bit offset.
	CMPL CX, $0x00010000
	JB   two_byte_offset_standalone_snappy

	// While more than 64 bytes remain: write tag 0xff plus the 32-bit offset
	// (5 bytes) and take 64 off the remaining length.
four_bytes_loop_back_standalone_snappy:
	CMPL DX, $0x40
	JBE  four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB   four_bytes_remain_standalone_snappy
	JMP  four_bytes_loop_back_standalone_snappy

	// Final 5-byte op: nothing is written when the remaining length is zero,
	// otherwise the tag is 4*length-1, i.e. ((length-1)<<2)|3.
four_bytes_remain_standalone_snappy:
	TESTL DX, DX
	JZ    gen_emit_copy_end_snappy
	XORL  SI, SI
	LEAL  -1(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end_snappy

	// Offset fits in 16 bits. While more than 64 bytes remain: write tag 0xee
	// plus the 16-bit offset (3 bytes) and take 60 off the remaining length.
two_byte_offset_standalone_snappy:
	CMPL DX, $0x40
	JBE  two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  two_byte_offset_standalone_snappy

	// Remaining length <= 64. SI = 4*length is the base of the tag byte.
	// The 2-byte form is used only when length < 12 and offset < 2048:
	// tag = (4*length-15) | (offset>>8)<<5, second byte = low 8 offset bits.
two_byte_offset_short_standalone_snappy:
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE  emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JAE  emit_copy_three_standalone_snappy
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end_snappy

	// 3-byte form: tag = 4*length-2, followed by the 16-bit offset.
emit_copy_three_standalone_snappy:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

	// Return the byte count accumulated in BX.
gen_emit_copy_end_snappy:
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// Returns the number of leading bytes that are equal in a and b. Only a_len
// is loaded (b_len is never read), and every length comparison below is a
// 32-bit CMPL on DX, so only the low 32 bits of a_len take part.
//
// Register use: AX = a base, CX = b base, DX = bytes left to compare,
// SI = bytes matched so far (the return value), BX/DI = scratch.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI

	// Compare 16 bytes per iteration as two 8-byte words. XOR leaves a
	// non-zero word as soon as any byte differs.
matchlen_loopback_16_standalone:
	CMPL DX, $0x10
	JB   matchlen_match8_standalone
	MOVQ (AX)(SI*1), BX
	MOVQ 8(AX)(SI*1), DI
	XORQ (CX)(SI*1), BX
	JNZ  matchlen_bsf_8_standalone
	XORQ 8(CX)(SI*1), DI
	JNZ  matchlen_bsf_16standalone
	LEAL -16(DX), DX
	LEAL 16(SI), SI
	JMP  matchlen_loopback_16_standalone

	// Difference is in the second word: index of the lowest set bit divided
	// by 8 is the count of equal bytes in that word; add 8 for the first word.
matchlen_bsf_16standalone:
#ifdef GOAMD64_v3
	TZCNTQ DI, DI

#else
	BSFQ DI, DI

#endif
	SARQ $0x03, DI
	LEAL 8(SI)(DI*1), SI
	JMP  gen_match_len_end

	// Fewer than 16 bytes left: try one 8-byte word.
matchlen_match8_standalone:
	CMPL DX, $0x08
	JB   matchlen_match4_standalone
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ  matchlen_bsf_8_standalone
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	JMP  matchlen_match4_standalone

	// Difference is in the word held in BX: add its count of equal low bytes.
matchlen_bsf_8_standalone:
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP  gen_match_len_end

	// Fewer than 8 bytes left: try 4 bytes. On a mismatch, fall through to
	// the 2-byte and 1-byte checks with SI and DX unchanged.
matchlen_match4_standalone:
	CMPL DX, $0x04
	JB   matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE  matchlen_match2_standalone
	LEAL -4(DX), DX
	LEAL 4(SI), SI

	// DX == 1 goes straight to the single byte check, DX == 0 is done,
	// otherwise compare 2 bytes.
matchlen_match2_standalone:
	CMPL DX, $0x01
	JE   matchlen_match1_standalone
	JB   gen_match_len_end
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE  matchlen_match1_standalone
	LEAL 2(SI), SI
	SUBL $0x02, DX
	JZ   gen_match_len_end

	// Compare one final byte.
matchlen_match1_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Re-encodes the LZ4 sequences in src into dst, emitting literal, copy and
// repeat operations. On success uncompressed is the sum of all literal and
// match lengths and dstUsed is the number of bytes written to dst. On failure
// uncompressed is -1 (source rejected) or -2 (dst limit reached) and dstUsed
// is not written.
// NOTE(review): the emitted tag bytes presumably follow the S2 block format -
// confirm against the Go reference converter.
//
// Register use:
//   AX  = dst write cursor          CX  = dst limit (dst_base + dst_len - 8)
//   DX  = src read cursor           BX  = src end (src_base + src_len)
//   SI  = uncompressed byte count   DI  = offset of the last emitted copy
//   R8  = token / scratch           R9  = literal length, later match offset
//   R10 = match length              R11-R15, X0-X5 = scratch
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX
	XORQ DI, DI

	// One iteration per sequence. Read the token byte: the high nibble goes
	// to R9 (literal length), the low nibble to R10 (match length). A token
	// below 0xf0 has no literal-length extension bytes.
lz4_s2_loop:
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	CMPQ    AX, CX
	JAE     lz4_s2_dstfull
	MOVBQZX (DX), R8
	MOVQ    R8, R9
	MOVQ    R8, R10
	SHRQ    $0x04, R9
	ANDQ    $0x0f, R10
	CMPQ    R8, $0xf0
	JB      lz4_s2_ll_end

	// Literal-length extension: keep adding bytes while the byte read is 0xff.
lz4_s2_ll_loop:
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ    R8, R9
	CMPQ    R8, $0xff
	JEQ     lz4_s2_ll_loop

	// DX still points at the last length byte. R8 becomes the address just
	// past the literals, and R10 gets the minimum match length (4) added.
	// Nothing is emitted for a zero literal length.
lz4_s2_ll_end:
	LEAQ  (DX)(R9*1), R8
	ADDQ  $0x04, R10
	CMPQ  R8, BX
	JAE   lz4_s2_corrupt
	INCQ  DX
	INCQ  R8
	TESTQ R9, R9
	JZ    lz4_s2_lits_done
	LEAQ  (AX)(R9*1), R11
	CMPQ  R11, CX
	JAE   lz4_s2_dstfull
	ADDQ  R9, SI
	// Literal header, selected by R11 = length-1: a 1-byte tag below 60,
	// otherwise tag 0xf0/0xf4/0xf8/0xfc followed by 1/2/3/4 length bytes.
	LEAL  -1(R9), R11
	CMPL  R11, $0x3c
	JB    one_byte_lz4_s2
	CMPL  R11, $0x00000100
	JB    two_bytes_lz4_s2
	CMPL  R11, $0x00010000
	JB    three_bytes_lz4_s2
	CMPL  R11, $0x01000000
	JB    four_bytes_lz4_s2
	MOVB  $0xfc, (AX)
	MOVL  R11, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4_s2

four_bytes_lz4_s2:
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4_s2

three_bytes_lz4_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4_s2

	// A 2-byte header with length-1 < 64 still uses the short copy below.
two_bytes_lz4_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JB   memmove_lz4_s2
	JMP  memmove_long_lz4_s2

one_byte_lz4_s2:
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

	// Short literal copy. R11 = dst cursor after the literals.
memmove_lz4_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_lz4_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_lz4_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_lz4_s2_memmove_move_17through32
	JMP  emit_lit_memmove_lz4_s2_memmove_move_33through64

	// Lengths up to 8 always move a full 8 bytes, whatever the length is.
emit_lit_memmove_lz4_s2_memmove_move_8:
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP  memmove_end_copy_lz4_s2

	// Two overlapping 8-byte moves. DX is reused as a temporary here; it is
	// reloaded from R8 at lz4_s2_lits_emit_done.
emit_lit_memmove_lz4_s2_memmove_move_8through16:
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP  memmove_end_copy_lz4_s2

	// Two overlapping 16-byte moves.
emit_lit_memmove_lz4_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_lz4_s2

	// First 32 and last 32 bytes, overlapping in the middle.
emit_lit_memmove_lz4_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4_s2:
	MOVQ R11, AX
	JMP  lz4_s2_lits_emit_done

	// Long literal copy. R11 = dst cursor after the literals.
memmove_long_lz4_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// The first and last 32 source bytes are loaded up front (X0-X3) and
	// stored last with unaligned moves. R14 = 64 - (AX & 31), so the MOVOA
	// stores at AX+R14-32 land on a 32-byte boundary.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R12
	ANDL  $0x0000001f, R12
	MOVQ  $0x00000040, R14
	SUBQ  R12, R14
	// NOTE(review): DECQ does not modify CF, so JA here sees ZF from the
	// DECQ together with CF from the SUBQ above.
	DECQ  R13
	JA    emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	LEAQ  -32(DX)(R14*1), R12
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R12
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_lz4_s2large_big_loop_back

	// Copy 32 bytes per iteration while R9 >= R14, then store the saved head
	// and tail blocks.
emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R11, AX

	// Advance the src cursor past the literals.
lz4_s2_lits_emit_done:
	MOVQ R8, DX

	// Reaching the end of src exactly here is the only successful exit, and
	// only when the match nibble was zero (R10 == 4).
lz4_s2_lits_done:
	CMPQ DX, BX
	JNE  lz4_s2_match
	CMPQ R10, $0x04
	JEQ  lz4_s2_done
	JMP  lz4_s2_corrupt

	// Read the 16-bit match offset into R9. A zero offset, or one larger
	// than the bytes produced so far (SI), is rejected. R10 == 0x13 means
	// the match nibble was 15, so extension bytes follow.
lz4_s2_match:
	LEAQ    2(DX), R8
	CMPQ    R8, BX
	JAE     lz4_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ    R8, DX
	TESTQ   R9, R9
	JZ      lz4_s2_corrupt
	CMPQ    R9, SI
	JA      lz4_s2_corrupt
	CMPQ    R10, $0x13
	JNE     lz4_s2_ml_done

	// Match-length extension: keep adding bytes while the byte read is 0xff.
lz4_s2_ml_loop:
	MOVBQZX (DX), R8
	INCQ    DX
	ADDQ    R8, R10
	CMPQ    DX, BX
	JAE     lz4_s2_corrupt
	CMPQ    R8, $0xff
	JEQ     lz4_s2_ml_loop

	// Account for the match. An offset equal to the previous copy's offset
	// (DI) is emitted as a repeat, anything else as a copy.
lz4_s2_ml_done:
	ADDQ R10, SI
	CMPQ R9, DI
	JNE  lz4_s2_docopy

	// emitRepeat
	// R8 = length, R10 = length-4. Forms, by size:
	//   2 bytes: length <= 8, or length < 12 with offset < 2048
	//   3 bytes: R10 < 0x104     4 bytes: R10 < 0x10100
	//   5 bytes: R10 < 0x100ffff
	// Anything larger writes a fixed 5-byte op (1d 00 fb ff ff), subtracts
	// 16842747 from R10 and loops. None of these stores check the dst limit.
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2

	// 5 bytes: word 0x001d, then the low 24 bits of R10-65536.
repeat_five_lz4_s2:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

	// 4 bytes: word 0x0019, then R10-256 as 16 bits.
repeat_four_lz4_s2:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

	// 3 bytes: word 0x0015, then R10-4 as one byte.
repeat_three_lz4_s2:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

	// 2 bytes: the 16-bit value (R10<<2)|1.
repeat_two_lz4_s2:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

	// 2 bytes carrying the offset: tag = (R10*4+1) | (offset>>8)<<5,
	// second byte = low 8 offset bits.
repeat_two_offset_lz4_s2:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

	// New offset: remember it in DI for the repeat check on later sequences.
lz4_s2_docopy:
	MOVQ R9, DI

	// emitCopy
	// Length <= 64 takes a single short copy. For longer matches with
	// offset < 2048, a 2-byte copy of length 8 is written first
	// (tag = 17 | (offset>>8)<<5) and the remainder goes out as a repeat.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE  long_offset_short_lz4_s2
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL  R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	// The remaining length is above 56 here, so the 2-byte repeat checks are
	// skipped and control enters at the 3/4/5-byte selection with R10 = length-4.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

	// Length > 64 with offset >= 2048: write a 3-byte copy of length 60
	// (tag 0xee plus the 16-bit offset), then emit the rest as a repeat.
long_offset_short_lz4_s2:
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

	// Length <= 64. R8 = 4*length is the base of the tag byte. The 2-byte
	// form is used only when length < 12 and offset < 2048:
	// tag = (4*length-15) | (offset>>8)<<5, second byte = low 8 offset bits.
two_byte_offset_short_lz4_s2:
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE  emit_copy_three_lz4_s2
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP  lz4_s2_loop

	// 3-byte form: tag = 4*length-2, followed by the 16-bit offset.
emit_copy_three_lz4_s2:
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4_s2_loop

	// Success: dstUsed = write cursor - dst_base.
lz4_s2_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Source rejected: uncompressed = -1. dstUsed is left unwritten.
lz4_s2_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// dst limit reached: uncompressed = -2. dstUsed is left unwritten.
lz4_s2_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Same structure as cvtLZ4BlockAsm, with two differences visible in the code:
// the match length is the token's low nibble plus 3 (not 4), and a low nibble
// of zero means the sequence has no match at all. On success uncompressed is
// the sum of all literal and match lengths and dstUsed is the number of bytes
// written to dst. On failure uncompressed is -1 (source rejected) or -2 (dst
// limit reached) and dstUsed is not written.
// NOTE(review): the emitted tag bytes presumably follow the S2 block format -
// confirm against the Go reference converter.
//
// Register use:
//   AX  = dst write cursor          CX  = dst limit (dst_base + dst_len - 8)
//   DX  = src read cursor           BX  = src end (src_base + src_len)
//   SI  = uncompressed byte count   DI  = offset of the last emitted copy
//   R8  = token / scratch           R9  = literal length, later match offset
//   R10 = match length              R11-R15, X0-X5 = scratch
//
// Several labels below reuse the "lz4_s2" names that also appear in
// cvtLZ4BlockAsm; every jump in this function targets a label defined inside
// this TEXT block.
TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX
	XORQ DI, DI

	// One iteration per sequence. Read the token byte: the high nibble goes
	// to R9 (literal length), the low nibble to R10 (match length). A token
	// below 0xf0 has no literal-length extension bytes.
lz4s_s2_loop:
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	CMPQ    AX, CX
	JAE     lz4s_s2_dstfull
	MOVBQZX (DX), R8
	MOVQ    R8, R9
	MOVQ    R8, R10
	SHRQ    $0x04, R9
	ANDQ    $0x0f, R10
	CMPQ    R8, $0xf0
	JB      lz4s_s2_ll_end

	// Literal-length extension: keep adding bytes while the byte read is 0xff.
lz4s_s2_ll_loop:
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ    R8, R9
	CMPQ    R8, $0xff
	JEQ     lz4s_s2_ll_loop

	// DX still points at the last length byte. R8 becomes the address just
	// past the literals, and R10 gets the minimum match length (3) added.
	// Nothing is emitted for a zero literal length.
lz4s_s2_ll_end:
	LEAQ  (DX)(R9*1), R8
	ADDQ  $0x03, R10
	CMPQ  R8, BX
	JAE   lz4s_s2_corrupt
	INCQ  DX
	INCQ  R8
	TESTQ R9, R9
	JZ    lz4s_s2_lits_done
	LEAQ  (AX)(R9*1), R11
	CMPQ  R11, CX
	JAE   lz4s_s2_dstfull
	ADDQ  R9, SI
	// Literal header, selected by R11 = length-1: a 1-byte tag below 60,
	// otherwise tag 0xf0/0xf4/0xf8/0xfc followed by 1/2/3/4 length bytes.
	LEAL  -1(R9), R11
	CMPL  R11, $0x3c
	JB    one_byte_lz4s_s2
	CMPL  R11, $0x00000100
	JB    two_bytes_lz4s_s2
	CMPL  R11, $0x00010000
	JB    three_bytes_lz4s_s2
	CMPL  R11, $0x01000000
	JB    four_bytes_lz4s_s2
	MOVB  $0xfc, (AX)
	MOVL  R11, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4s_s2

four_bytes_lz4s_s2:
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4s_s2

three_bytes_lz4s_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4s_s2

	// A 2-byte header with length-1 < 64 still uses the short copy below.
two_bytes_lz4s_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JB   memmove_lz4s_s2
	JMP  memmove_long_lz4s_s2

one_byte_lz4s_s2:
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

	// Short literal copy. R11 = dst cursor after the literals.
memmove_lz4s_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_lz4s_s2_memmove_move_17through32
	JMP  emit_lit_memmove_lz4s_s2_memmove_move_33through64

	// Lengths up to 8 always move a full 8 bytes, whatever the length is.
emit_lit_memmove_lz4s_s2_memmove_move_8:
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP  memmove_end_copy_lz4s_s2

	// Two overlapping 8-byte moves. DX is reused as a temporary here; it is
	// reloaded from R8 at lz4s_s2_lits_emit_done.
emit_lit_memmove_lz4s_s2_memmove_move_8through16:
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP  memmove_end_copy_lz4s_s2

	// Two overlapping 16-byte moves.
emit_lit_memmove_lz4s_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_lz4s_s2

	// First 32 and last 32 bytes, overlapping in the middle.
emit_lit_memmove_lz4s_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4s_s2:
	MOVQ R11, AX
	JMP  lz4s_s2_lits_emit_done

	// Long literal copy. R11 = dst cursor after the literals.
memmove_long_lz4s_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// The first and last 32 source bytes are loaded up front (X0-X3) and
	// stored last with unaligned moves. R14 = 64 - (AX & 31), so the MOVOA
	// stores at AX+R14-32 land on a 32-byte boundary.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R12
	ANDL  $0x0000001f, R12
	MOVQ  $0x00000040, R14
	SUBQ  R12, R14
	// NOTE(review): DECQ does not modify CF, so JA here sees ZF from the
	// DECQ together with CF from the SUBQ above.
	DECQ  R13
	JA    emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	LEAQ  -32(DX)(R14*1), R12
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4s_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R12
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_lz4s_s2large_big_loop_back

	// Copy 32 bytes per iteration while R9 >= R14, then store the saved head
	// and tail blocks.
emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R11, AX

	// Advance the src cursor past the literals.
lz4s_s2_lits_emit_done:
	MOVQ R8, DX

	// Reaching the end of src exactly here is the only successful exit, and
	// only when the match nibble was zero (R10 == 3).
lz4s_s2_lits_done:
	CMPQ DX, BX
	JNE  lz4s_s2_match
	CMPQ R10, $0x03
	JEQ  lz4s_s2_done
	JMP  lz4s_s2_corrupt

	// R10 == 3 means the match nibble was zero: this sequence carries no
	// match, so continue with the next token. Otherwise read the 16-bit
	// offset into R9. A zero offset, or one larger than the bytes produced
	// so far (SI), is rejected. R10 == 0x12 means the match nibble was 15,
	// so extension bytes follow.
lz4s_s2_match:
	CMPQ    R10, $0x03
	JEQ     lz4s_s2_loop
	LEAQ    2(DX), R8
	CMPQ    R8, BX
	JAE     lz4s_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ    R8, DX
	TESTQ   R9, R9
	JZ      lz4s_s2_corrupt
	CMPQ    R9, SI
	JA      lz4s_s2_corrupt
	CMPQ    R10, $0x12
	JNE     lz4s_s2_ml_done

	// Match-length extension: keep adding bytes while the byte read is 0xff.
lz4s_s2_ml_loop:
	MOVBQZX (DX), R8
	INCQ    DX
	ADDQ    R8, R10
	CMPQ    DX, BX
	JAE     lz4s_s2_corrupt
	CMPQ    R8, $0xff
	JEQ     lz4s_s2_ml_loop

	// Account for the match. An offset equal to the previous copy's offset
	// (DI) is emitted as a repeat, anything else as a copy.
lz4s_s2_ml_done:
	ADDQ R10, SI
	CMPQ R9, DI
	JNE  lz4s_s2_docopy

	// emitRepeat
	// R8 = length, R10 = length-4. Forms, by size:
	//   2 bytes: length <= 8, or length < 12 with offset < 2048
	//   3 bytes: R10 < 0x104     4 bytes: R10 < 0x10100
	//   5 bytes: R10 < 0x100ffff
	// Anything larger writes a fixed 5-byte op (1d 00 fb ff ff), subtracts
	// 16842747 from R10 and loops. None of these stores check the dst limit.
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2

	// 5 bytes: word 0x001d, then the low 24 bits of R10-65536.
repeat_five_lz4_s2:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

	// 4 bytes: word 0x0019, then R10-256 as 16 bits.
repeat_four_lz4_s2:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

	// 3 bytes: word 0x0015, then R10-4 as one byte.
repeat_three_lz4_s2:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

	// 2 bytes: the 16-bit value (R10<<2)|1.
repeat_two_lz4_s2:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

	// 2 bytes carrying the offset: tag = (R10*4+1) | (offset>>8)<<5,
	// second byte = low 8 offset bits.
repeat_two_offset_lz4_s2:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

	// New offset: remember it in DI for the repeat check on later sequences.
lz4s_s2_docopy:
	MOVQ R9, DI

	// emitCopy
	// Length <= 64 takes a single short copy. For longer matches with
	// offset < 2048, a 2-byte copy of length 8 is written first
	// (tag = 17 | (offset>>8)<<5) and the remainder goes out as a repeat.
	CMPL R10, $0x40
	JBE  two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE  long_offset_short_lz4_s2
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL  R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	// The remaining length is above 56 here, so the 2-byte repeat checks are
	// skipped and control enters at the 3/4/5-byte selection with R10 = length-4.
	LEAL -4(R10), R10
	JMP  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

	// Length > 64 with offset >= 2048: write a 3-byte copy of length 60
	// (tag 0xee plus the 16-bit offset), then emit the rest as a repeat.
long_offset_short_lz4_s2:
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE  repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE  cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB   repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JB   repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB   repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB   repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP  emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP  lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

	// Length <= 64. R8 = 4*length is the base of the tag byte. The 2-byte
	// form is used only when length < 12 and offset < 2048:
	// tag = (4*length-15) | (offset>>8)<<5, second byte = low 8 offset bits.
two_byte_offset_short_lz4_s2:
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE  emit_copy_three_lz4_s2
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL  R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_s2_loop

	// 3-byte form: tag = 4*length-2, followed by the 16-bit offset.
emit_copy_three_lz4_s2:
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4s_s2_loop

	// Success: dstUsed = write cursor - dst_base.
lz4s_s2_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Source rejected: uncompressed = -1. dstUsed is left unwritten.
lz4s_s2_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// dst limit reached: uncompressed = -2. dstUsed is left unwritten.
lz4s_s2_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Re-encodes the LZ4 sequences in src into dst using only literal and copy
// operations; no repeat codes are emitted and the previous offset is not
// tracked. On success uncompressed is the sum of all literal and match
// lengths and dstUsed is the number of bytes written to dst. On failure
// uncompressed is -1 (source rejected) or -2 (dst limit reached) and dstUsed
// is not written.
// NOTE(review): the emitted tag bytes presumably follow the Snappy block
// format - confirm against the Go reference converter.
//
// Register use:
//   AX  = dst write cursor          CX  = dst limit (dst_base + dst_len - 8)
//   DX  = src read cursor           BX  = src end (src_base + src_len)
//   SI  = uncompressed byte count   DI  = token / scratch
//   R8  = literal length, later match offset
//   R9  = match length              R10-R14, X0-X5 = scratch
TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX

	// One iteration per sequence. Read the token byte: the high nibble goes
	// to R8 (literal length), the low nibble to R9 (match length). A token
	// below 0xf0 has no literal-length extension bytes.
lz4_snappy_loop:
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	CMPQ    AX, CX
	JAE     lz4_snappy_dstfull
	MOVBQZX (DX), DI
	MOVQ    DI, R8
	MOVQ    DI, R9
	SHRQ    $0x04, R8
	ANDQ    $0x0f, R9
	CMPQ    DI, $0xf0
	JB      lz4_snappy_ll_end

	// Literal-length extension: keep adding bytes while the byte read is 0xff.
lz4_snappy_ll_loop:
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ    DI, R8
	CMPQ    DI, $0xff
	JEQ     lz4_snappy_ll_loop

	// DX still points at the last length byte. DI becomes the address just
	// past the literals, and R9 gets the minimum match length (4) added.
	// Nothing is emitted for a zero literal length.
lz4_snappy_ll_end:
	LEAQ  (DX)(R8*1), DI
	ADDQ  $0x04, R9
	CMPQ  DI, BX
	JAE   lz4_snappy_corrupt
	INCQ  DX
	INCQ  DI
	TESTQ R8, R8
	JZ    lz4_snappy_lits_done
	LEAQ  (AX)(R8*1), R10
	CMPQ  R10, CX
	JAE   lz4_snappy_dstfull
	ADDQ  R8, SI
	// Literal header, selected by R10 = length-1: a 1-byte tag below 60,
	// otherwise tag 0xf0/0xf4/0xf8/0xfc followed by 1/2/3/4 length bytes.
	LEAL  -1(R8), R10
	CMPL  R10, $0x3c
	JB    one_byte_lz4_snappy
	CMPL  R10, $0x00000100
	JB    two_bytes_lz4_snappy
	CMPL  R10, $0x00010000
	JB    three_bytes_lz4_snappy
	CMPL  R10, $0x01000000
	JB    four_bytes_lz4_snappy
	MOVB  $0xfc, (AX)
	MOVL  R10, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4_snappy

four_bytes_lz4_snappy:
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4_snappy

three_bytes_lz4_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4_snappy

	// A 2-byte header with length-1 < 64 still uses the short copy below.
two_bytes_lz4_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	CMPL R10, $0x40
	JB   memmove_lz4_snappy
	JMP  memmove_long_lz4_snappy

one_byte_lz4_snappy:
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

	// Short literal copy. R10 = dst cursor after the literals.
memmove_lz4_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_lz4_snappy_memmove_move_17through32
	JMP  emit_lit_memmove_lz4_snappy_memmove_move_33through64

	// Lengths up to 8 always move a full 8 bytes, whatever the length is.
emit_lit_memmove_lz4_snappy_memmove_move_8:
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_lz4_snappy

	// Two overlapping 8-byte moves. DX is reused as a temporary here; it is
	// reloaded from DI at lz4_snappy_lits_emit_done.
emit_lit_memmove_lz4_snappy_memmove_move_8through16:
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP  memmove_end_copy_lz4_snappy

	// Two overlapping 16-byte moves.
emit_lit_memmove_lz4_snappy_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_lz4_snappy

	// First 32 and last 32 bytes, overlapping in the middle.
emit_lit_memmove_lz4_snappy_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4_snappy:
	MOVQ R10, AX
	JMP  lz4_snappy_lits_emit_done

	// Long literal copy. R10 = dst cursor after the literals.
memmove_long_lz4_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// The first and last 32 source bytes are loaded up front (X0-X3) and
	// stored last with unaligned moves. R13 = 64 - (AX & 31), so the MOVOA
	// stores at AX+R13-32 land on a 32-byte boundary.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ does not modify CF, so JA here sees ZF from the
	// DECQ together with CF from the SUBQ above.
	DECQ  R12
	JA    emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	LEAQ  -32(DX)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_lz4_snappylarge_big_loop_back

	// Copy 32 bytes per iteration while R8 >= R13, then store the saved head
	// and tail blocks.
emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  R10, AX

	// Advance the src cursor past the literals.
lz4_snappy_lits_emit_done:
	MOVQ DI, DX

	// Reaching the end of src exactly here is the only successful exit, and
	// only when the match nibble was zero (R9 == 4).
lz4_snappy_lits_done:
	CMPQ DX, BX
	JNE  lz4_snappy_match
	CMPQ R9, $0x04
	JEQ  lz4_snappy_done
	JMP  lz4_snappy_corrupt

	// Read the 16-bit match offset into R8. A zero offset, or one larger
	// than the bytes produced so far (SI), is rejected. R9 == 0x13 means
	// the match nibble was 15, so extension bytes follow.
lz4_snappy_match:
	LEAQ    2(DX), DI
	CMPQ    DI, BX
	JAE     lz4_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ    DI, DX
	TESTQ   R8, R8
	JZ      lz4_snappy_corrupt
	CMPQ    R8, SI
	JA      lz4_snappy_corrupt
	CMPQ    R9, $0x13
	JNE     lz4_snappy_ml_done

	// Match-length extension: keep adding bytes while the byte read is 0xff.
lz4_snappy_ml_loop:
	MOVBQZX (DX), DI
	INCQ    DX
	ADDQ    DI, R9
	CMPQ    DX, BX
	JAE     lz4_snappy_corrupt
	CMPQ    DI, $0xff
	JEQ     lz4_snappy_ml_loop

lz4_snappy_ml_done:
	ADDQ R9, SI

	// emitCopy
	// While more than 64 bytes remain: write a 3-byte copy of length 60
	// (tag 0xee plus the 16-bit offset). The dst limit is re-checked after
	// each one; once AX reaches it, control returns to the top of the main
	// loop with the rest of the match unemitted, and the checks there end
	// the conversion.
two_byte_offset_lz4_s2:
	CMPL R9, $0x40
	JBE  two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	CMPQ AX, CX
	JAE  lz4_snappy_loop
	JMP  two_byte_offset_lz4_s2

	// Length <= 64. DI = 4*length is the base of the tag byte. The 2-byte
	// form is used only when length < 12 and offset < 2048:
	// tag = (4*length-15) | (offset>>8)<<5, second byte = low 8 offset bits.
two_byte_offset_short_lz4_s2:
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE  emit_copy_three_lz4_s2
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  lz4_snappy_loop

	// 3-byte form: tag = 4*length-2, followed by the 16-bit offset.
emit_copy_three_lz4_s2:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4_snappy_loop

	// Success: dstUsed = write cursor - dst_base.
lz4_snappy_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Source rejected: uncompressed = -1. dstUsed is left unwritten.
lz4_snappy_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// dst limit reached: uncompressed = -2. dstUsed is left unwritten.
lz4_snappy_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Walks the token stream in src and re-emits each sequence into dst as a
// literal tag plus raw bytes, followed by one or more copy tags. Judging by
// the function name the input is an LZ4s block and the output is Snappy; the
// comments below describe only the byte layouts the code actually reads and
// writes.
//
// Results:
//   uncompressed >= 0  sum of all literal and match lengths processed;
//                      dstUsed is the number of bytes written to dst.
//   uncompressed = -1  input rejected (lz4s_snappy_corrupt).
//   uncompressed = -2  dst limit reached (lz4s_snappy_dstfull).
// NOTE(review): the two error exits store only uncompressed+48(FP) and never
// write dstUsed+56(FP); callers presumably must ignore dstUsed when
// uncompressed is negative - confirm against the Go wrapper.
//
// Register roles:
//   AX  dst write cursor
//   CX  dst limit = dst_base + dst_len - 8
//   DX  src read cursor
//   BX  src end   = src_base + src_len
//   SI  running uncompressed byte count
//   R8  literal length, later reused for the 16-bit match offset
//   R9  match length (low token nibble + 3, plus extension bytes)
//   DI, R10-R14, X0-X5  scratch
TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	// BX = one past the last src byte; CX = end of dst minus 8 bytes of slack.
	LEAQ (DX)(BX*1), BX
	LEAQ -8(AX)(CX*1), CX

lz4s_snappy_loop:
	// Start of a sequence: at least one src byte must remain and the dst
	// cursor must still be below the limit.
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	CMPQ    AX, CX
	JAE     lz4s_snappy_dstfull
	// DI = token byte; R8 = high nibble (literal length); R9 = low nibble.
	MOVBQZX (DX), DI
	MOVQ    DI, R8
	MOVQ    DI, R9
	SHRQ    $0x04, R8
	ANDQ    $0x0f, R9
	// A token below 0xf0 has a literal nibble < 15, so no extension bytes follow.
	CMPQ    DI, $0xf0
	JB      lz4s_snappy_ll_end

lz4s_snappy_ll_loop:
	// Add each following byte to R8; keep going while the byte just added is 0xff.
	INCQ    DX
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ    DI, R8
	CMPQ    DI, $0xff
	JEQ     lz4s_snappy_ll_loop

lz4s_snappy_ll_end:
	// DX still points at the last length byte, so DI = DX + R8 is the address
	// of the final literal byte; it has to lie strictly before BX.
	LEAQ  (DX)(R8*1), DI
	// Match length = low nibble + 3.
	ADDQ  $0x03, R9
	CMPQ  DI, BX
	JAE   lz4s_snappy_corrupt
	// After these increments DX is the first literal byte and DI is one past
	// the last literal byte.
	INCQ  DX
	INCQ  DI
	TESTQ R8, R8
	JZ    lz4s_snappy_lits_done
	// The literal payload alone must end below CX. The tag bytes written next
	// (at most 5) are not part of this check; CX already holds 8 bytes of slack.
	LEAQ  (AX)(R8*1), R10
	CMPQ  R10, CX
	JAE   lz4s_snappy_dstfull
	ADDQ  R8, SI
	// R10 = length - 1 selects the literal tag width.
	LEAL  -1(R8), R10
	CMPL  R10, $0x3c
	JB    one_byte_lz4s_snappy
	CMPL  R10, $0x00000100
	JB    two_bytes_lz4s_snappy
	CMPL  R10, $0x00010000
	JB    three_bytes_lz4s_snappy
	CMPL  R10, $0x01000000
	JB    four_bytes_lz4s_snappy
	// length-1 >= 1<<24: tag 0xfc followed by a 32-bit length-1.
	MOVB  $0xfc, (AX)
	MOVL  R10, 1(AX)
	ADDQ  $0x05, AX
	JMP   memmove_long_lz4s_snappy

four_bytes_lz4s_snappy:
	// Tag 0xf8 followed by the low 24 bits of length-1 (16-bit store plus
	// one byte taken from bits 16..23).
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_lz4s_snappy

three_bytes_lz4s_snappy:
	// Tag 0xf4 followed by a 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_lz4s_snappy

two_bytes_lz4s_snappy:
	// Tag 0xf0 followed by an 8-bit length-1.
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 0x40 means at most 64 bytes, which the short copy handles.
	CMPL R10, $0x40
	JB   memmove_lz4s_snappy
	JMP  memmove_long_lz4s_snappy

one_byte_lz4s_snappy:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

memmove_lz4s_snappy:
	// R10 = dst cursor after the literals; it is copied into AX once the
	// bytes have been moved.
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	// Copies R8 bytes (at most 64 on this path) from DX to AX, dispatching on size.
	CMPQ R8, $0x08
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_lz4s_snappy_memmove_move_17through32
	JMP  emit_lit_memmove_lz4s_snappy_memmove_move_33through64

emit_lit_memmove_lz4s_snappy_memmove_move_8:
	// R8 <= 8: always loads and stores a full 8 bytes, even when R8 < 8.
	// AX is advanced by R8 only (via R10), so the surplus bytes are
	// overwritten by whatever is emitted next.
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
	// 8 < R8 <= 16: first 8 and last 8 bytes, possibly overlapping.
	// DX is overwritten with data here; it is reloaded from DI at
	// lz4s_snappy_lits_emit_done.
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP  memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
	// 16 < R8 <= 32: first 16 and last 16 bytes, possibly overlapping.
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
	// 32 < R8 <= 64: first 32 and last 32 bytes, possibly overlapping.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4s_snappy:
	MOVQ R10, AX
	JMP  lz4s_snappy_lits_emit_done

memmove_long_lz4s_snappy:
	// R10 = dst cursor after the literals.
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// Keep the first 32 and last 32 source bytes in X0-X3; they are stored
	// last, after the 32-byte loop below has filled the middle.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	// R12 = R8 / 32. R13 = 64 - (AX & 31), so AX + R13 - 32 is a 32-byte
	// aligned address, which is what the MOVOA stores below are given.
	MOVQ  R8, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// NOTE(review): DECQ does not modify CF, so the JA/JNA tests in this
	// routine combine ZF from DECQ with CF left by the preceding SUBQ/ADDQ.
	// Kept exactly as generated.
	DECQ  R12
	JA    emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	LEAQ  -32(DX)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
	// 32 bytes per pass: unaligned loads from R11, aligned stores to R14.
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_lz4s_snappylarge_big_loop_back

emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
	// 32 bytes per pass, indexed by R13; repeats while R8 >= R13.
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	// Write the saved head and tail over the (unaligned) ends of the range.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  R10, AX

lz4s_snappy_lits_emit_done:
	// Resume reading just past the literals.
	MOVQ DI, DX

lz4s_snappy_lits_done:
	// If src is exactly consumed, the block ends cleanly only when the low
	// nibble was 0 (R9 == 3); any other value is rejected.
	CMPQ DX, BX
	JNE  lz4s_snappy_match
	CMPQ R9, $0x03
	JEQ  lz4s_snappy_done
	JMP  lz4s_snappy_corrupt

lz4s_snappy_match:
	// Low nibble 0 (R9 == 3): this sequence has no match part; read the next token.
	CMPQ    R9, $0x03
	JEQ     lz4s_snappy_loop
	// A 16-bit offset follows, and at least one more src byte must exist after it.
	LEAQ    2(DX), DI
	CMPQ    DI, BX
	JAE     lz4s_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ    DI, DX
	// Reject offset 0 and offsets larger than the bytes produced so far (SI).
	TESTQ   R8, R8
	JZ      lz4s_snappy_corrupt
	CMPQ    R8, SI
	JA      lz4s_snappy_corrupt
	// R9 == 0x12 means the low nibble was 15: extension bytes follow.
	CMPQ    R9, $0x12
	JNE     lz4s_snappy_ml_done

lz4s_snappy_ml_loop:
	// Add each following byte to R9; keep going while the byte just added
	// is 0xff. Src must not be exhausted after any extension byte.
	MOVBQZX (DX), DI
	INCQ    DX
	ADDQ    DI, R9
	CMPQ    DX, BX
	JAE     lz4s_snappy_corrupt
	CMPQ    DI, $0xff
	JEQ     lz4s_snappy_ml_loop

lz4s_snappy_ml_done:
	ADDQ R9, SI

	// emitCopy
	// R8 = offset, R9 = remaining length.
two_byte_offset_lz4_s2:
	// While more than 64 bytes remain: write tag 0xee plus the 16-bit
	// offset and subtract 60 from the remaining length.
	CMPL R9, $0x40
	JBE  two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	// Once AX reaches the limit, go back to the loop head; with AX >= CX
	// its checks can only end in one of the two error exits.
	CMPQ AX, CX
	JAE  lz4s_snappy_loop
	JMP  two_byte_offset_lz4_s2

two_byte_offset_short_lz4_s2:
	// DI = length << 2. Lengths >= 12 or offsets >= 2048 take the 3-byte form.
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JAE  emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE  emit_copy_three_lz4_s2
	// 2-byte form: tag = (length*4 - 15) | ((offset >> 8) << 5), then the
	// low byte of the offset.
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP  lz4s_snappy_loop

emit_copy_three_lz4_s2:
	// 3-byte form: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  lz4s_snappy_loop

lz4s_snappy_done:
	// Success: dstUsed = AX - dst_base, uncompressed = SI.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4s_snappy_corrupt:
	// uncompressed = -1 (computed as 0 - 1); dstUsed is left unwritten.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4s_snappy_dstfull:
	// uncompressed = -2 (computed as 0 - 2); dstUsed is left unwritten.
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET