// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT.

//go:build amd64 && gc && !purego

#include "textflag.h"

// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
// Requires: SSE2
//
// Processes `blocks` 64 bytes at a time, updating the eight-word state at h
// and the 64-bit counter at c in place.
//
// Register roles as used below:
//   AX = h, BX = c, CX = flag, SI = current block pointer, DX = bytes left
//   BP       = 16-byte-aligned scratch pointer inside the 672-byte frame
//   X0, X1   = state h[0:4], h[4:8]
//   X2, X3   = iv0, iv1 constants
//   X4..X7   = per-block working rows, X8 = rotation temporary
//   X12      = counter increment (64)
//   X13, X14 = rol16 / rol8 tables (loaded, but not referenced again here)
//   X15      = {64-bit counter, flag, 0}
//
// Scratch layout: 0(BP)..15(BP) holds counter and flag; 16(BP)..655(BP) holds
// ten 64-byte regions (one per round), each containing the sixteen 32-bit
// message words of the current block in that round's order.
//
// NOTE(review): the loop exits only when DX reaches exactly zero, so
// blocks_len is presumably a nonzero multiple of 64 - confirm at the caller.
TEXT ·hashBlocksSSE2(SB), $672-48
	MOVQ  h+0(FP), AX
	MOVQ  c+8(FP), BX
	MOVL  flag+16(FP), CX
	MOVQ  blocks_base+24(FP), SI
	MOVQ  blocks_len+32(FP), DX
	// BP = SP rounded up to the next multiple of 16, so MOVO/PADDL memory
	// operands relative to BP are aligned.
	MOVQ  SP, BP
	ADDQ  $0x0f, BP
	ANDQ  $-16, BP
	// Build {counter, flag, 0} at 0(BP): 8 bytes copied from *c, then CX
	// (flag in its low 32 bits) stored as a quadword.
	MOVQ  (BX), R9
	MOVQ  R9, (BP)
	MOVQ  CX, 8(BP)
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU iv0<>+0(SB), X2
	MOVOU iv1<>+0(SB), X3
	MOVOU counter<>+0(SB), X12
	MOVOU rol16<>+0(SB), X13
	MOVOU rol8<>+0(SB), X14
	MOVO  (BP), X15

loop:
	// Working rows for this block: copies of the state and the IV rows.
	MOVO   X0, X4
	MOVO   X1, X5
	MOVO   X2, X6
	MOVO   X3, X7
	// Advance the 64-bit counter by 64 and mix {counter, flag, 0} into row 3.
	PADDQ  X12, X15
	PXOR   X15, X7
	// Load the 64-byte block as eight quadwords; each holds two message words.
	MOVQ   (SI), R8
	MOVQ   8(SI), R9
	MOVQ   16(SI), R10
	MOVQ   24(SI), R11
	MOVQ   32(SI), R12
	MOVQ   40(SI), R13
	MOVQ   48(SI), R14
	MOVQ   56(SI), R15
	// Scatter every 32-bit message word into one slot of each of the ten
	// per-round regions, so each round can read four ready-made vectors.
	// Message word 0 (low half of R8).
	MOVL   R8, 16(BP)
	MOVL   R8, 116(BP)
	MOVL   R8, 164(BP)
	MOVL   R8, 264(BP)
	MOVL   R8, 288(BP)
	MOVL   R8, 344(BP)
	MOVL   R8, 432(BP)
	MOVL   R8, 512(BP)
	MOVL   R8, 540(BP)
	MOVL   R8, 652(BP)
	// Message word 1 (high half of R8).
	SHRQ   $0x20, R8
	MOVL   R8, 32(BP)
	MOVL   R8, 112(BP)
	MOVL   R8, 200(BP)
	MOVL   R8, 228(BP)
	MOVL   R8, 320(BP)
	MOVL   R8, 380(BP)
	MOVL   R8, 404(BP)
	MOVL   R8, 488(BP)
	MOVL   R8, 568(BP)
	MOVL   R8, 604(BP)
	// Message word 2 (low half of R9).
	MOVL   R9, 20(BP)
	MOVL   R9, 132(BP)
	MOVL   R9, 168(BP)
	MOVL   R9, 240(BP)
	MOVL   R9, 280(BP)
	MOVL   R9, 336(BP)
	MOVL   R9, 456(BP)
	MOVL   R9, 508(BP)
	MOVL   R9, 576(BP)
	MOVL   R9, 608(BP)
	// Message word 3 (high half of R9).
	SHRQ   $0x20, R9
	MOVL   R9, 36(BP)
	MOVL   R9, 140(BP)
	MOVL   R9, 180(BP)
	MOVL   R9, 212(BP)
	MOVL   R9, 316(BP)
	MOVL   R9, 364(BP)
	MOVL   R9, 452(BP)
	MOVL   R9, 476(BP)
	MOVL   R9, 552(BP)
	MOVL   R9, 632(BP)
	// Message word 4 (low half of R10).
	MOVL   R10, 24(BP)
	MOVL   R10, 84(BP)
	MOVL   R10, 204(BP)
	MOVL   R10, 248(BP)
	MOVL   R10, 296(BP)
	MOVL   R10, 368(BP)
	MOVL   R10, 412(BP)
	MOVL   R10, 516(BP)
	MOVL   R10, 584(BP)
	MOVL   R10, 612(BP)
	// Message word 5 (high half of R10).
	SHRQ   $0x20, R10
	MOVL   R10, 40(BP)
	MOVL   R10, 124(BP)
	MOVL   R10, 152(BP)
	MOVL   R10, 244(BP)
	MOVL   R10, 276(BP)
	MOVL   R10, 388(BP)
	MOVL   R10, 416(BP)
	MOVL   R10, 496(BP)
	MOVL   R10, 588(BP)
	MOVL   R10, 620(BP)
	// Message word 6 (low half of R11).
	MOVL   R11, 28(BP)
	MOVL   R11, 108(BP)
	MOVL   R11, 196(BP)
	MOVL   R11, 256(BP)
	MOVL   R11, 312(BP)
	MOVL   R11, 340(BP)
	MOVL   R11, 436(BP)
	MOVL   R11, 520(BP)
	MOVL   R11, 528(BP)
	MOVL   R11, 616(BP)
	// Message word 7 (high half of R11).
	SHRQ   $0x20, R11
	MOVL   R11, 44(BP)
	MOVL   R11, 136(BP)
	MOVL   R11, 184(BP)
	MOVL   R11, 208(BP)
	MOVL   R11, 292(BP)
	MOVL   R11, 372(BP)
	MOVL   R11, 448(BP)
	MOVL   R11, 468(BP)
	MOVL   R11, 580(BP)
	MOVL   R11, 600(BP)
	// Message word 8 (low half of R12).
	MOVL   R12, 48(BP)
	MOVL   R12, 100(BP)
	MOVL   R12, 160(BP)
	MOVL   R12, 268(BP)
	MOVL   R12, 328(BP)
	MOVL   R12, 348(BP)
	MOVL   R12, 444(BP)
	MOVL   R12, 504(BP)
	MOVL   R12, 556(BP)
	MOVL   R12, 596(BP)
	// Message word 9 (high half of R12).
	SHRQ   $0x20, R12
	MOVL   R12, 64(BP)
	MOVL   R12, 88(BP)
	MOVL   R12, 188(BP)
	MOVL   R12, 224(BP)
	MOVL   R12, 272(BP)
	MOVL   R12, 396(BP)
	MOVL   R12, 440(BP)
	MOVL   R12, 492(BP)
	MOVL   R12, 548(BP)
	MOVL   R12, 628(BP)
	// Message word 10 (low half of R13).
	MOVL   R13, 52(BP)
	MOVL   R13, 96(BP)
	MOVL   R13, 176(BP)
	MOVL   R13, 260(BP)
	MOVL   R13, 284(BP)
	MOVL   R13, 356(BP)
	MOVL   R13, 428(BP)
	MOVL   R13, 524(BP)
	MOVL   R13, 572(BP)
	MOVL   R13, 592(BP)
	// Message word 11 (high half of R13).
	SHRQ   $0x20, R13
	MOVL   R13, 68(BP)
	MOVL   R13, 120(BP)
	MOVL   R13, 144(BP)
	MOVL   R13, 220(BP)
	MOVL   R13, 308(BP)
	MOVL   R13, 360(BP)
	MOVL   R13, 460(BP)
	MOVL   R13, 480(BP)
	MOVL   R13, 536(BP)
	MOVL   R13, 640(BP)
	// Message word 12 (low half of R14).
	MOVL   R14, 56(BP)
	MOVL   R14, 128(BP)
	MOVL   R14, 148(BP)
	MOVL   R14, 232(BP)
	MOVL   R14, 324(BP)
	MOVL   R14, 352(BP)
	MOVL   R14, 400(BP)
	MOVL   R14, 472(BP)
	MOVL   R14, 560(BP)
	MOVL   R14, 648(BP)
	// Message word 13 (high half of R14).
	SHRQ   $0x20, R14
	MOVL   R14, 72(BP)
	MOVL   R14, 92(BP)
	MOVL   R14, 172(BP)
	MOVL   R14, 216(BP)
	MOVL   R14, 332(BP)
	MOVL   R14, 384(BP)
	MOVL   R14, 424(BP)
	MOVL   R14, 464(BP)
	MOVL   R14, 564(BP)
	MOVL   R14, 636(BP)
	// Message word 14 (low half of R15).
	MOVL   R15, 60(BP)
	MOVL   R15, 80(BP)
	MOVL   R15, 192(BP)
	MOVL   R15, 236(BP)
	MOVL   R15, 304(BP)
	MOVL   R15, 392(BP)
	MOVL   R15, 408(BP)
	MOVL   R15, 484(BP)
	MOVL   R15, 532(BP)
	MOVL   R15, 644(BP)
	// Message word 15 (high half of R15).
	SHRQ   $0x20, R15
	MOVL   R15, 76(BP)
	MOVL   R15, 104(BP)
	MOVL   R15, 156(BP)
	MOVL   R15, 252(BP)
	MOVL   R15, 300(BP)
	MOVL   R15, 376(BP)
	MOVL   R15, 420(BP)
	MOVL   R15, 500(BP)
	MOVL   R15, 544(BP)
	MOVL   R15, 624(BP)
	// Round 1 of 10: message vectors at 16(BP), 32(BP), 48(BP), 64(BP).
	// Every round below repeats this exact sequence with the next four
	// vectors. Each rotation is a shift-left/shift-right pair merged with
	// PXOR through the temporary X8.
	PADDL  16(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	// Rotate each lane of X7 by 16 bits.
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// Rotate each lane of X5 left by 20 bits (right by 12).
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  32(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	// Rotate each lane of X7 left by 24 bits (right by 8).
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// Rotate each lane of X5 left by 25 bits (right by 7).
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate the lanes of X5, X6, X7 by one, two and three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Second half of the round: same mixing on the lane-rotated rows.
	PADDL  48(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  64(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Same shuffles with X5 and X7 swapped: restores the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 2 of 10: message vectors at 80(BP)..128(BP).
	PADDL  80(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  96(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 2, second half.
	PADDL  112(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  128(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 3 of 10: message vectors at 144(BP)..192(BP).
	PADDL  144(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  160(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 3, second half.
	PADDL  176(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  192(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 4 of 10: message vectors at 208(BP)..256(BP).
	PADDL  208(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  224(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 4, second half.
	PADDL  240(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  256(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 5 of 10: message vectors at 272(BP)..320(BP).
	PADDL  272(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  288(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 5, second half.
	PADDL  304(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  320(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 6 of 10: message vectors at 336(BP)..384(BP).
	PADDL  336(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  352(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 6, second half.
	PADDL  368(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  384(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 7 of 10: message vectors at 400(BP)..448(BP).
	PADDL  400(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  416(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 7, second half.
	PADDL  432(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  448(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 8 of 10: message vectors at 464(BP)..512(BP).
	PADDL  464(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  480(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 8, second half.
	PADDL  496(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  512(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 9 of 10: message vectors at 528(BP)..576(BP).
	PADDL  528(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  544(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 9, second half.
	PADDL  560(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  576(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 10 of 10: message vectors at 592(BP)..640(BP).
	PADDL  592(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  608(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 10, second half.
	PADDL  624(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x10, X8
	PSRLL  $0x10, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  640(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	MOVO   X7, X8
	PSLLL  $0x18, X8
	PSRLL  $0x08, X7
	PXOR   X8, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Fold the working rows back into the state:
	// X0 ^= X4 ^ X6 and X1 ^= X5 ^ X7.
	PXOR   X4, X0
	PXOR   X5, X1
	PXOR   X6, X0
	PXOR   X7, X1
	// Advance to the next 64-byte block; repeat until DX reaches zero.
	LEAQ   64(SI), SI
	SUBQ   $0x40, DX
	JNE    loop
	// Write the updated counter (low 8 bytes of X15) back to *c and the
	// state back to *h.
	MOVO   X15, (BP)
	MOVQ   (BP), R9
	MOVQ   R9, (BX)
	MOVOU  X0, (AX)
	MOVOU  X1, 16(AX)
	RET

// Read-only 16-byte constant tables shared by the hashBlocks* routines in
// this file. Each is loaded into an XMM register before the block loop.

// iv0: four 32-bit constants loaded into X2 (copied to working row X6).
DATA iv0<>+0(SB)/4, $0x6a09e667
DATA iv0<>+4(SB)/4, $0xbb67ae85
DATA iv0<>+8(SB)/4, $0x3c6ef372
DATA iv0<>+12(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), RODATA|NOPTR, $16

// iv1: four 32-bit constants loaded into X3 (copied to working row X7).
DATA iv1<>+0(SB)/4, $0x510e527f
DATA iv1<>+4(SB)/4, $0x9b05688c
DATA iv1<>+8(SB)/4, $0x1f83d9ab
DATA iv1<>+12(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), RODATA|NOPTR, $16

// counter: per-block counter increment. The low quadword is 64 (0x40), the
// high quadword is zero; it is added with PADDQ once per 64-byte block.
DATA counter<>+0(SB)/8, $0x0000000000000040
DATA counter<>+8(SB)/8, $0x0000000000000000
GLOBL counter<>(SB), RODATA|NOPTR, $16

// rol16: PSHUFB byte-selection mask. Within every 32-bit lane the selected
// source bytes are 2,3,0,1, i.e. each lane is rotated by 16 bits.
DATA rol16<>+0(SB)/8, $0x0504070601000302
DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
GLOBL rol16<>(SB), RODATA|NOPTR, $16

// rol8: PSHUFB byte-selection mask. Within every 32-bit lane the selected
// source bytes are 1,2,3,0, i.e. each lane is rotated right by 8 bits
// (equivalently left by 24).
DATA rol8<>+0(SB)/8, $0x0407060500030201
DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09
GLOBL rol8<>(SB), RODATA|NOPTR, $16

// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
// Requires: SSE2, SSSE3
//
// Processes `blocks` 64 bytes at a time, updating the eight-word state at h
// and the 64-bit counter at c in place. The 16-bit and 8-bit lane rotations
// are done with PSHUFB and the rol16 / rol8 byte masks; the remaining
// rotations use a shift pair merged through X8.
//
// Register roles as used below:
//   AX = h, BX = c, CX = flag, SI = current block pointer, DX = bytes left
//   BP       = 16-byte-aligned scratch pointer inside the 672-byte frame
//   X0, X1   = state h[0:4], h[4:8]
//   X2, X3   = iv0, iv1 constants
//   X4..X7   = per-block working rows, X8 = rotation temporary
//   X12      = counter increment (64)
//   X13, X14 = rol16 / rol8 PSHUFB masks
//   X15      = {64-bit counter, flag, 0}
//
// Scratch layout: 0(BP)..15(BP) holds counter and flag; 16(BP)..655(BP) holds
// ten 64-byte regions (one per round), each containing the sixteen 32-bit
// message words of the current block in that round's order.
//
// NOTE(review): the loop exits only when DX reaches exactly zero, so
// blocks_len is presumably a nonzero multiple of 64 - confirm at the caller.
TEXT ·hashBlocksSSSE3(SB), $672-48
	MOVQ  h+0(FP), AX
	MOVQ  c+8(FP), BX
	MOVL  flag+16(FP), CX
	MOVQ  blocks_base+24(FP), SI
	MOVQ  blocks_len+32(FP), DX
	// BP = SP rounded up to the next multiple of 16, so MOVO/PADDL memory
	// operands relative to BP are aligned.
	MOVQ  SP, BP
	ADDQ  $0x0f, BP
	ANDQ  $-16, BP
	// Build {counter, flag, 0} at 0(BP): 8 bytes copied from *c, then CX
	// (flag in its low 32 bits) stored as a quadword.
	MOVQ  (BX), R9
	MOVQ  R9, (BP)
	MOVQ  CX, 8(BP)
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU iv0<>+0(SB), X2
	MOVOU iv1<>+0(SB), X3
	MOVOU counter<>+0(SB), X12
	MOVOU rol16<>+0(SB), X13
	MOVOU rol8<>+0(SB), X14
	MOVO  (BP), X15

loop:
	// Working rows for this block: copies of the state and the IV rows.
	MOVO   X0, X4
	MOVO   X1, X5
	MOVO   X2, X6
	MOVO   X3, X7
	// Advance the 64-bit counter by 64 and mix {counter, flag, 0} into row 3.
	PADDQ  X12, X15
	PXOR   X15, X7
	// Load the 64-byte block as eight quadwords; each holds two message words.
	MOVQ   (SI), R8
	MOVQ   8(SI), R9
	MOVQ   16(SI), R10
	MOVQ   24(SI), R11
	MOVQ   32(SI), R12
	MOVQ   40(SI), R13
	MOVQ   48(SI), R14
	MOVQ   56(SI), R15
	// Scatter every 32-bit message word into one slot of each of the ten
	// per-round regions, so each round can read four ready-made vectors.
	// Message word 0 (low half of R8).
	MOVL   R8, 16(BP)
	MOVL   R8, 116(BP)
	MOVL   R8, 164(BP)
	MOVL   R8, 264(BP)
	MOVL   R8, 288(BP)
	MOVL   R8, 344(BP)
	MOVL   R8, 432(BP)
	MOVL   R8, 512(BP)
	MOVL   R8, 540(BP)
	MOVL   R8, 652(BP)
	// Message word 1 (high half of R8).
	SHRQ   $0x20, R8
	MOVL   R8, 32(BP)
	MOVL   R8, 112(BP)
	MOVL   R8, 200(BP)
	MOVL   R8, 228(BP)
	MOVL   R8, 320(BP)
	MOVL   R8, 380(BP)
	MOVL   R8, 404(BP)
	MOVL   R8, 488(BP)
	MOVL   R8, 568(BP)
	MOVL   R8, 604(BP)
	// Message word 2 (low half of R9).
	MOVL   R9, 20(BP)
	MOVL   R9, 132(BP)
	MOVL   R9, 168(BP)
	MOVL   R9, 240(BP)
	MOVL   R9, 280(BP)
	MOVL   R9, 336(BP)
	MOVL   R9, 456(BP)
	MOVL   R9, 508(BP)
	MOVL   R9, 576(BP)
	MOVL   R9, 608(BP)
	// Message word 3 (high half of R9).
	SHRQ   $0x20, R9
	MOVL   R9, 36(BP)
	MOVL   R9, 140(BP)
	MOVL   R9, 180(BP)
	MOVL   R9, 212(BP)
	MOVL   R9, 316(BP)
	MOVL   R9, 364(BP)
	MOVL   R9, 452(BP)
	MOVL   R9, 476(BP)
	MOVL   R9, 552(BP)
	MOVL   R9, 632(BP)
	// Message word 4 (low half of R10).
	MOVL   R10, 24(BP)
	MOVL   R10, 84(BP)
	MOVL   R10, 204(BP)
	MOVL   R10, 248(BP)
	MOVL   R10, 296(BP)
	MOVL   R10, 368(BP)
	MOVL   R10, 412(BP)
	MOVL   R10, 516(BP)
	MOVL   R10, 584(BP)
	MOVL   R10, 612(BP)
	// Message word 5 (high half of R10).
	SHRQ   $0x20, R10
	MOVL   R10, 40(BP)
	MOVL   R10, 124(BP)
	MOVL   R10, 152(BP)
	MOVL   R10, 244(BP)
	MOVL   R10, 276(BP)
	MOVL   R10, 388(BP)
	MOVL   R10, 416(BP)
	MOVL   R10, 496(BP)
	MOVL   R10, 588(BP)
	MOVL   R10, 620(BP)
	// Message word 6 (low half of R11).
	MOVL   R11, 28(BP)
	MOVL   R11, 108(BP)
	MOVL   R11, 196(BP)
	MOVL   R11, 256(BP)
	MOVL   R11, 312(BP)
	MOVL   R11, 340(BP)
	MOVL   R11, 436(BP)
	MOVL   R11, 520(BP)
	MOVL   R11, 528(BP)
	MOVL   R11, 616(BP)
	// Message word 7 (high half of R11).
	SHRQ   $0x20, R11
	MOVL   R11, 44(BP)
	MOVL   R11, 136(BP)
	MOVL   R11, 184(BP)
	MOVL   R11, 208(BP)
	MOVL   R11, 292(BP)
	MOVL   R11, 372(BP)
	MOVL   R11, 448(BP)
	MOVL   R11, 468(BP)
	MOVL   R11, 580(BP)
	MOVL   R11, 600(BP)
	// Message word 8 (low half of R12).
	MOVL   R12, 48(BP)
	MOVL   R12, 100(BP)
	MOVL   R12, 160(BP)
	MOVL   R12, 268(BP)
	MOVL   R12, 328(BP)
	MOVL   R12, 348(BP)
	MOVL   R12, 444(BP)
	MOVL   R12, 504(BP)
	MOVL   R12, 556(BP)
	MOVL   R12, 596(BP)
	// Message word 9 (high half of R12).
	SHRQ   $0x20, R12
	MOVL   R12, 64(BP)
	MOVL   R12, 88(BP)
	MOVL   R12, 188(BP)
	MOVL   R12, 224(BP)
	MOVL   R12, 272(BP)
	MOVL   R12, 396(BP)
	MOVL   R12, 440(BP)
	MOVL   R12, 492(BP)
	MOVL   R12, 548(BP)
	MOVL   R12, 628(BP)
	// Message word 10 (low half of R13).
	MOVL   R13, 52(BP)
	MOVL   R13, 96(BP)
	MOVL   R13, 176(BP)
	MOVL   R13, 260(BP)
	MOVL   R13, 284(BP)
	MOVL   R13, 356(BP)
	MOVL   R13, 428(BP)
	MOVL   R13, 524(BP)
	MOVL   R13, 572(BP)
	MOVL   R13, 592(BP)
	// Message word 11 (high half of R13).
	SHRQ   $0x20, R13
	MOVL   R13, 68(BP)
	MOVL   R13, 120(BP)
	MOVL   R13, 144(BP)
	MOVL   R13, 220(BP)
	MOVL   R13, 308(BP)
	MOVL   R13, 360(BP)
	MOVL   R13, 460(BP)
	MOVL   R13, 480(BP)
	MOVL   R13, 536(BP)
	MOVL   R13, 640(BP)
	// Message word 12 (low half of R14).
	MOVL   R14, 56(BP)
	MOVL   R14, 128(BP)
	MOVL   R14, 148(BP)
	MOVL   R14, 232(BP)
	MOVL   R14, 324(BP)
	MOVL   R14, 352(BP)
	MOVL   R14, 400(BP)
	MOVL   R14, 472(BP)
	MOVL   R14, 560(BP)
	MOVL   R14, 648(BP)
	// Message word 13 (high half of R14).
	SHRQ   $0x20, R14
	MOVL   R14, 72(BP)
	MOVL   R14, 92(BP)
	MOVL   R14, 172(BP)
	MOVL   R14, 216(BP)
	MOVL   R14, 332(BP)
	MOVL   R14, 384(BP)
	MOVL   R14, 424(BP)
	MOVL   R14, 464(BP)
	MOVL   R14, 564(BP)
	MOVL   R14, 636(BP)
	// Message word 14 (low half of R15).
	MOVL   R15, 60(BP)
	MOVL   R15, 80(BP)
	MOVL   R15, 192(BP)
	MOVL   R15, 236(BP)
	MOVL   R15, 304(BP)
	MOVL   R15, 392(BP)
	MOVL   R15, 408(BP)
	MOVL   R15, 484(BP)
	MOVL   R15, 532(BP)
	MOVL   R15, 644(BP)
	// Message word 15 (high half of R15).
	SHRQ   $0x20, R15
	MOVL   R15, 76(BP)
	MOVL   R15, 104(BP)
	MOVL   R15, 156(BP)
	MOVL   R15, 252(BP)
	MOVL   R15, 300(BP)
	MOVL   R15, 376(BP)
	MOVL   R15, 420(BP)
	MOVL   R15, 500(BP)
	MOVL   R15, 544(BP)
	MOVL   R15, 624(BP)
	// Round 1 of 10: message vectors at 16(BP), 32(BP), 48(BP), 64(BP).
	// Every round below repeats this exact sequence with the next four
	// vectors.
	PADDL  16(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	// Rotate each lane of X7 by 16 bits (rol16 mask).
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// Rotate each lane of X5 left by 20 bits (right by 12) via X8.
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  32(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	// Rotate each lane of X7 right by 8 bits (rol8 mask).
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// Rotate each lane of X5 left by 25 bits (right by 7) via X8.
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate the lanes of X5, X6, X7 by one, two and three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Second half of the round: same mixing on the lane-rotated rows.
	PADDL  48(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  64(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Same shuffles with X5 and X7 swapped: restores the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 2 of 10: message vectors at 80(BP)..128(BP).
	PADDL  80(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  96(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 2, second half.
	PADDL  112(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  128(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 3 of 10: message vectors at 144(BP)..192(BP).
	PADDL  144(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  160(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 3, second half.
	PADDL  176(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  192(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 4 of 10: message vectors at 208(BP)..256(BP).
	PADDL  208(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  224(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 4, second half.
	PADDL  240(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  256(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 5 of 10: message vectors at 272(BP)..320(BP).
	PADDL  272(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  288(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 5, second half.
	PADDL  304(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  320(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 6 of 10: message vectors at 336(BP)..384(BP).
	PADDL  336(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  352(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 6, second half.
	PADDL  368(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  384(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 7 of 10: message vectors at 400(BP)..448(BP).
	PADDL  400(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  416(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 7, second half.
	PADDL  432(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  448(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 8 of 10: message vectors at 464(BP)..512(BP).
	PADDL  464(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  480(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 8, second half.
	PADDL  496(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  512(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 9 of 10: message vectors at 528(BP)..576(BP).
	PADDL  528(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  544(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 9, second half.
	PADDL  560(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  576(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 10 of 10: message vectors at 592(BP)..640(BP).
	PADDL  592(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  608(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 10, second half.
	PADDL  624(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  640(BP), X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Fold the working rows back into the state:
	// X0 ^= X4 ^ X6 and X1 ^= X5 ^ X7.
	PXOR   X4, X0
	PXOR   X5, X1
	PXOR   X6, X0
	PXOR   X7, X1
	// Advance to the next 64-byte block; repeat until DX reaches zero.
	LEAQ   64(SI), SI
	SUBQ   $0x40, DX
	JNE    loop
	// Write the updated counter (low 8 bytes of X15) back to *c and the
	// state back to *h.
	MOVO   X15, (BP)
	MOVQ   (BP), R9
	MOVQ   R9, (BX)
	MOVOU  X0, (AX)
	MOVOU  X1, 16(AX)
	RET

// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
// Requires: SSE2, SSE4.1, SSSE3
//
// hashBlocksSSE4 runs the BLAKE2s block function over blocks, 64 bytes per
// loop iteration, updating h and c in place. Message words are gathered
// straight from the input with MOVL/PINSRD, so the only stack use is one
// 16-byte aligned scratch slot.
//
// Register roles:
//   AX  = h                     BX = c               CX = flag
//   SI  = current input block   DX = bytes remaining
//   BP  = SP rounded up to a 16-byte boundary (scratch slot in the frame)
//   X0, X1   = h[0:4], h[4:8]
//   X2, X3   = iv0<>, iv1<>     (tables defined elsewhere in this file)
//   X12      = counter<>        (added to X15 once per block)
//   X13, X14 = rol16<>, rol8<>  (PSHUFB masks defined elsewhere in this file)
//   X15      = low 64 bits: the 8 bytes read from c; high 64 bits: flag
//   X4..X7   = working rows for the current block
//   X8..X11  = gathered message words; X8 doubles as the rotate temporary
//
// In the comments below m[i] is the 32-bit word at byte offset 4*i of the
// current block.
//
// NOTE(review): the loop body runs before DX is tested and only exits when
// DX reaches exactly zero, so blocks_len must be a non-zero multiple of 64.
// Presumably the Go callers guarantee this; confirm against them.
TEXT ·hashBlocksSSE4(SB), $32-48
	// Load the arguments. MOVL zero-extends flag into CX.
	MOVQ  h+0(FP), AX
	MOVQ  c+8(FP), BX
	MOVL  flag+16(FP), CX
	MOVQ  blocks_base+24(FP), SI
	MOVQ  blocks_len+32(FP), DX
	// BP = SP rounded up to 16 bytes so MOVO (aligned) can be used on it.
	MOVQ  SP, BP
	ADDQ  $0x0f, BP
	ANDQ  $-16, BP
	// Build {c, flag} in the scratch slot; it is loaded into X15 below.
	MOVQ  (BX), R9
	MOVQ  R9, (BP)
	MOVQ  CX, 8(BP)
	// Load h and the constant tables.
	MOVOU (AX), X0
	MOVOU 16(AX), X1
	MOVOU iv0<>+0(SB), X2
	MOVOU iv1<>+0(SB), X3
	MOVOU counter<>+0(SB), X12
	MOVOU rol16<>+0(SB), X13
	MOVOU rol8<>+0(SB), X14
	MOVO  (BP), X15

loop:
	// Working rows: X4, X5 = h; X6 = iv0<>; X7 = iv1<> XOR the advanced X15.
	MOVO   X0, X4
	MOVO   X1, X5
	MOVO   X2, X6
	MOVO   X3, X7
	PADDQ  X12, X15
	PXOR   X15, X7
	// Round 0 gather:
	//   X8  = m[0, 2, 4, 6]     X9  = m[1, 3, 5, 7]
	//   X10 = m[8, 10, 12, 14]  X11 = m[9, 11, 13, 15]
	MOVL   (SI), X8
	PINSRD $0x01, 8(SI), X8
	PINSRD $0x02, 16(SI), X8
	PINSRD $0x03, 24(SI), X8
	MOVL   4(SI), X9
	PINSRD $0x01, 12(SI), X9
	PINSRD $0x02, 20(SI), X9
	PINSRD $0x03, 28(SI), X9
	MOVL   32(SI), X10
	PINSRD $0x01, 40(SI), X10
	PINSRD $0x02, 48(SI), X10
	PINSRD $0x03, 56(SI), X10
	MOVL   36(SI), X11
	PINSRD $0x01, 44(SI), X11
	PINSRD $0x02, 52(SI), X11
	PINSRD $0x03, 60(SI), X11
	// Round 0, first half-round (message vectors X8, X9). Every later
	// half-round repeats this exact instruction sequence.
	// X4 += X8 + X5; X7 = shuffle(X7 ^ X4, rol16<>); X6 += X7.
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// X5 = (X5 ^ X6) rotated right by 12 in each 32-bit lane, built from a
	// left shift by 20 and a right shift by 12. X8 is free to reuse as the
	// temporary because its message words were consumed above.
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	// X4 += X9 + X5; X7 = shuffle(X7 ^ X4, rol8<>); X6 += X7.
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	// X5 = (X5 ^ X6) rotated right by 7 (left shift 25, right shift 7).
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate the 32-bit lanes of X5, X6, X7 by one, two and three positions
	// so the second half-round mixes a different grouping of lanes.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 0, second half-round (message vectors X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Inverse lane shuffles: X5, X6, X7 return to their original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 1 gather:
	//   X8  = m[14, 4, 9, 13]   X9  = m[10, 8, 15, 6]
	//   X10 = m[1, 0, 11, 5]    X11 = m[12, 2, 7, 3]
	MOVL   56(SI), X8
	PINSRD $0x01, 16(SI), X8
	PINSRD $0x02, 36(SI), X8
	PINSRD $0x03, 52(SI), X8
	MOVL   40(SI), X9
	PINSRD $0x01, 32(SI), X9
	PINSRD $0x02, 60(SI), X9
	PINSRD $0x03, 24(SI), X9
	MOVL   4(SI), X10
	PINSRD $0x01, (SI), X10
	PINSRD $0x02, 44(SI), X10
	PINSRD $0x03, 20(SI), X10
	MOVL   48(SI), X11
	PINSRD $0x01, 8(SI), X11
	PINSRD $0x02, 28(SI), X11
	PINSRD $0x03, 12(SI), X11
	// Round 1, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 1, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 2 gather:
	//   X8  = m[11, 12, 5, 15]  X9  = m[8, 0, 2, 13]
	//   X10 = m[10, 3, 7, 9]    X11 = m[14, 6, 1, 4]
	MOVL   44(SI), X8
	PINSRD $0x01, 48(SI), X8
	PINSRD $0x02, 20(SI), X8
	PINSRD $0x03, 60(SI), X8
	MOVL   32(SI), X9
	PINSRD $0x01, (SI), X9
	PINSRD $0x02, 8(SI), X9
	PINSRD $0x03, 52(SI), X9
	MOVL   40(SI), X10
	PINSRD $0x01, 12(SI), X10
	PINSRD $0x02, 28(SI), X10
	PINSRD $0x03, 36(SI), X10
	MOVL   56(SI), X11
	PINSRD $0x01, 24(SI), X11
	PINSRD $0x02, 4(SI), X11
	PINSRD $0x03, 16(SI), X11
	// Round 2, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 2, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 3 gather:
	//   X8  = m[7, 3, 13, 11]   X9  = m[9, 1, 12, 14]
	//   X10 = m[2, 5, 4, 15]    X11 = m[6, 10, 0, 8]
	MOVL   28(SI), X8
	PINSRD $0x01, 12(SI), X8
	PINSRD $0x02, 52(SI), X8
	PINSRD $0x03, 44(SI), X8
	MOVL   36(SI), X9
	PINSRD $0x01, 4(SI), X9
	PINSRD $0x02, 48(SI), X9
	PINSRD $0x03, 56(SI), X9
	MOVL   8(SI), X10
	PINSRD $0x01, 20(SI), X10
	PINSRD $0x02, 16(SI), X10
	PINSRD $0x03, 60(SI), X10
	MOVL   24(SI), X11
	PINSRD $0x01, 40(SI), X11
	PINSRD $0x02, (SI), X11
	PINSRD $0x03, 32(SI), X11
	// Round 3, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 3, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 4 gather:
	//   X8  = m[9, 5, 2, 10]    X9  = m[0, 7, 4, 15]
	//   X10 = m[14, 11, 6, 3]   X11 = m[1, 12, 8, 13]
	MOVL   36(SI), X8
	PINSRD $0x01, 20(SI), X8
	PINSRD $0x02, 8(SI), X8
	PINSRD $0x03, 40(SI), X8
	MOVL   (SI), X9
	PINSRD $0x01, 28(SI), X9
	PINSRD $0x02, 16(SI), X9
	PINSRD $0x03, 60(SI), X9
	MOVL   56(SI), X10
	PINSRD $0x01, 44(SI), X10
	PINSRD $0x02, 24(SI), X10
	PINSRD $0x03, 12(SI), X10
	MOVL   4(SI), X11
	PINSRD $0x01, 48(SI), X11
	PINSRD $0x02, 32(SI), X11
	PINSRD $0x03, 52(SI), X11
	// Round 4, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 4, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 5 gather:
	//   X8  = m[2, 6, 0, 8]     X9  = m[12, 10, 11, 3]
	//   X10 = m[4, 7, 15, 1]    X11 = m[13, 5, 14, 9]
	MOVL   8(SI), X8
	PINSRD $0x01, 24(SI), X8
	PINSRD $0x02, (SI), X8
	PINSRD $0x03, 32(SI), X8
	MOVL   48(SI), X9
	PINSRD $0x01, 40(SI), X9
	PINSRD $0x02, 44(SI), X9
	PINSRD $0x03, 12(SI), X9
	MOVL   16(SI), X10
	PINSRD $0x01, 28(SI), X10
	PINSRD $0x02, 60(SI), X10
	PINSRD $0x03, 4(SI), X10
	MOVL   52(SI), X11
	PINSRD $0x01, 20(SI), X11
	PINSRD $0x02, 56(SI), X11
	PINSRD $0x03, 36(SI), X11
	// Round 5, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 5, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 6 gather:
	//   X8  = m[12, 1, 14, 4]   X9  = m[5, 15, 13, 10]
	//   X10 = m[0, 6, 9, 8]     X11 = m[7, 3, 2, 11]
	MOVL   48(SI), X8
	PINSRD $0x01, 4(SI), X8
	PINSRD $0x02, 56(SI), X8
	PINSRD $0x03, 16(SI), X8
	MOVL   20(SI), X9
	PINSRD $0x01, 60(SI), X9
	PINSRD $0x02, 52(SI), X9
	PINSRD $0x03, 40(SI), X9
	MOVL   (SI), X10
	PINSRD $0x01, 24(SI), X10
	PINSRD $0x02, 36(SI), X10
	PINSRD $0x03, 32(SI), X10
	MOVL   28(SI), X11
	PINSRD $0x01, 12(SI), X11
	PINSRD $0x02, 8(SI), X11
	PINSRD $0x03, 44(SI), X11
	// Round 6, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 6, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 7 gather:
	//   X8  = m[13, 7, 12, 3]   X9  = m[11, 14, 1, 9]
	//   X10 = m[5, 15, 8, 2]    X11 = m[0, 4, 6, 10]
	MOVL   52(SI), X8
	PINSRD $0x01, 28(SI), X8
	PINSRD $0x02, 48(SI), X8
	PINSRD $0x03, 12(SI), X8
	MOVL   44(SI), X9
	PINSRD $0x01, 56(SI), X9
	PINSRD $0x02, 4(SI), X9
	PINSRD $0x03, 36(SI), X9
	MOVL   20(SI), X10
	PINSRD $0x01, 60(SI), X10
	PINSRD $0x02, 32(SI), X10
	PINSRD $0x03, 8(SI), X10
	MOVL   (SI), X11
	PINSRD $0x01, 16(SI), X11
	PINSRD $0x02, 24(SI), X11
	PINSRD $0x03, 40(SI), X11
	// Round 7, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 7, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 8 gather:
	//   X8  = m[6, 14, 11, 0]   X9  = m[15, 9, 3, 8]
	//   X10 = m[12, 13, 1, 10]  X11 = m[2, 7, 4, 5]
	MOVL   24(SI), X8
	PINSRD $0x01, 56(SI), X8
	PINSRD $0x02, 44(SI), X8
	PINSRD $0x03, (SI), X8
	MOVL   60(SI), X9
	PINSRD $0x01, 36(SI), X9
	PINSRD $0x02, 12(SI), X9
	PINSRD $0x03, 32(SI), X9
	MOVL   48(SI), X10
	PINSRD $0x01, 52(SI), X10
	PINSRD $0x02, 4(SI), X10
	PINSRD $0x03, 40(SI), X10
	MOVL   8(SI), X11
	PINSRD $0x01, 28(SI), X11
	PINSRD $0x02, 16(SI), X11
	PINSRD $0x03, 20(SI), X11
	// Round 8, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 8, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Round 9 gather:
	//   X8  = m[10, 8, 7, 1]    X9  = m[2, 4, 6, 5]
	//   X10 = m[15, 9, 3, 13]   X11 = m[11, 14, 12, 0]
	MOVL   40(SI), X8
	PINSRD $0x01, 32(SI), X8
	PINSRD $0x02, 28(SI), X8
	PINSRD $0x03, 4(SI), X8
	MOVL   8(SI), X9
	PINSRD $0x01, 16(SI), X9
	PINSRD $0x02, 24(SI), X9
	PINSRD $0x03, 20(SI), X9
	MOVL   60(SI), X10
	PINSRD $0x01, 36(SI), X10
	PINSRD $0x02, 12(SI), X10
	PINSRD $0x03, 52(SI), X10
	MOVL   44(SI), X11
	PINSRD $0x01, 56(SI), X11
	PINSRD $0x02, 48(SI), X11
	PINSRD $0x03, (SI), X11
	// Round 9, first half-round (X8, X9).
	PADDL  X8, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X9, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Rotate lanes of X5, X6, X7 by one, two, three positions.
	PSHUFL $0x39, X5, X5
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X7, X7
	// Round 9, second half-round (X10, X11).
	PADDL  X10, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X13, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x14, X8
	PSRLL  $0x0c, X5
	PXOR   X8, X5
	PADDL  X11, X4
	PADDL  X5, X4
	PXOR   X4, X7
	PSHUFB X14, X7
	PADDL  X7, X6
	PXOR   X6, X5
	MOVO   X5, X8
	PSLLL  $0x19, X8
	PSRLL  $0x07, X5
	PXOR   X8, X5
	// Restore the original lane order.
	PSHUFL $0x39, X7, X7
	PSHUFL $0x4e, X6, X6
	PSHUFL $0x93, X5, X5
	// Fold the working rows back into the state:
	// X0 ^= X4 ^ X6 and X1 ^= X5 ^ X7.
	PXOR   X4, X0
	PXOR   X5, X1
	PXOR   X6, X0
	PXOR   X7, X1
	// Advance to the next 64-byte block. LEAQ leaves the flags untouched,
	// so JNE tests the result of SUBQ: loop until DX is exactly zero.
	LEAQ   64(SI), SI
	SUBQ   $0x40, DX
	JNE    loop
	// Write the low 64 bits of X15 back to c (via the scratch slot) and
	// store the updated state to h.
	MOVO   X15, (BP)
	MOVQ   (BP), R9
	MOVQ   R9, (BX)
	MOVOU  X0, (AX)
	MOVOU  X1, 16(AX)
	RET