// Code generated by command: go run encode_asm.go -pkg base64 -out ../base64/encode_amd64.s -stubs ../base64/encode_amd64.go. DO NOT EDIT.

//go:build !purego
// +build !purego

#include "textflag.h"

// func encodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
//
// encodeAVX2 encodes src in blocks of 24 input bytes, storing 32 output
// bytes to dst per block. Each output byte is a 6-bit value plus an offset
// looked up in the table that lut points to.
//
// Results: the first int is the number of bytes stored to dst, the second
// is the number of bytes consumed from src.
//
// Register usage:
//   AX = dst base        CX = dst offset (bytes written so far)
//   DX = src base        BX = src offset (bytes consumed so far)
//   SI = lut pointer     DI = src bytes remaining
//   Y3 = offset table, replicated in both 128-bit lanes
//   Y4 = 0x33 in every byte, Y5 = 0x19 in every byte
//
// NOTE(review): the first block is processed without any length check in
// this routine (neither src_len nor the dst length is tested beforehand).
// Presumably the Go caller guarantees at least one full block of input and
// enough room in dst -- confirm against the wrapper.
TEXT ·encodeAVX2(SB), NOSPLIT, $0-72
	MOVQ         dst_base+0(FP), AX
	MOVQ         src_base+24(FP), DX
	MOVQ         lut+48(FP), SI
	MOVQ         src_len+32(FP), DI

	// Y4 = 0x33 (51) broadcast to all 32 bytes.
	MOVB         $0x33, CL
	PINSRB       $0x00, CX, X4
	VPBROADCASTB X4, Y4

	// Y5 = 0x19 (25) broadcast to all 32 bytes.
	MOVB         $0x19, CL
	PINSRB       $0x00, CX, X5
	VPBROADCASTB X5, Y5

	// Zero the dst (CX) and src (BX) offsets.
	XORQ         CX, CX
	XORQ         BX, BX

	// Load the 16-byte LUT into both lanes of the register
	// ($0x44 selects quadwords 0,1,0,1 of the source operand).
	VPERMQ $0x44, (SI), Y3

	// Load the first block using a mask to avoid potential fault
	// The load address is biased by -4, so dword 0 would lie before src.
	// b64_enc_load has the mask bit clear for dword 0 and set for dwords
	// 1..7, so only src[0:28] is read and register bytes 0..3 are zeroed.
	VMOVDQU    b64_enc_load<>+0(SB), Y0
	VPMASKMOVD -4(DX)(BX*1), Y0, Y0

loop:
	// Because of the -4 bias, source byte i of this block is in register
	// byte i+4. The shuffle takes bytes 4..15 of the low lane and bytes
	// 0..11 of the high lane (source bytes 0..11 and 12..23) and lays out
	// each 3-byte group b0,b1,b2 as b1,b0,b2,b1 within one dword.
	VPSHUFB  b64_enc_shuf<>+0(SB), Y0, Y0

	// Isolate two of the four 6-bit fields per dword (mask 0x003f03f0) and
	// move them into place: even 16-bit words are shifted left by 4, odd
	// words by 8 (VPBLENDW $0xaa takes the odd words from the <<8 copy).
	VPAND    b64_enc_mask1<>+0(SB), Y0, Y1
	VPSLLW   $0x08, Y1, Y2
	VPSLLW   $0x04, Y1, Y1
	VPBLENDW $0xaa, Y2, Y1, Y2

	// Isolate the other two fields (mask 0x0fc0fc00). The high-half multiply
	// by 0x0040 (even words) / 0x0400 (odd words) acts as a right shift by
	// 10 / 6 respectively.
	VPAND    b64_enc_mask2<>+0(SB), Y0, Y1
	VPMULHUW b64_enc_mult<>+0(SB), Y1, Y0

	// Combine: every byte of Y0 now holds a 6-bit value (0..63).
	VPOR     Y0, Y2, Y0

	// Compute the table index for each value v:
	//   Y1 = saturating(v - 51)      -> 0 for v <= 51, 1..12 for v = 52..63
	//   Y2 = 0xFF where v > 25       -> subtracting it adds 1
	// giving index 0 for v in 0..25, 1 for 26..51, 2..11 for 52..61,
	// 12 for 62 and 13 for 63.
	VPSUBUSB Y4, Y0, Y1
	VPCMPGTB Y5, Y0, Y2
	VPSUBB   Y2, Y1, Y1

	// Look up the per-range offset in the LUT and add it to the 6-bit value.
	VPSHUFB  Y1, Y3, Y1
	VPADDB   Y0, Y1, Y0

	// Store 32 output bytes and advance: 32 bytes of dst, 24 bytes of src.
	VMOVDQU  Y0, (AX)(CX*1)
	ADDQ     $0x20, CX
	ADDQ     $0x18, BX
	SUBQ     $0x18, DI

	// Stop once fewer than 32 source bytes remain (unsigned compare).
	// The next load below is an unmasked 32-byte read starting 4 bytes
	// before the current source position.
	CMPQ     DI, $0x20
	JB       done
	VMOVDQU  -4(DX)(BX*1), Y0
	JMP      loop

done:
	// Return (bytes written to dst, bytes consumed from src).
	MOVQ CX, ret+56(FP)
	MOVQ BX, ret1+64(FP)
	VZEROUPPER
	RET

// Dword mask for the first VPMASKMOVD load: the high bit is clear in dword 0
// and set in dwords 1..7.
DATA b64_enc_load<>+0(SB)/8, $0x8000000000000000
DATA b64_enc_load<>+8(SB)/8, $0x8000000080000000
DATA b64_enc_load<>+16(SB)/8, $0x8000000080000000
DATA b64_enc_load<>+24(SB)/8, $0x8000000080000000
GLOBL b64_enc_load<>(SB), RODATA|NOPTR, $32

// VPSHUFB control: the low lane reads lane bytes 4..15, the high lane reads
// lane bytes 0..11; each group of three bytes is emitted in the order
// 1,0,2,1.
DATA b64_enc_shuf<>+0(SB)/8, $0x0809070805060405
DATA b64_enc_shuf<>+8(SB)/8, $0x0e0f0d0e0b0c0a0b
DATA b64_enc_shuf<>+16(SB)/8, $0x0405030401020001
DATA b64_enc_shuf<>+24(SB)/8, $0x0a0b090a07080607
GLOBL b64_enc_shuf<>(SB), RODATA|NOPTR, $32

// 0x003f03f0 in every dword: selects the two fields that are shifted left.
DATA b64_enc_mask1<>+0(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+8(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+16(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+24(SB)/8, $0x003f03f0003f03f0
GLOBL b64_enc_mask1<>(SB), RODATA|NOPTR, $32

// 0x0fc0fc00 in every dword: selects the two fields that are shifted right.
DATA b64_enc_mask2<>+0(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+8(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+16(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+24(SB)/8, $0x0fc0fc000fc0fc00
GLOBL b64_enc_mask2<>(SB), RODATA|NOPTR, $32

// VPMULHUW multipliers, 0x04000040 in every dword: 0x0040 for even 16-bit
// words and 0x0400 for odd words.
DATA b64_enc_mult<>+0(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+8(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+16(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+24(SB)/8, $0x0400004004000040
GLOBL b64_enc_mult<>(SB), RODATA|NOPTR, $32