// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// emitLiteral writes a literal chunk to dst: a 1, 2 or 3 byte header followed
// by a copy of lit. It returns the number of bytes written, i.e. the header
// length plus len(lit).
//
// With n = len(lit) - 1, the header is:
//	- n < 60:	1 byte:  n<<2
//	- n < 256:	2 bytes: 0xf0, then n as one byte
//	- otherwise:	3 bytes: 0xf4, then n as two little-endian bytes
//
// The comparisons on n are signed 32-bit and at most 16 bits of n are stored,
// and nothing here checks the capacity of dst.
// NOTE(review): presumably callers guarantee 1 <= len(lit) <= 65536 and a
// large enough dst; confirm against the Go declaration.
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX

	// The return value starts as len(lit); each header path below adds its
	// own header length to it.
	MOVQ AX, DX

	// n := len(lit) - 1
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT oneByte
	CMPL BX, $256
	JLT twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	// Store the result before the call, so that DX does not need to survive
	// it.
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// emitCopy writes one or more copy chunks for (offset, length) to dst and
// returns the number of bytes written (&dst[i] - &dst[0] on exit).
//
// Chunks are chosen as follows:
//	- while length >= 68: a 3-byte chunk (0xfe, 16-bit offset) covering 64
//	  bytes.
//	- if length > 64: a 3-byte chunk (0xee, 16-bit offset) covering 60 bytes.
//	  At this point length is 65, 66 or 67, so 5, 6 or 7 bytes remain, which
//	  keeps the "length - 4" of the 2-byte form below non-negative.
//	- the remainder: a 2-byte chunk if length < 12 and offset < 2048,
//	  otherwise a 3-byte chunk.
//
// The 3-byte forms store only the low 16 bits of offset, the 2-byte form only
// the low 11 bits, and all comparisons are signed 32-bit. Nothing here checks
// the capacity of dst.
// NOTE(review): presumably callers guarantee 1 <= offset <= 65535 and
// 4 <= length <= 65535; confirm against the Go declaration.
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE step3
	CMPL R11, $2048
	JGE step3

	// Emit the remaining copy, encoded as 2 bytes:
	//	dst[i+1] = uint8(offset)
	//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | 1
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB AX, R11
	ORB $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes:
	//	dst[i+0] = uint8(length-1)<<2 | 2
	//	dst[i+1], dst[i+2] = the low 16 bits of offset
	SUBL $1, AX
	SHLB $2, AX
	ORB $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// extendMatch compares src[i:] against src[j:], advancing both positions while
// the bytes are equal and j < len(src). It returns the final value of j: the
// index of the first mismatching byte on the j side, or len(src) if the end
// of src is reached first.
//
// Only the j side is bounds-checked.
// NOTE(review): this presumably relies on callers passing i < j; confirm.
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI

	// Turn the length and the two indexes into pointers.
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI

	// R13 is the last position from which an 8-byte load stays inside src.
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// encodeBlock encodes src into dst as a sequence of literal and copy chunks,
// using a stack-allocated hash table of uint16 positions to find earlier
// occurrences of 4-byte sequences. It returns d, the number of bytes written
// to dst.
//
// The code loads 4 bytes at src[1] and computes len(src) - 15 without any
// length check, copies short literals with an unconditional 16-byte
// load/store, and never checks the capacity of dst.
// NOTE(review): presumably callers guarantee a minimum len(src) and a large
// enough dst (see the comment on inputMargin in encode.go); confirm.
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
//
// The hand-encoded "BYTE $0x.." table accesses below hard-code a displacement
// of 0x78 (120) from the physical stack pointer. That value equals the frame
// size minus the table size (32888 - 32768), so any change to the frame size
// or the spill map requires re-encoding those byte sequences.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE varTable
	CMPQ DX, R14
	JGE varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ $16, BX
	SUBQ $1, DX
	JNZ memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL 0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	//
	// R13 is a pointer, so convert it to an index before the (unsigned)
	// comparison with sLimit.
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL 0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	//
	// Only SI, DI and R15 are spilled here: CX, DX and R9 were already
	// spilled once after the table was cleared, and are just reloaded.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	//
	// R12 is a pointer and R15 is an index, hence the extra subtraction of
	// DX (&src[0]).
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE inlineEmitCopyStep3
	CMPL R11, $2048
	JGE inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB AX, R11
	ORB $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ $8, R14
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	//
	// AX still holds s-1 from the prevHash store above.
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ $8, R14
	MOVL R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	//
	// emitLiteral's result is at 48(SP), just after its 48 bytes of
	// arguments.
	ADDQ 48(SP), DI

encodeBlockEnd:
	// Convert from &dst[d] to d.
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET