You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
215 lines
3.5 KiB
215 lines
3.5 KiB
// +build !appengine
// +build gc
// +build !purego

// textflag.h supplies the NOSPLIT flag used by the TEXT directives in this file.
#include "textflag.h"

// Register allocation:
// AX	h
// SI	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v

// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
//
// Effect on the accumulator register r, with w the 8-byte word at (SI):
//	r = rotl64(r + w*prime2v, 31) * prime1v
// SI is advanced by 8 and R12 is clobbered as scratch.
//
// Every line of the body except the last must end in a backslash with
// nothing after it; any trailing token would end the macro early.
#define round(r) \
	MOVQ  (SI), R12 \
	ADDQ  $8, SI    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
//
// Effect:
//	val = rotl64(val*prime2v, 31) * prime1v
//	acc = (acc ^ val)*prime1v + prime4v
// val is overwritten with the intermediate value, so it cannot be reused
// as the original lane afterwards.
//
// Every line of the body except the last must end in a backslash with
// nothing after it; any trailing token would end the macro early.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Computes a 64-bit hash of b and stores it in the result slot.
//
// Frame: $0-32 declares no local stack space and 32 bytes of arguments plus
// result: the slice header at b_base+0(FP) / b_len+8(FP) (the capacity word
// is never read) and the uint64 result at ret+24(FP).
//
// Registers follow the allocation table at the top of the file. BX holds a
// moving "last valid start address" limit: it begins at len(b)-32 past the
// base and is raised in steps (to len(b)-8, len(b)-4, then len(b)) as the
// code moves from 32-byte blocks to 8-byte words, one 4-byte word and single
// bytes. For short inputs BX can be below SI; the signed conditional jumps
// after each CMPQ then skip the corresponding stage.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice.
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX
	LEAQ (SI)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//	v1 = prime1v + prime2v
	//	v2 = prime2v
	//	v3 = 0
	//	v4 = 0 - prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until SI > BX. Each iteration consumes 32 bytes: every round()
	// reads 8 bytes at (SI) and advances SI by 8.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  blockLoop

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into h. mergeRound overwrites its second operand, which
	// is fine because v1..v4 are dead after this point (R8 is reused as k1).
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: h starts from prime5v instead of the lane merge.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
	ADDQ $24, BX

	CMPQ SI, BX
	JG   fourByte

wordLoop:
	// Calculate k1 = rotl(word*prime2v, 31) * prime1v.
	MOVQ  (SI), R8
	ADDQ  $8, SI
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rotl(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ SI, BX
	JLE  wordLoop

fourByte:
	// Raise the limit to len(b)-4; skip unless at least 4 bytes remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   singles

	// 32-bit load: the upper half of R8 is zeroed, so the multiply below
	// operates on the zero-extended word.
	MOVL  (SI), R8
	ADDQ  $4, SI
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rotl(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Raise the limit to len(b), i.e. one past the last byte; nothing to do
	// if SI has already reached it.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  finalize

singlesLoop:
	// h ^= byte * prime5v (byte zero-extended to 64 bits)
	MOVBQZX (SI), R12
	ADDQ    $1, SI
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	// h = rotl(h, 11) * prime1v
	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ SI, BX
	JL   singlesLoop

finalize:
	// Final mixing:
	//	h ^= h >> 33; h *= prime2v
	//	h ^= h >> 29; h *= prime3v
	//	h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET

// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
//
// It runs the 32-byte block loop over b, starting from the four accumulators
// stored in the first 32 bytes of *d (offsets 0, 8, 16, 24) and writing them
// back when done. The result is the number of bytes consumed.
//
// Frame: $0-40 declares no local stack space and 40 bytes of arguments plus
// result: d at d+0(FP), the slice header at b_base+8(FP) / b_len+16(FP) (the
// capacity word is never read) and the int result at ret+32(FP).
//
// Precondition: len(b) >= 32. The loop body runs before its condition is
// tested, so a shorter b would still have 32 bytes read from it.

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Load fixed primes needed for round.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Load slice. BX becomes the last address at which a full 32-byte block
	// can start (base + len - 32).
	MOVQ b_base+8(FP), SI
	MOVQ b_len+16(FP), DX
	LEAQ (SI)(DX*1), BX
	SUBQ $32, BX

	// Load vN from d.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  blockLoop

	// Copy vN back to d.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// The number of bytes written is SI minus the old base pointer.
	SUBQ b_base+8(FP), SI
	MOVQ SI, ret+32(FP)

	RET