1
0
mirror of https://github.com/gilbertchen/duplicacy synced 2025-12-06 00:03:38 +00:00
Files
duplicacy/duplicacy_chunkmaker.go
2017-06-02 16:33:36 -04:00

296 lines
9.3 KiB
Go

// Copyright (c) Acrosync LLC. All rights reserved.
// Licensed under the Fair Source License 0.9 (https://fair.io/)
// User Limitation: 5 users
package duplicacy
import (
"io"
"crypto/sha256"
"encoding/hex"
"encoding/binary"
)
// ChunkMaker breaks data into chunks using buzhash. To save memory, the chunk maker only uses a circular buffer
// whose size is double the minimum chunk size.
type ChunkMaker struct {
	maximumChunkSize int // a chunk is cut once it reaches this many bytes, whether or not the hash matches
	minimumChunkSize int // no chunk boundary is considered before this many bytes
	bufferCapacity   int // size of the circular buffer (2 * minimumChunkSize)

	hashMask    uint64      // AverageChunkSize - 1; a boundary is declared when (hash & hashMask) == 0
	randomTable [256]uint64 // byte-to-integer lookup table required by buzhash

	buffer      []byte // the circular buffer
	bufferSize  int    // number of valid bytes currently held in the buffer
	bufferStart int    // index of the first valid byte in the circular buffer

	config *Config

	hashOnly      bool   // if true, chunks are only hashed, and one preallocated chunk is reused
	hashOnlyChunk *Chunk // the single reusable chunk for hash-only mode
}
// CreateChunkMaker creates a chunk maker for the given config. The byte-to-integer
// table needed by buzhash is derived deterministically from config.ChunkSeed via a
// chain of SHA-256 digests.
func CreateChunkMaker(config *Config, hashOnly bool) *ChunkMaker {

	// The rolling-hash mask trick only works when the average chunk size is a power of 2.
	if config.AverageChunkSize <= 0 || config.AverageChunkSize&(config.AverageChunkSize-1) != 0 {
		LOG_FATAL("CHUNK_SIZE", "Invalid average chunk size: %d is not a power of 2", config.AverageChunkSize)
		return nil
	}

	maker := &ChunkMaker{
		hashMask:         uint64(config.AverageChunkSize - 1),
		maximumChunkSize: config.MaximumChunkSize,
		minimumChunkSize: config.MinimumChunkSize,
		bufferCapacity:   2 * config.MinimumChunkSize,
		config:           config,
		hashOnly:         hashOnly,
	}

	if hashOnly {
		maker.hashOnlyChunk = CreateChunk(config, false)
	}

	// Each 32-byte SHA-256 digest supplies four little-endian uint64 entries; re-hash
	// the digest to produce the next four, until all 256 entries are filled.
	digest := sha256.Sum256(config.ChunkSeed)
	for i := 0; i < len(maker.randomTable); i += 4 {
		for j := 0; j < 4; j++ {
			maker.randomTable[i+j] = binary.LittleEndian.Uint64(digest[8*j:])
		}
		digest = sha256.Sum256(digest[:])
	}

	maker.buffer = make([]byte, maker.bufferCapacity)
	return maker
}
// rotateLeft rotates a 64-bit value left by the given number of bits, taken modulo 64.
func rotateLeft(value uint64, bits uint) uint64 {
	shift := bits % 64
	if shift == 0 {
		return value
	}
	return value<<shift | value>>(64-shift)
}
// rotateLeftByOne rotates a 64-bit value left by exactly one bit.
func rotateLeftByOne(value uint64) uint64 {
	carried := value >> 63
	return value<<1 | carried
}
// buzhashSum feeds every byte of 'data' into the rolling buzhash and returns the updated sum.
func (maker *ChunkMaker) buzhashSum(sum uint64, data []byte) uint64 {
	for _, c := range data {
		sum = rotateLeftByOne(sum) ^ maker.randomTable[c]
	}
	return sum
}
// buzhashUpdate slides the rolling hash window of 'length' bytes by one position:
// 'out' is the byte leaving the window and 'in' is the byte entering it.
func (maker *ChunkMaker) buzhashUpdate(sum uint64, out byte, in byte, length int) uint64 {
	rotated := rotateLeftByOne(sum)
	removed := rotateLeft(maker.randomTable[out], uint(length))
	return rotated ^ removed ^ maker.randomTable[in]
}
// ForEachChunk reads data from 'reader'. If EOF is encountered, it will call 'nextReader' to ask for next file. If
// 'nextReader' returns false, it will process remaining data in the buffer and then quit. When a chunk is identified,
// it will call 'endOfChunk' to return the chunk size and a boolean flag indicating if it is the last chunk.
func (maker *ChunkMaker) ForEachChunk(reader io.Reader, endOfChunk func(chunk *Chunk, final bool),
	nextReader func(size int64, hash string) (io.Reader, bool)) {

	maker.bufferStart = 0
	maker.bufferSize = 0

	var minimumReached bool // the current chunk already holds at least minimumChunkSize bytes
	var hashSum uint64      // rolling buzhash over the most recent minimumChunkSize bytes
	var chunk *Chunk

	fileSize := int64(0)
	fileHasher := maker.config.NewFileHasher()

	// Start a new chunk.
	startNewChunk := func() {
		hashSum = 0
		minimumReached = false
		if maker.hashOnly {
			// Hash-only mode reuses the single preallocated chunk instead of taking one from the config.
			chunk = maker.hashOnlyChunk
			chunk.Reset(true)
		} else {
			chunk = maker.config.GetChunk()
			chunk.Reset(true)
		}
	}

	// Move data from the buffer to the chunk.
	fill := func(count int) {
		if maker.bufferStart + count < maker.bufferCapacity {
			chunk.Write(maker.buffer[maker.bufferStart : maker.bufferStart + count])
			maker.bufferStart += count
			maker.bufferSize -= count
		} else {
			// The requested range wraps around the end of the circular buffer; write it in two pieces.
			chunk.Write(maker.buffer[maker.bufferStart :])
			chunk.Write(maker.buffer[: count - (maker.bufferCapacity - maker.bufferStart)])
			maker.bufferStart = count - (maker.bufferCapacity - maker.bufferStart)
			maker.bufferSize -= count
		}
	}

	startNewChunk()

	var err error
	isEOF := false

	// Fixed-size chunking: when the minimum and maximum chunk sizes are equal there is no
	// content-defined boundary to search for, so read exactly minimumChunkSize bytes per
	// chunk and skip the buzhash entirely.
	if maker.minimumChunkSize == maker.maximumChunkSize {

		if maker.bufferCapacity < maker.minimumChunkSize {
			maker.buffer = make([]byte, maker.minimumChunkSize)
		}

		for {
			startNewChunk()
			maker.bufferStart = 0
			// Keep reading until a full chunk is buffered or the input runs out.
			for maker.bufferStart < maker.minimumChunkSize && !isEOF {
				count, err := reader.Read(maker.buffer[maker.bufferStart : maker.minimumChunkSize])
				if err != nil {
					if err != io.EOF {
						LOG_ERROR("CHUNK_MAKER", "Failed to read %d bytes: %s", count, err.Error())
						return
					} else {
						isEOF = true
					}
				}
				maker.bufferStart += count
			}

			fileHasher.Write(maker.buffer[:maker.bufferStart])
			fileSize += int64(maker.bufferStart)
			chunk.Write(maker.buffer[:maker.bufferStart])

			if isEOF {
				// Current file is exhausted: report its size and hash, then switch to the next
				// file if there is one; otherwise this chunk is the final chunk.
				var ok bool
				reader, ok = nextReader(fileSize, hex.EncodeToString(fileHasher.Sum(nil)))
				if !ok {
					endOfChunk(chunk, true)
					return
				} else {
					endOfChunk(chunk, false)
					fileSize = 0
					fileHasher = maker.config.NewFileHasher()
					isEOF = false
				}
			}
		}
	}

	for {
		// If the buffer still has some space left and EOF is not seen, read more data.
		for maker.bufferSize < maker.bufferCapacity && !isEOF {
			// Compute the largest contiguous free region of the circular buffer to read into.
			start := maker.bufferStart + maker.bufferSize
			count := maker.bufferCapacity - start
			if start >= maker.bufferCapacity {
				// The free region wraps; it runs from the wrapped start up to bufferStart.
				start -= maker.bufferCapacity
				count = maker.bufferStart - start
			}

			count, err = reader.Read(maker.buffer[start : start + count])
			if err != nil && err != io.EOF {
				LOG_ERROR("CHUNK_MAKER", "Failed to read %d bytes: %s", count, err.Error())
				return
			}
			maker.bufferSize += count
			fileHasher.Write(maker.buffer[start : start + count])
			fileSize += int64(count)

			// if EOF is seen, try to switch to next file and continue
			if err == io.EOF {
				var ok bool
				reader, ok = nextReader(fileSize, hex.EncodeToString(fileHasher.Sum(nil)))
				if !ok {
					isEOF = true
				} else {
					fileSize = 0
					fileHasher = maker.config.NewFileHasher()
					isEOF = false
				}
			}
		}

		// Not enough data to meet the minimum chunk size requirement, so just return as a chunk.
		if maker.bufferSize < maker.minimumChunkSize {
			fill(maker.bufferSize)
			endOfChunk(chunk, true)
			return
		}

		// Minimum chunk size has been reached. Calculate the buzhash for the minimum size chunk.
		if (!minimumReached) {
			bytes := maker.minimumChunkSize
			if maker.bufferStart + bytes < maker.bufferCapacity {
				hashSum = maker.buzhashSum(0, maker.buffer[maker.bufferStart : maker.bufferStart + bytes])
			} else {
				// The window wraps around the circular buffer; hash it in two pieces.
				hashSum = maker.buzhashSum(0, maker.buffer[maker.bufferStart :])
				hashSum = maker.buzhashSum(hashSum,
					maker.buffer[: bytes - (maker.bufferCapacity - maker.bufferStart)])
			}

			if (hashSum & maker.hashMask) == 0 {
				// This is a minimum size chunk
				fill(bytes)
				endOfChunk(chunk, false)
				startNewChunk()
				continue
			}
			minimumReached = true
		}

		// Now check the buzhash of the data in the buffer, shifting one byte at a time.
		bytes := maker.bufferSize - maker.minimumChunkSize
		isEOC := false
		// Remaining room before the chunk would exceed the maximum chunk size.
		maxSize := maker.maximumChunkSize - chunk.GetLength()
		for i := 0; i < maker.bufferSize - maker.minimumChunkSize; i++ {
			// Index of the byte leaving the rolling window (wrapped into the circular buffer).
			out := maker.bufferStart + i
			if out >= maker.bufferCapacity {
				out -= maker.bufferCapacity
			}
			// Index of the byte entering the rolling window.
			in := maker.bufferStart + i + maker.minimumChunkSize
			if in >= maker.bufferCapacity {
				in -= maker.bufferCapacity
			}
			hashSum = maker.buzhashUpdate(hashSum, maker.buffer[out], maker.buffer[in], maker.minimumChunkSize)
			// A boundary is declared when the masked hash is zero, or when the chunk would
			// otherwise grow past the maximum chunk size.
			if (hashSum & maker.hashMask) == 0 || i == maxSize - maker.minimumChunkSize - 1 {
				// A chunk is completed.
				bytes = i + 1 + maker.minimumChunkSize
				isEOC = true
				break
			}
		}

		fill(bytes)

		if isEOC {
			if isEOF && maker.bufferSize == 0 {
				endOfChunk(chunk, true)
				return
			}
			endOfChunk(chunk, false)
			startNewChunk()
			continue
		}

		if isEOF {
			// No boundary was found but the input is exhausted: emit the remainder as the final chunk.
			fill(maker.bufferSize)
			endOfChunk(chunk, true)
			return
		}
	}
}