Commit 421f3935 authored by Viktor Trón, committed by GitHub

bmt: simplification and cleanup (#1490)

parent a4ca719e
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt_test
import (
"fmt"
"testing"
"github.com/ethersphere/bee/pkg/bmt"
"github.com/ethersphere/bee/pkg/bmt/reference"
"github.com/ethersphere/bee/pkg/swarm"
"golang.org/x/sync/errgroup"
)
func BenchmarkBMT(t *testing.B) {
for size := 4096; size >= 128; size /= 2 {
t.Run(fmt.Sprintf("%v_size_%v", "SHA3", size), func(t *testing.B) {
benchmarkSHA3(t, size)
})
t.Run(fmt.Sprintf("%v_size_%v", "Baseline", size), func(t *testing.B) {
benchmarkBMTBaseline(t, size)
})
t.Run(fmt.Sprintf("%v_size_%v", "REF", size), func(t *testing.B) {
benchmarkRefHasher(t, size)
})
t.Run(fmt.Sprintf("%v_size_%v", "BMT", size), func(t *testing.B) {
benchmarkBMT(t, size)
})
}
}
func BenchmarkPool(t *testing.B) {
for _, c := range []int{1, 8, 16, 32, 64} {
t.Run(fmt.Sprintf("poolsize_%v", c), func(t *testing.B) {
benchmarkPool(t, c)
})
}
}
// benchmarks simple sha3 hash on chunks
func benchmarkSHA3(t *testing.B, n int) {
testData := randomBytes(t, seed)
t.ReportAllocs()
t.ResetTimer()
for i := 0; i < t.N; i++ {
if _, err := bmt.Sha3hash(testData[:n]); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
}
// benchmarks the minimum hashing time for a balanced (for simplicity) BMT
// by doing count/segmentsize parallel hashings of 2*segmentsize bytes
// repeating this for every benchmark iteration, reusing the base hashers
// the premise is that this is the minimum computation needed for a BMT
// therefore this serves as a theoretical optimum for concurrent implementations
func benchmarkBMTBaseline(t *testing.B, n int) {
testData := randomBytes(t, seed)
t.ReportAllocs()
t.ResetTimer()
for i := 0; i < t.N; i++ {
eg := new(errgroup.Group)
for j := 0; j < testSegmentCount; j++ {
eg.Go(func() error {
_, err := bmt.Sha3hash(testData[:hashSize])
return err
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
}
// benchmarks BMT Hasher
func benchmarkBMT(t *testing.B, n int) {
testData := randomBytes(t, seed)
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize))
h := pool.Get()
defer pool.Put(h)
t.ReportAllocs()
t.ResetTimer()
for i := 0; i < t.N; i++ {
if _, err := syncHash(h, testData[:n]); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
}
// benchmarks 100 concurrent bmt hashes with pool capacity
func benchmarkPool(t *testing.B, poolsize int) {
testData := randomBytes(t, seed)
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, poolsize))
cycles := 100
t.ReportAllocs()
t.ResetTimer()
for i := 0; i < t.N; i++ {
eg := new(errgroup.Group)
for j := 0; j < cycles; j++ {
eg.Go(func() error {
h := pool.Get()
defer pool.Put(h)
_, err := syncHash(h, testData[:h.Capacity()])
return err
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
}
// benchmarks the reference hasher
func benchmarkRefHasher(t *testing.B, n int) {
testData := randomBytes(t, seed)
rbmt := reference.NewRefHasher(swarm.NewHasher(), 128)
t.ReportAllocs()
t.ResetTimer()
for i := 0; i < t.N; i++ {
_, err := rbmt.Hash(testData[:n])
if err != nil {
t.Fatal(err)
}
}
}
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"encoding/binary"
"hash"
"github.com/ethersphere/bee/pkg/swarm"
)
const (
SpanSize = 8
)
var _ Hash = (*Hasher)(nil)
var (
zerospan = make([]byte, 8)
zerosection = make([]byte, 64)
)
// Hasher is a reusable hasher for fixed maximum size chunks representing a BMT
// It reuses a pool of trees for amortised memory allocation and resource control,
// and supports order-agnostic concurrent segment writes and section (double segment) writes
// as well as sequential read and write.
//
// The same hasher instance must not be called concurrently on more than one chunk.
//
// The same hasher instance is synchronously reusable.
//
// Sum gives back the tree to the pool and is guaranteed to leave
// the tree and itself in a state reusable for hashing a new chunk.
type Hasher struct {
*Conf // configuration
bmt *tree // prebuilt BMT resource for flowcontrol and proofs
size int // bytes written to Hasher since last Reset()
pos int // index of rightmost currently open segment
offset int // offset (cursor position) within currently open segment
result chan []byte // result channel
errc chan error // error channel
span []byte // The span of the data subsumed under the chunk
}
// Capacity returns the maximum amount of bytes that will be processed by this hasher implementation.
// Since BMT assumes a balanced binary tree, capacity is always a power of 2.
func (h *Hasher) Capacity() int {
return h.maxSize
}
// LengthToSpan creates a binary data span size representation.
// It is required for calculating the BMT hash.
func LengthToSpan(length int64) []byte {
span := make([]byte, SpanSize)
binary.LittleEndian.PutUint64(span, uint64(length))
return span
}
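// Illustrative example (not part of the original source): the span of a
// full 4096-byte chunk is the little-endian encoding of its length, i.e.
//
//	span := LengthToSpan(4096)
//	fmt.Printf("% x\n", span) // prints: 00 10 00 00 00 00 00 00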
// SetHeaderInt64 sets the metadata preamble to the little endian binary representation of int64 argument for the current hash operation.
func (h *Hasher) SetHeaderInt64(length int64) {
binary.LittleEndian.PutUint64(h.span, uint64(length))
}
// SetHeader sets the metadata preamble to the given span bytes for the current hash operation.
func (h *Hasher) SetHeader(span []byte) {
copy(h.span, span)
}
// Size returns the digest size of the hash
func (h *Hasher) Size() int {
return h.segmentSize
}
// BlockSize returns the optimal write size to the Hasher
func (h *Hasher) BlockSize() int {
return 2 * h.segmentSize
}
// Hash returns the BMT root hash of the buffer and an error.
// Using Hash presupposes sequential synchronous writes (io.Writer interface).
func (h *Hasher) Hash(b []byte) ([]byte, error) {
if h.size == 0 {
return sha3hash(h.span, h.zerohashes[h.depth])
}
copy(h.bmt.buffer[h.size:], zerosection)
// write the last section with final flag set to true
go h.processSection(h.pos, true)
select {
case result := <-h.result:
return sha3hash(h.span, result)
case err := <-h.errc:
return nil, err
}
}
// Sum returns the BMT root hash of the buffer, unsafe version of Hash
func (h *Hasher) Sum(b []byte) []byte {
s, _ := h.Hash(b)
return s
}
// Write sequentially adds to the buffer to be hashed;
// for every completed section it calls processSection in a goroutine.
func (h *Hasher) Write(b []byte) (int, error) {
l := len(b)
max := h.maxSize - h.size
if l > max {
l = max
}
copy(h.bmt.buffer[h.size:], b)
secsize := 2 * h.segmentSize
from := h.size / secsize
h.offset = h.size % secsize
h.size += l
to := h.size / secsize
if l == max {
to--
}
h.pos = to
for i := from; i < to; i++ {
go h.processSection(i, false)
}
return l, nil
}
// Reset prepares the Hasher for reuse
func (h *Hasher) Reset() {
h.pos = 0
h.size = 0
h.offset = 0
copy(h.span, zerospan)
}
// processSection writes the hash of the i-th section into the level 1 node of the BMT tree.
func (h *Hasher) processSection(i int, final bool) {
secsize := 2 * h.segmentSize
offset := i * secsize
level := 1
// select the leaf node for the section
n := h.bmt.leaves[i]
isLeft := n.isLeft
hasher := n.hasher
n = n.parent
// hash the section
section, err := doHash(hasher, h.bmt.buffer[offset:offset+secsize])
if err != nil {
select {
case h.errc <- err:
default:
}
return
}
// write hash into parent node
if final {
// for the last segment use writeFinalNode
h.writeFinalNode(level, n, isLeft, section)
} else {
h.writeNode(n, isLeft, section)
}
}
// writeNode pushes the data to the node.
// if it is the first of 2 sisters written, the routine terminates.
// if it is the second, it calculates the hash and writes it
// to the parent node recursively.
// since hashing the parent is synchronous the same hasher can be used.
func (h *Hasher) writeNode(n *node, isLeft bool, s []byte) {
var err error
level := 1
for {
// at the root of the bmt just write the result to the result channel
if n == nil {
h.result <- s
return
}
// otherwise assign child hash to left or right segment
if isLeft {
n.left = s
} else {
n.right = s
}
// the child-thread first arriving will terminate
if n.toggle() {
return
}
// the thread coming second now can be sure both left and right children are written
// so it calculates the hash of left|right and pushes it to the parent
s, err = doHash(n.hasher, n.left, n.right)
if err != nil {
select {
case h.errc <- err:
default:
}
return
}
isLeft = n.isLeft
n = n.parent
level++
}
}
// writeFinalNode follows the path from the final data segment to the
// BMT root via parents.
// For unbalanced trees it fills in the missing right sister nodes using
// the pool's lookup table for BMT subtree root hashes for all-zero sections.
// Otherwise behaves like `writeNode`.
func (h *Hasher) writeFinalNode(level int, n *node, isLeft bool, s []byte) {
var err error
for {
// at the root of the bmt just write the result to the result channel
if n == nil {
if s != nil {
h.result <- s
}
return
}
var noHash bool
if isLeft {
// coming from the left sister branch:
// when the final section's path goes via the left child node,
// we include an all-zero subtree hash for the right sister and toggle the node.
n.right = h.zerohashes[level]
if s != nil {
n.left = s
// if a left final node carries a hash, it must be the first (and only) thread,
// so the toggle is already in passive state: there is no need to call it,
// but the thread needs to carry on pushing the hash to the parent
noHash = false
} else {
// if this thread is again the first to arrive, propagate nil and calculate no hash
noHash = n.toggle()
}
} else {
// right sister branch
if s != nil {
// if a hash was pushed from the right child node, write the right segment and change state
n.right = s
// if toggle is true, we arrived first, so there is no hashing: just push nil to the parent
noHash = n.toggle()
} else {
// if s is nil, this thread also arrived first at the previous node, and a second
// thread will arrive here too, so there is nothing to do: keep s = nil for the parent
noHash = true
}
}
// the child-thread first arriving will just continue resetting s to nil
// the second thread now can be sure both left and right children are written
// it calculates the hash of left|right and pushes it to the parent
if noHash {
s = nil
} else {
s, err = doHash(n.hasher, n.left, n.right)
if err != nil {
select {
case h.errc <- err:
default:
}
return
}
}
// iterate to parent
isLeft = n.isLeft
n = n.parent
level++
}
}
// calculates the Keccak256 SHA3 hash of the data
func sha3hash(data ...[]byte) ([]byte, error) {
return doHash(swarm.NewHasher(), data...)
}
// doHash calculates the hash of the data slices written in sequence
func doHash(h hash.Hash, data ...[]byte) ([]byte, error) {
h.Reset()
for _, v := range data {
if _, err := h.Write(v); err != nil {
return nil, err
}
}
return h.Sum(nil), nil
}
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt_test
import (
"bytes"
"context"
"fmt"
"io"
"math/rand"
"sort"
"testing"
"time"
"github.com/ethersphere/bee/pkg/bmt"
"github.com/ethersphere/bee/pkg/bmt/reference"
"github.com/ethersphere/bee/pkg/swarm"
"golang.org/x/sync/errgroup"
)
const (
// testPoolSize is the number of bmt trees the pool keeps available for concurrent use
testPoolSize = 16
// testSegmentCount is the maximum number of segments of the underlying chunk
// Should be equal to max-chunk-data-size / hash-size
// Currently set to 128 == 4096 (default chunk size) / 32 (sha3.keccak256 size)
testSegmentCount = 128
)
var (
testSegmentCounts = []int{1, 2, 3, 4, 5, 8, 9, 15, 16, 17, 32, 37, 42, 53, 63, 64, 65, 111, 127, 128}
hashSize = swarm.NewHasher().Size()
seed = time.Now().Unix()
)
func refHash(count int, data []byte) ([]byte, error) {
rbmt := reference.NewRefHasher(swarm.NewHasher(), count)
refNoMetaHash, err := rbmt.Hash(data)
if err != nil {
return nil, err
}
return bmt.Sha3hash(bmt.LengthToSpan(int64(len(data))), refNoMetaHash)
}
// syncHash hashes the data and the span using the bmt hasher
func syncHash(h *bmt.Hasher, data []byte) ([]byte, error) {
h.Reset()
h.SetHeaderInt64(int64(len(data)))
_, err := h.Write(data)
if err != nil {
return nil, err
}
return h.Hash(nil)
}
// tests that the hasher returns the correct hash, comparing against the reference implementation's return value
func TestHasherEmptyData(t *testing.T) {
for _, count := range testSegmentCounts {
t.Run(fmt.Sprintf("%d_segments", count), func(t *testing.T) {
expHash, err := refHash(count, nil)
if err != nil {
t.Fatal(err)
}
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, 1))
h := pool.Get()
resHash, err := syncHash(h, nil)
if err != nil {
t.Fatal(err)
}
pool.Put(h)
if !bytes.Equal(expHash, resHash) {
t.Fatalf("hash mismatch with reference. expected %x, got %x", expHash, resHash)
}
})
}
}
// tests sequential write with entire max size written in one go
func TestSyncHasherCorrectness(t *testing.T) {
testData := randomBytes(t, seed)
for _, count := range testSegmentCounts {
t.Run(fmt.Sprintf("segments_%v", count), func(t *testing.T) {
max := count * hashSize
var incr int
capacity := 1
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, capacity))
for n := 0; n <= max; n += incr {
h := pool.Get()
incr = 1 + rand.Intn(5)
err := testHasherCorrectness(h, testData, n, count)
if err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
pool.Put(h)
}
})
}
}
// tests that the BMT hasher can be synchronously reused with poolsizes 1 and testPoolSize
func TestHasherReuse(t *testing.T) {
t.Run(fmt.Sprintf("poolsize_%d", 1), func(t *testing.T) {
testHasherReuse(t, 1)
})
t.Run(fmt.Sprintf("poolsize_%d", testPoolSize), func(t *testing.T) {
testHasherReuse(t, testPoolSize)
})
}
// tests that bmt reuse does not corrupt the result
func testHasherReuse(t *testing.T, poolsize int) {
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, poolsize))
h := pool.Get()
defer pool.Put(h)
for i := 0; i < 100; i++ {
seed := int64(i)
testData := randomBytes(t, seed)
n := rand.Intn(h.Capacity())
err := testHasherCorrectness(h, testData, n, testSegmentCount)
if err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
}
// tests if pool can be cleanly reused even in concurrent use by several hashers
func TestBMTConcurrentUse(t *testing.T) {
testData := randomBytes(t, seed)
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize))
cycles := 100
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
eg, ectx := errgroup.WithContext(ctx)
for i := 0; i < cycles; i++ {
eg.Go(func() error {
select {
case <-ectx.Done():
return ectx.Err()
default:
}
h := pool.Get()
defer pool.Put(h)
n := rand.Intn(h.Capacity())
return testHasherCorrectness(h, testData, n, testSegmentCount)
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
}
// tests that the BMT Hasher's io.Writer interface works correctly even with random short writes
func TestBMTWriterBuffers(t *testing.T) {
for i, count := range testSegmentCounts {
t.Run(fmt.Sprintf("%d_segments", count), func(t *testing.T) {
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, count, testPoolSize))
h := pool.Get()
defer pool.Put(h)
size := h.Capacity()
seed := int64(i)
testData := randomBytes(t, seed)
resHash, err := syncHash(h, testData[:size])
if err != nil {
t.Fatal(err)
}
expHash, err := refHash(count, testData[:size])
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(resHash, expHash) {
t.Fatalf("single write :hash mismatch with reference. expected %x, got %x", expHash, resHash)
}
attempts := 10
f := func() error {
h := pool.Get()
defer pool.Put(h)
reads := rand.Intn(count*2-1) + 1
offsets := make([]int, reads+1)
for i := 0; i < reads; i++ {
offsets[i] = rand.Intn(size) + 1
}
offsets[reads] = size
from := 0
sort.Ints(offsets)
for _, to := range offsets {
if from < to {
read, err := h.Write(testData[from:to])
if err != nil {
return err
}
if read != to-from {
return fmt.Errorf("incorrect read. expected %v bytes, got %v", to-from, read)
}
from = to
}
}
h.SetHeaderInt64(int64(size))
resHash, err := h.Hash(nil)
if err != nil {
return err
}
if !bytes.Equal(resHash, expHash) {
return fmt.Errorf("hash mismatch on %v. expected %x, got %x", offsets, expHash, resHash)
}
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
eg, ectx := errgroup.WithContext(ctx)
for i := 0; i < attempts; i++ {
eg.Go(func() error {
select {
case <-ectx.Done():
return ectx.Err()
default:
}
return f()
})
}
if err := eg.Wait(); err != nil {
t.Fatalf("seed %d: %v", seed, err)
}
})
}
}
// helper function that compares reference and optimised implementations for correctness
func testHasherCorrectness(h *bmt.Hasher, data []byte, n, count int) (err error) {
if len(data) < n {
n = len(data)
}
exp, err := refHash(count, data[:n])
if err != nil {
return err
}
got, err := syncHash(h, data[:n])
if err != nil {
return err
}
if !bytes.Equal(got, exp) {
return fmt.Errorf("wrong hash: expected %x, got %x", exp, got)
}
return nil
}
// verifies that the bmt.Hasher can be used with the hash.Hash interface
func TestUseSyncAsOrdinaryHasher(t *testing.T) {
pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, testSegmentCount, testPoolSize))
h := pool.Get()
defer pool.Put(h)
data := []byte("moodbytesmoodbytesmoodbytesmoodbytes")
expHash, err := refHash(128, data)
if err != nil {
t.Fatal(err)
}
h.SetHeaderInt64(int64(len(data)))
_, err = h.Write(data)
if err != nil {
t.Fatal(err)
}
resHash := h.Sum(nil)
if !bytes.Equal(expHash, resHash) {
t.Fatalf("normalhash; expected %x, got %x", expHash, resHash)
}
}
func randomBytes(t testing.TB, seed int64) []byte {
t.Helper()
data := make([]byte, 4096)
s := rand.NewSource(seed)
r := rand.New(s)
_, err := io.ReadFull(r, data)
if err != nil {
t.Fatal(err)
}
return data
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Command generate_legacy generates bmt hashes of sequential byte inputs
// for every possible input length of the legacy bmt hasher
package main
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"github.com/ethersphere/bee/pkg/bmt/legacy"
"gitlab.com/nolash/go-mockbytes"
"golang.org/x/crypto/sha3"
)
func main() {
// create output directory, fail if it already exists or error creating
if len(os.Args) != 2 {
fmt.Fprintf(os.Stderr, "Usage: generate-hashes <output_directory>\n")
os.Exit(1)
}
outputDir, err := filepath.Abs(os.Args[1])
if err != nil {
fmt.Fprintf(os.Stderr, "Invalid input: %s", err)
os.Exit(1)
}
err = os.Mkdir(outputDir, 0750)
if os.IsExist(err) {
fmt.Fprintf(os.Stderr, "Directory %s already exists\n", outputDir)
os.Exit(1)
} else if err != nil {
fmt.Fprintf(os.Stderr, "Error creating output directory: %v\n", err)
os.Exit(1)
}
// set up hasher
hashPool := legacy.NewTreePool(sha3.NewLegacyKeccak256, 128, legacy.PoolSize)
bmtHash := legacy.New(hashPool)
// create sequence generator and outputs
var i int
g := mockbytes.New(0, mockbytes.MockTypeStandard).WithModulus(255)
for i = 0; i < 4096; i++ {
s := fmt.Sprintf("processing %d...", i)
fmt.Fprintf(os.Stderr, "%-64s\r", s)
filename := fmt.Sprintf("%s/%d.bin", outputDir, i)
b, err := g.SequentialBytes(i)
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
bmtHash.Reset()
_, err = bmtHash.Write(b)
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
sum := bmtHash.Sum(nil)
err = ioutil.WriteFile(filename, sum, 0666)
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
}
// Be kind and give feedback to user
dirString := fmt.Sprintf("Done. Data is in %s. Enjoy!", outputDir)
fmt.Printf("%-64s\n", dirString)
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Command main_legacy executes the BMT hash algorithm on the given data and writes the binary result to standard output
//
// Up to 4096 bytes will be read
//
// If a filename is given as argument, it reads data from the file. Otherwise it reads data from standard input.
package main
import (
"fmt"
"io"
"os"
"github.com/ethersphere/bee/pkg/bmt/legacy"
"golang.org/x/crypto/sha3"
)
func main() {
var data [4096]byte
var err error
var infile *os.File
if len(os.Args) > 1 {
infile, err = os.Open(os.Args[1])
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
} else {
infile = os.Stdin
}
var c int
c, err = infile.Read(data[:])
// EOF means zero-length input. This is still valid input for BMT
if err != nil && err != io.EOF {
fmt.Fprint(os.Stderr, err.Error())
infile.Close()
os.Exit(1)
}
infile.Close()
hashPool := legacy.NewTreePool(sha3.NewLegacyKeccak256, 128, legacy.PoolSize)
bmtHash := legacy.New(hashPool)
_, err = bmtHash.Write(data[:c])
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
binSum := bmtHash.Sum(nil)
_, err = os.Stdout.Write(binSum)
if err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(1)
}
}
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bmt implements Binary Merkle Tree hash.
// Binary Merkle Tree Hash is a hash function over arbitrary byte slices of limited size.
// The BMT hash is defined as H(header|bmt-root) where header is an 8-byte metadata prefix and
// bmt-root is the root hash of the binary merkle tree built over fixed size segments
// of the underlying chunk using any base hash function H (e.g., keccak 256 SHA3).
// The segment size is the same as the hash size of H.
// The number of segments on the base must be a power of 2 so that the resulting tree is balanced.
// Chunks with data shorter than the fixed size are hashed as if they had zero padding.
//
// BMT hash is used as the chunk hash function in swarm which in turn is the basis for the
// 128 branching swarm hash used to represent files.
//
// The BMT is optimal for providing compact inclusion proofs, i.e. prove that a
// segment is a substring of a chunk starting at a particular offset.
// The size of the underlying segments is fixed to the size of the base hash (called the resolution
// of the BMT hash). Using Keccak256, the segment size is 32 bytes, the EVM word size, which is optimal
// for on-chain BMT verification as well as for inclusion proofs in the merkle tree of the swarm hash.
//
// Two implementations are provided:
//
// RefHasher is optimized for code simplicity and meant as a reference implementation
// that is simple to understand
//
// Hasher is optimized for speed taking advantage of concurrency with minimalistic concurrency control.
//
// BMT Hasher implements the following interfaces:
//
// - standard golang hash.Hash - synchronous, reusable
//
// - io.Writer - synchronous left-to-right datawriter.
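//
// A minimal sketch of typical synchronous use (an illustrative addition, not
// part of the original doc; the segment count and pool capacity are arbitrary):
//
//	pool := bmt.NewPool(bmt.NewConf(swarm.NewHasher, 128, 8))
//	h := pool.Get()
//	defer pool.Put(h)
//	h.SetHeaderInt64(int64(len(data)))
//	if _, err := h.Write(data); err != nil {
//		// handle error
//	}
//	sum, err := h.Hash(nil) // BMT hash = H(span|bmt-root)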
package bmt
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"errors"
)
var ErrOverflow = errors.New("BMT hash capacity exceeded")
var Sha3hash = sha3hash
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"hash"
)
const (
SpanSize = 8
)
// Hash provides the necessary extension of the hash interface to add the length-prefix of the BMT hash.
//
// Any implementation should make it possible to generate a BMT hash using the hash.Hash interface only.
// However, the limitation will be that the Span of the BMT hash always must be limited to the amount of bytes actually written.
type Hash interface {
hash.Hash
// SetHeaderInt64 sets the header bytes of BMT hash to the little endian binary representation of the int64 argument.
SetHeaderInt64(int64)
// SetHeader sets the header bytes of BMT hash by copying the first 8 bytes of the argument.
SetHeader([]byte)
// Hash calculates the BMT hash of the buffer written so far and appends it to the argument
Hash([]byte) ([]byte, error)
// Capacity returns the maximum amount of bytes that will be processed by the implementation.
Capacity() int
}
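// Illustrative sketch (an assumption, not part of the original source): driving
// a BMT hasher through the plain hash.Hash interface, where the span must equal
// the number of bytes actually written:
//
//	h.SetHeaderInt64(int64(len(data))) // span == bytes written
//	if _, err := h.Write(data); err != nil {
//		// handle error
//	}
//	sum := h.Sum(nil) // hash.Hash-style digest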
// Copyright 2018 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
// Binary Merkle Tree Hash is a hash function over arbitrary datachunks of limited size.
// It is defined as the root hash of the binary merkle tree built over fixed size segments
// of the underlying chunk using any base hash function (e.g., keccak 256 SHA3).
// Chunks with data shorter than the fixed size are hashed as if they had zero padding.
//
// BMT hash is used as the chunk hash function in swarm which in turn is the basis for the
// 128 branching swarm hash http://swarm-guide.readthedocs.io/en/latest/architecture.html#swarm-hash
//
// The BMT is optimal for providing compact inclusion proofs, i.e. prove that a
// segment is a substring of a chunk starting at a particular offset.
// The size of the underlying segments is fixed to the size of the base hash (called the resolution
// of the BMT hash). Using Keccak256, the segment size is 32 bytes, the EVM word size, which is optimal
// for on-chain BMT verification as well as for inclusion proofs in the merkle tree of the swarm hash.
//
// Two implementations are provided:
//
// RefHasher is optimized for code simplicity and meant as a reference implementation
// that is simple to understand
//
// Hasher is optimized for speed taking advantage of concurrency with minimalistic
// control structure to coordinate the concurrent routines
//
// BMT Hasher implements the following interfaces:
//
// standard golang hash.Hash - synchronous, reusable
//
// io.Writer - synchronous left-to-right datawriter
package legacy
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"hash"
"sync/atomic"
)
// BaseHasherFunc is a hash.Hash constructor function used for the base hash of the BMT.
// It is implemented by Keccak256 SHA3 (sha3.NewLegacyKeccak256).
type BaseHasherFunc func() hash.Hash
// Conf is the configuration of the BMT hasher and its tree pool.
type Conf struct {
segmentSize int // size of leaf segments, stipulated to be = hash size
segmentCount int // the number of segments on the base level of the BMT
capacity int // pool capacity, controls concurrency
depth int // depth of the bmt trees = int(log2(segmentCount/2))+1
maxSize int // the total length of the data (count * size)
zerohashes [][]byte // lookup table for predictable padding subtrees for all levels
hasher BaseHasherFunc // base hasher to use for the BMT levels
}
// Pool provides a pool of trees used as resources by the BMT Hasher.
// A tree popped from the pool is guaranteed to have a clean state ready
// for hashing a new chunk.
type Pool struct {
c chan *tree // the channel to obtain a resource from the pool
*Conf // configuration
}
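// NewConf derives the BMT parameters from the base hasher, segment count and
// pool capacity: it computes the segment size, the tree depth and the
// zerohash lookup table used for padding unbalanced trees.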
func NewConf(hasher BaseHasherFunc, segmentCount, capacity int) *Conf {
count, depth := sizeToParams(segmentCount)
segmentSize := hasher().Size()
zerohashes := make([][]byte, depth+1)
zeros := make([]byte, segmentSize)
zerohashes[0] = zeros
var err error
// initialises the zerohashes lookup table
for i := 1; i < depth+1; i++ {
if zeros, err = doHash(hasher(), zeros, zeros); err != nil {
panic(err.Error())
}
zerohashes[i] = zeros
}
return &Conf{
hasher: hasher,
segmentSize: segmentSize,
segmentCount: segmentCount,
capacity: capacity,
maxSize: count * segmentSize,
depth: depth,
zerohashes: zerohashes,
}
}
// NewPool creates a pool of trees with the given configuration;
// the pool is prefilled to capacity, and Get blocks until a free tree is available.
func NewPool(c *Conf) *Pool {
p := &Pool{
Conf: c,
c: make(chan *tree, c.capacity),
}
for i := 0; i < c.capacity; i++ {
p.c <- newTree(p.segmentSize, p.maxSize, p.depth, p.hasher)
}
return p
}
// Get returns a BMT hasher possibly reusing a tree from the pool
func (p *Pool) Get() *Hasher {
t := <-p.c
return &Hasher{
Conf: p.Conf,
result: make(chan []byte),
errc: make(chan error, 1),
span: make([]byte, SpanSize),
bmt: t,
}
}
// Put is called after using a bmt hasher to return the tree to a pool for reuse
func (p *Pool) Put(h *Hasher) {
p.c <- h.bmt
}
// tree is a reusable control structure representing a BMT
// organised in a binary tree
//
// Hasher uses a Pool to obtain a tree for each chunk hash
// the tree is 'locked' while not in the pool.
type tree struct {
leaves []*node // leaf nodes of the tree, other nodes accessible via parent links
buffer []byte
}
// node is a reusable segment hasher representing a node in a BMT.
type node struct {
isLeft bool // whether it is left side of the parent double segment
parent *node // pointer to parent node in the BMT
state int32 // atomic increment implements a concurrent boolean toggle
left, right []byte // this is where the two children sections are written
hasher hash.Hash // preconstructed hasher on nodes
}
// newNode constructs a segment hasher node in the BMT (used by newTree).
func newNode(index int, parent *node, hasher hash.Hash) *node {
return &node{
parent: parent,
isLeft: index%2 == 0,
hasher: hasher,
}
}
// newTree initialises a tree by building up the nodes of a BMT
//
// segmentSize is stipulated to be the size of the hash.
func newTree(segmentSize, maxsize, depth int, hashfunc func() hash.Hash) *tree {
n := newNode(0, nil, hashfunc())
prevlevel := []*node{n}
// iterates over the levels, creating 2^(depth-level) nodes at each;
// level 0 operates on double segment sections, so we start at depth - 2
count := 2
for level := depth - 2; level >= 0; level-- {
nodes := make([]*node, count)
for i := 0; i < count; i++ {
parent := prevlevel[i/2]
nodes[i] = newNode(i, parent, hashfunc())
}
prevlevel = nodes
count *= 2
}
// the datanode level is the nodes on the last level
return &tree{
leaves: prevlevel,
buffer: make([]byte, maxsize),
}
}
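// Illustrative shape (derived from the construction above, not part of the
// original source): for 128 segments (depth 7), level 0 holds 64 leaf nodes
// over 64-byte double-segment sections, level 1 holds 32 nodes, and so on up
// to the single root node; the buffer spans the full 4096-byte chunk.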
// toggle implements a concurrent reusable 2-state object:
// an atomic increment taken modulo 2 acts as an atomic bool toggle.
// It returns true if the caller just put the node in the active/waiting state.
func (n *node) toggle() bool {
return atomic.AddInt32(&n.state, 1)%2 == 1
}
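// A minimal standalone sketch of the toggle-based rendezvous used by writeNode
// (illustrative only, not part of the original source): two goroutines report
// to the same node; the first to arrive terminates, the second proceeds.
//
//	var n node
//	done := make(chan struct{})
//	for i := 0; i < 2; i++ {
//		go func() {
//			if !n.toggle() { // false on second arrival: both children written
//				close(done)
//			}
//		}()
//	}
//	<-done // the second arrival would now hash left|right and push to the parent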
// sizeToParams returns the segment count rounded up to the next power of 2 and the corresponding depth (number of levels) of the BMT tree.
func sizeToParams(n int) (c, d int) {
c = 2
for ; c < n; c *= 2 {
d++
}
return c, d + 1
}
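// Worked examples (illustrative, values traced through the loop above):
//
//	sizeToParams(1)   // -> count 2,   depth 1
//	sizeToParams(5)   // -> count 8,   depth 3
//	sizeToParams(128) // -> count 128, depth 7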
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package pool
import (
"hash"
"sync"
bmtlegacy "github.com/ethersphere/bee/pkg/bmt/legacy"
"golang.org/x/crypto/sha3"
)
// Pooler pools bmt Hashers.
// It provides the ability for the number of hashers to grow
// according to demand, but the pool retains no more than the
// defined minimum number of idle hashers once they are put back.
type Pooler interface {
// Get a bmt Hasher instance.
// Instances are reset before being returned to the caller.
Get() *bmtlegacy.Hasher
// Put a bmt Hasher back into the pool
Put(*bmtlegacy.Hasher)
// Size of the pool.
Size() int
}
type pool struct {
p sync.Pool
mtx sync.Mutex
minimum int // minimum number of instances the pool should have
size int // size of the pool (only accounted for when items are put back)
rented int // number of instances currently handed out
}
// New returns a new hasher pool.
func New(minPool, branches int) Pooler {
return &pool{
p: sync.Pool{
New: func() interface{} {
return bmtlegacy.New(bmtlegacy.NewTreePool(hashFunc, branches, 1)) // one tree per hasher
},
},
minimum: minPool,
}
}
// Get gets a bmt Hasher from the pool.
func (h *pool) Get() *bmtlegacy.Hasher {
h.mtx.Lock()
defer h.mtx.Unlock()
v := h.p.Get().(*bmtlegacy.Hasher)
h.rented++
if h.size > 0 {
h.size--
}
return v
}
// Put puts a Hasher back into the pool.
// It discards the instance if the minimum number of instances
// has been reached.
// The hasher is reset before being put back into the pool.
func (h *pool) Put(v *bmtlegacy.Hasher) {
h.mtx.Lock()
defer h.mtx.Unlock()
h.rented--
// only put back if we're not exceeding the minimum capacity
if h.size+1 > h.minimum {
return
}
v.Reset()
h.p.Put(v)
h.size++
}
// Size of the pool.
func (h *pool) Size() int {
h.mtx.Lock()
defer h.mtx.Unlock()
return h.size
}
func hashFunc() hash.Hash {
return sha3.NewLegacyKeccak256()
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package pool_test
import (
"sync"
"testing"
"github.com/ethersphere/bee/pkg/bmt/pool"
)
const str = "hello world"
func TestHasherPool(t *testing.T) {
h := pool.New(3, 128)
v := h.Get()
_, err := v.Write([]byte(str))
if err != nil {
t.Fatal(err)
}
h.Put(v)
if s := h.Size(); s != 1 {
t.Fatalf("expected size 1 but got %d", s)
}
}
func TestHasherPool_concurrent(t *testing.T) {
h := pool.New(3, 128)
c := make(chan struct{})
var wg sync.WaitGroup
// request 10 copies
for i := 0; i < 10; i++ {
v := h.Get()
_, err := v.Write([]byte(str))
if err != nil {
t.Fatal(err)
}
wg.Add(1)
go func() {
defer wg.Done()
<-c
h.Put(v)
}()
}
// when we get instances from the pool, we don't know
// which ones are new and which aren't, so size is
// only incremented when items are put back
if s := h.Size(); s != 0 {
t.Fatalf("expected size 0 but got %d", s)
}
close(c)
wg.Wait()
if s := h.Size(); s != 3 {
t.Fatalf("expected size 3 but got %d", s)
}
}
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package reference is a simple nonconcurrent reference implementation of the BMT hash
//
// This implementation does not take advantage of any parallelism and uses
// far more memory than necessary, but it is easy to see that it is correct.
// It can be used for generating test cases for optimized implementations.
// There is an extra check on the reference hasher's correctness in reference_test.go
package reference
import (
......
// Copyright 2021 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
@@ -6,12 +6,13 @@ package reference_test
import (
"bytes"
crand "crypto/rand"
"fmt"
"hash"
"io"
"testing"
"github.com/ethersphere/bee/pkg/bmt/reference"
"gitlab.com/nolash/go-mockbytes"
"golang.org/x/crypto/sha3"
)
@@ -44,16 +45,16 @@ func sha3hash(t *testing.T, data ...[]byte) []byte {
func TestRefHasher(t *testing.T) {
// the test struct is used to specify the expected BMT hash for
// segment counts between from and to and lengths from 1 to datalength
for i, x := range []struct {
for _, x := range []struct {
from int
to int
expected func([]byte) []byte
}{
// all lengths in [0,64] should be:
//
// sha3hash(data)
//
{
// all lengths in [0,64] should be:
//
// sha3hash(data)
//
from: 1,
to: 2,
expected: func(d []byte) []byte {
@@ -61,14 +62,15 @@ func TestRefHasher(t *testing.T) {
copy(data, d)
return sha3hash(t, data)
},
}, {
// all lengths in [3,4] should be:
//
// sha3hash(
// sha3hash(data[:64])
// sha3hash(data[64:])
// )
//
},
// all lengths in [3,4] should be:
//
// sha3hash(
// sha3hash(data[:64])
// sha3hash(data[64:])
// )
//
{
from: 3,
to: 4,
expected: func(d []byte) []byte {
@@ -76,20 +78,21 @@ func TestRefHasher(t *testing.T) {
copy(data, d)
return sha3hash(t, sha3hash(t, data[:64]), sha3hash(t, data[64:]))
},
}, {
// all bmttestutil.SegmentCounts in [5,8] should be:
//
// sha3hash(
// sha3hash(
// sha3hash(data[:64])
// sha3hash(data[64:128])
// )
// sha3hash(
// sha3hash(data[128:192])
// sha3hash(data[192:])
// )
// )
//
},
// all bmttestutil.SegmentCounts in [5,8] should be:
//
// sha3hash(
// sha3hash(
// sha3hash(data[:64])
// sha3hash(data[64:128])
// )
// sha3hash(
// sha3hash(data[128:192])
// sha3hash(data[192:])
// )
// )
//
{
from: 5,
to: 8,
expected: func(d []byte) []byte {
@@ -102,8 +105,8 @@ func TestRefHasher(t *testing.T) {
for segCount := x.from; segCount <= x.to; segCount++ {
for length := 1; length <= segCount*32; length++ {
t.Run(fmt.Sprintf("%d_segments_%d_bytes", segCount, length), func(t *testing.T) {
g := mockbytes.New(i, mockbytes.MockTypeStandard)
data, err := g.RandomBytes(length)
data := make([]byte, length)
_, err := io.ReadFull(crand.Reader, data)
if err != nil {
t.Fatal(err)
}
......
@@ -7,24 +7,25 @@
package bmtpool
import (
bmtlegacy "github.com/ethersphere/bee/pkg/bmt/legacy"
"github.com/ethersphere/bee/pkg/bmt/pool"
"github.com/ethersphere/bee/pkg/bmt"
"github.com/ethersphere/bee/pkg/swarm"
)
var instance pool.Pooler
const Capacity = 32
var instance *bmt.Pool
func init() {
instance = pool.New(8, swarm.BmtBranches)
instance = bmt.NewPool(bmt.NewConf(swarm.NewHasher, swarm.BmtBranches, Capacity))
}
// Get a bmt Hasher instance.
// Instances are reset before being returned to the caller.
func Get() *bmtlegacy.Hasher {
func Get() *bmt.Hasher {
return instance.Get()
}
// Put a bmt Hasher back into the pool
func Put(h *bmtlegacy.Hasher) {
func Put(h *bmt.Hasher) {
instance.Put(h)
}
@@ -63,13 +63,11 @@ func hasher(data []byte) func([]byte) ([]byte, error) {
hasher := bmtpool.Get()
defer bmtpool.Put(hasher)
if err := hasher.SetSpanBytes(span); err != nil {
return nil, err
}
hasher.SetHeader(span)
if _, err := hasher.Write(data); err != nil {
return nil, err
}
return hasher.Sum(nil), nil
return hasher.Hash(nil)
}
}
......
@@ -35,18 +35,17 @@ func (w *bmtWriter) ChainWrite(p *pipeline.PipeWriteArgs) error {
return errInvalidData
}
hasher := bmtpool.Get()
err := hasher.SetSpanBytes(p.Data[:swarm.SpanSize])
hasher.SetHeader(p.Data[:swarm.SpanSize])
_, err := hasher.Write(p.Data[swarm.SpanSize:])
if err != nil {
bmtpool.Put(hasher)
return err
}
_, err = hasher.Write(p.Data[swarm.SpanSize:])
p.Ref, err = hasher.Hash(nil)
bmtpool.Put(hasher)
if err != nil {
bmtpool.Put(hasher)
return err
}
p.Ref = hasher.Sum(nil)
bmtpool.Put(hasher)
return w.next.ChainWrite(p)
}
......
@@ -14,7 +14,6 @@ import (
"github.com/ethersphere/bee/pkg/file/pipeline"
"github.com/ethersphere/bee/pkg/file/pipeline/bmt"
mock "github.com/ethersphere/bee/pkg/file/pipeline/mock"
"github.com/ethersphere/bee/pkg/swarm"
)
// TestStoreWriter tests that store writer stores the provided data and calls the next chain writer.
@@ -29,9 +28,9 @@ func TestBmtWriter(t *testing.T) {
{
// this is a special case, since semantically it can be considered the hash
// of an empty file (since data is all zeros).
name: "all zeros",
data: make([]byte, swarm.ChunkSize),
expHash: mustDecodeString(t, "09ae927d0f3aaa37324df178928d3826820f3dd3388ce4aaebfc3af410bde23a"),
name: "empty file",
data: make([]byte, 0),
expHash: mustDecodeString(t, "b34ca8c22b9e982354f9c7f50b470d66db428d880c8a904d5fe4ec9713171526"),
},
{
name: "hello world",
@@ -68,7 +67,7 @@ func TestBmtWriter(t *testing.T) {
t.Fatal(err)
}
if !bytes.Equal(tc.expHash, args.Ref) {
t.Fatalf("ref mismatch. got %v want %v", args.Ref, tc.expHash)
t.Fatalf("ref mismatch. got %x want %x", args.Ref, tc.expHash)
}
if calls := mockChainWriter.ChainWriteCalls(); calls != 1 {
......
@@ -71,7 +71,7 @@ func TestEmpty(t *testing.T) {
if err != nil {
t.Fatal(err)
}
exp := swarm.MustParseHexAddress("ffd70157e48063fc33c97a050f7f640233bf646cc98d9524c6b92bcf3ab56f83")
exp := swarm.MustParseHexAddress("b34ca8c22b9e982354f9c7f50b470d66db428d880c8a904d5fe4ec9713171526")
if !bytes.Equal(exp.Bytes(), sum) {
t.Fatalf("expected %s got %s", exp.String(), hex.EncodeToString(sum))
}
......
@@ -180,13 +180,11 @@ func hasher(span, b []byte) func([]byte) ([]byte, error) {
s := append(nonce, b...)
hasher := bmtpool.Get()
defer bmtpool.Put(hasher)
if err := hasher.SetSpanBytes(span); err != nil {
return nil, err
}
hasher.SetHeader(span)
if _, err := hasher.Write(s); err != nil {
return nil, err
}
return hasher.Sum(nil), nil
return hasher.Hash(nil)
}
}
......