Commit 4254a3e3 authored by acud's avatar acud Committed by GitHub

bmt: import bmt repo (#1520)

parent 1b83c2e5
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"hash"
)
const (
	// SpanSize is the length in bytes of the span (length) prefix of a BMT hash.
	SpanSize = 8
)

// Hash provides the necessary extension of the hash interface to add the length-prefix of the BMT hash.
//
// Any implementation should make it possible to generate a BMT hash using the hash.Hash interface only.
// However, the limitation will be that the Span of the BMT hash always must be limited to the amount of bytes actually written.
type Hash interface {
	hash.Hash
	// SetSpan sets the length prefix of BMT hash.
	SetSpan(int64) error
	// SetSpanBytes sets the length prefix of BMT hash in byte form.
	SetSpanBytes([]byte) error
	// Capacity returns the maximum amount of bytes that will be processed by the implementation.
	Capacity() int
	// WriteSection writes to a specific section of the data to be hashed.
	WriteSection(idx int, data []byte) error
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Command generate_legacy generates bmt hashes of sequential byte inputs
// for every possible length of legacy bmt hasher
package main
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"github.com/ethersphere/bee/pkg/bmt/legacy"
"gitlab.com/nolash/go-mockbytes"
"golang.org/x/crypto/sha3"
)
// main generates, for every input length i in [0,4096), the BMT hash of a
// deterministic i-byte sequence and writes output files to the directory
// given as the single command-line argument.
func main() {
	if len(os.Args) != 2 {
		fmt.Fprintf(os.Stderr, "Usage: generate-hashes <output_directory>\n")
		os.Exit(1)
	}
	outputDir, err := filepath.Abs(os.Args[1])
	if err != nil {
		fmt.Fprintf(os.Stderr, "Invalid input: %s", err)
		os.Exit(1)
	}
	// create output directory, fail if it already exists or error creating.
	// os.Mkdir wraps its failure in *os.PathError, so a direct comparison
	// err == os.ErrExist never matches; os.IsExist unwraps it correctly.
	err = os.Mkdir(outputDir, 0750)
	if os.IsExist(err) {
		fmt.Fprintf(os.Stderr, "Directory %s already exists\n", outputDir)
		os.Exit(1)
	} else if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating output directory: %v\n", err)
		os.Exit(1)
	}
	// set up hasher
	hashPool := legacy.NewTreePool(sha3.NewLegacyKeccak256, 128, legacy.PoolSize)
	bmtHash := legacy.New(hashPool)
	// create sequence generator and outputs
	g := mockbytes.New(0, mockbytes.MockTypeStandard).WithModulus(255)
	for i := 0; i < 4096; i++ {
		s := fmt.Sprintf("processing %d...", i)
		fmt.Fprintf(os.Stderr, "%-64s\r", s)
		filename := filepath.Join(outputDir, fmt.Sprintf("%d.bin", i))
		b, err := g.SequentialBytes(i)
		if err != nil {
			fmt.Fprint(os.Stderr, err.Error())
			os.Exit(1)
		}
		bmtHash.Reset()
		// check the write error before computing the digest
		if _, err = bmtHash.Write(b); err != nil {
			fmt.Fprint(os.Stderr, err.Error())
			os.Exit(1)
		}
		sum := bmtHash.Sum(nil)
		err = ioutil.WriteFile(filename, sum, 0666)
		if err != nil {
			fmt.Fprint(os.Stderr, err.Error())
			os.Exit(1)
		}
		// NOTE(review): this second write overwrites the hash written just
		// above with the raw input data, so the file ends up holding the data
		// only. The hash and the data likely belong in separate files — TODO
		// confirm the intended layout; behavior is preserved here.
		err = ioutil.WriteFile(filename, b, 0666)
		if err != nil {
			fmt.Fprint(os.Stderr, err.Error())
		}
	}
	// Be kind and give feedback to user
	dirString := fmt.Sprintf("Done. Data is in %s. Enjoy!", outputDir)
	fmt.Printf("%-64s\n", dirString)
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Command main_legacy executes the BMT hash algorithm on the given data and writes the binary result to standard output
//
// Up to 4096 bytes will be read
//
// If a filename is given as argument, it reads data from the file. Otherwise it reads data from standard input.
package main
import (
"fmt"
"io"
"os"
"github.com/ethersphere/bee/pkg/bmt/legacy"
"golang.org/x/crypto/sha3"
)
// main reads up to 4096 bytes from the file named in os.Args[1] (or from
// standard input when no argument is given), computes the legacy BMT hash of
// the data and writes the binary digest to standard output.
func main() {
	var data [4096]byte
	var err error
	var infile *os.File
	if len(os.Args) > 1 {
		infile, err = os.Open(os.Args[1])
		if err != nil {
			fmt.Fprint(os.Stderr, err.Error())
			os.Exit(1)
		}
	} else {
		infile = os.Stdin
	}
	// A single File.Read may return fewer bytes than are available (e.g. when
	// reading from a pipe), so read until the buffer is full or the stream ends.
	c, err := io.ReadFull(infile, data[:])
	// io.EOF means zero-length input and io.ErrUnexpectedEOF a short (but
	// complete) input; both are still valid input for BMT.
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		fmt.Fprint(os.Stderr, err.Error())
		infile.Close()
		os.Exit(1)
	}
	infile.Close()
	hashPool := legacy.NewTreePool(sha3.NewLegacyKeccak256, 128, legacy.PoolSize)
	bmtHash := legacy.New(hashPool)
	if _, err = bmtHash.Write(data[:c]); err != nil {
		fmt.Fprint(os.Stderr, err.Error())
		os.Exit(1)
	}
	binSum := bmtHash.Sum(nil)
	if _, err = os.Stdout.Write(binSum); err != nil {
		fmt.Fprint(os.Stderr, err.Error())
		os.Exit(1)
	}
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bmt defines the interface for the Binary Merkle Tree hash.
package bmt
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bmt
import (
"errors"
)
// ErrOverflow is returned when more data is written to a BMT hasher than its
// capacity allows.
var ErrOverflow = errors.New("BMT hash capacity exceeded")
This diff is collapsed.
This diff is collapsed.
// Copyright 2018 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
// Binary Merkle Tree Hash is a hash function over arbitrary datachunks of limited size.
// It is defined as the root hash of the binary merkle tree built over fixed size segments
// of the underlying chunk using any base hash function (e.g., keccak 256 SHA3).
// Chunks with data shorter than the fixed size are hashed as if they had zero padding.
//
// BMT hash is used as the chunk hash function in swarm which in turn is the basis for the
// 128 branching swarm hash http://swarm-guide.readthedocs.io/en/latest/architecture.html#swarm-hash
//
// The BMT is optimal for providing compact inclusion proofs, i.e. prove that a
// segment is a substring of a chunk starting at a particular offset.
// The size of the underlying segments is fixed to the size of the base hash (called the resolution
// of the BMT hash), Using Keccak256 SHA3 hash is 32 bytes, the EVM word size to optimize for on-chain BMT verification
// as well as the hash size optimal for inclusion proofs in the merkle tree of the swarm hash.
//
// Two implementations are provided:
//
// RefHasher is optimized for code simplicity and meant as a reference implementation
// that is simple to understand
//
// Hasher is optimized for speed taking advantage of concurrency with minimalistic
// control structure to coordinate the concurrent routines
//
// BMT Hasher implements the following interfaces:
//
// standard golang hash.Hash - synchronous, reusable
//
// io.Writer - synchronous left-to-right datawriter
package legacy
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package pool
import (
"hash"
"sync"
bmtlegacy "github.com/ethersphere/bee/pkg/bmt/legacy"
"golang.org/x/crypto/sha3"
)
// Pooler pools bmt Hashers.
// It provides the ability for the number of hashers to grow
// according to demand, but will shrink once the minimum defined
// hashers are put back into the pool.
// Pooler pools bmt Hashers.
// It provides the ability for the number of hashers to grow
// according to demand, but will shrink once the minimum defined
// hashers are put back into the pool.
type Pooler interface {
	// Get a bmt Hasher instance.
	// Instances are reset before being returned to the caller.
	Get() *bmtlegacy.Hasher
	// Put a bmt Hasher back into the pool.
	Put(*bmtlegacy.Hasher)
	// Size of the pool, i.e. the number of instances currently
	// held and accounted for (only incremented when items are put back).
	Size() int
}
// pool is the default Pooler implementation, backed by a sync.Pool.
type pool struct {
	p       sync.Pool
	mtx     sync.Mutex // guards size and rented
	minimum int        // minimum number of instances the pool should have
	size    int        // size of the pool (only accounted for when items are put back)
	rented  int        // number of instances currently handed out via Get
}
// New returns a new HasherPool.
func New(minPool, branches int) Pooler {
return &pool{
p: sync.Pool{
New: func() interface{} {
return bmtlegacy.New(bmtlegacy.NewTreePool(hashFunc, branches, 1)) // one tree per hasher
},
},
minimum: minPool,
}
}
// Get gets a bmt Hasher from the pool.
func (h *pool) Get() *bmtlegacy.Hasher {
h.mtx.Lock()
defer h.mtx.Unlock()
v := h.p.Get().(*bmtlegacy.Hasher)
h.rented++
if h.size > 0 {
h.size--
}
return v
}
// Put puts a Hasher back into the pool.
// It discards the instance if the minimum number of instances
// has been reached.
// The hasher is reset before being put back into the pool.
func (h *pool) Put(v *bmtlegacy.Hasher) {
h.mtx.Lock()
defer h.mtx.Unlock()
h.rented--
// only put back if we're not exceeding the minimum capacity
if h.size+1 > h.minimum {
return
}
v.Reset()
h.p.Put(v)
h.size++
}
// Size of the pool.
func (h *pool) Size() int {
h.mtx.Lock()
defer h.mtx.Unlock()
return h.size
}
// hashFunc is the base hash constructor (Keccak256) used by pooled hashers.
func hashFunc() hash.Hash { return sha3.NewLegacyKeccak256() }
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package pool_test
import (
"sync"
"testing"
"github.com/ethersphere/bee/pkg/bmt/pool"
)
const str = "hello world"
// TestHasherPool checks that a single Get/Put round trip leaves exactly one
// accounted instance in the pool.
func TestHasherPool(t *testing.T) {
	p := pool.New(3, 128)
	hasher := p.Get()
	if _, err := hasher.Write([]byte(str)); err != nil {
		t.Fatal(err)
	}
	p.Put(hasher)
	if got := p.Size(); got != 1 {
		t.Fatalf("expected size 1 but got %d", got)
	}
}
// TestHasherPool_concurrent rents 10 instances, returns them concurrently and
// checks that the pool retains only its minimum of 3.
func TestHasherPool_concurrent(t *testing.T) {
	p := pool.New(3, 128)
	release := make(chan struct{})
	var wg sync.WaitGroup

	// request 10 copies; each goroutine holds its instance until released
	for i := 0; i < 10; i++ {
		hasher := p.Get()
		if _, err := hasher.Write([]byte(str)); err != nil {
			t.Fatal(err)
		}
		wg.Add(1)
		go func() {
			defer wg.Done()
			<-release
			p.Put(hasher)
		}()
	}

	// when we get instances from the pool, we dont know
	// which ones are new and which aren't, so size is
	// only incremented when items are put back
	if got := p.Size(); got != 0 {
		t.Fatalf("expected size 0 but got %d", got)
	}

	close(release)
	wg.Wait()

	if got := p.Size(); got != 3 {
		t.Fatalf("expected size 3 but got %d", got)
	}
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package reference is a simple nonconcurrent reference implementation for hashsize segment based
// Binary Merkle tree hash on arbitrary but fixed maximum chunksize n where 0 <= n <= 4096
//
// This implementation does not take advantage of any paralellisms and uses
// far more memory than necessary, but it is easy to see that it is correct.
// It can be used for generating test cases for optimized implementations.
// There is extra check on reference hasher correctness in bmt_test.go
// * TestRefHasher
// * testBMTHasherCorrectness function
package reference
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package reference
import (
"hash"
)
// RefHasher is the non-optimized easy-to-read reference implementation of BMT.
type RefHasher struct {
maxDataLength int // c * hashSize, where c = 2 ^ ceil(log2(count)), where count = ceil(length / hashSize)
sectionLength int // 2 * hashSize
hasher hash.Hash // base hash func (Keccak256 SHA3)
}
// NewRefHasher returns a new RefHasher.
func NewRefHasher(h hash.Hash, count int) *RefHasher {
hashsize := h.Size()
c := 2
for ; c < count; c *= 2 {
}
return &RefHasher{
sectionLength: 2 * hashsize,
maxDataLength: c * hashsize,
hasher: h,
}
}
// Hash returns the BMT hash of the byte slice.
func (rh *RefHasher) Hash(data []byte) ([]byte, error) {
// if data is shorter than the base length (maxDataLength), we provide padding with zeros
d := make([]byte, rh.maxDataLength)
length := len(data)
if length > rh.maxDataLength {
length = rh.maxDataLength
}
copy(d, data[:length])
return rh.hash(d, rh.maxDataLength)
}
// hash calls itself recursively on both halves of the given slice
// concatenates the results, and returns the hash of that
// if the length of d is 2 * segmentSize then just returns the hash of that section
// data has length maxDataLength = segmentSize * 2^k
func (rh *RefHasher) hash(data []byte, length int) ([]byte, error) {
var section []byte
if length == rh.sectionLength {
// section contains two data segments (d)
section = data
} else {
// section contains hashes of left and right BMT subtree
// to be calculated by calling hash recursively on left and right half of d
length /= 2
left, err := rh.hash(data[:length], length)
if err != nil {
return nil, err
}
right, err := rh.hash(data[length:], length)
if err != nil {
return nil, err
}
section = append(left, right...)
}
rh.hasher.Reset()
_, err := rh.hasher.Write(section)
if err != nil {
return nil, err
}
return rh.hasher.Sum(nil), nil
}
// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package reference_test
import (
"bytes"
"fmt"
"hash"
"testing"
"github.com/ethersphere/bee/pkg/bmt/reference"
"gitlab.com/nolash/go-mockbytes"
"golang.org/x/crypto/sha3"
)
// calculates the hash of the data using hash.Hash
func doSum(h hash.Hash, b []byte, data ...[]byte) ([]byte, error) {
h.Reset()
for _, v := range data {
var err error
_, err = h.Write(v)
if err != nil {
return nil, err
}
}
return h.Sum(b), nil
}
// calculates the Keccak256 SHA3 hash of the data
func sha3hash(t *testing.T, data ...[]byte) []byte {
t.Helper()
h := sha3.NewLegacyKeccak256()
r, err := doSum(h, nil, data...)
if err != nil {
t.Fatal(err)
}
return r
}
// TestRefHasher tests that the RefHasher computes the expected BMT hash for some small data lengths.
func TestRefHasher(t *testing.T) {
	// the test struct is used to specify the expected BMT hash for
	// segment counts between from and to and lengths from 1 to datalength
	for i, x := range []struct {
		from     int
		to       int
		expected func([]byte) []byte
	}{
		{
			// all segment counts in [1,2] (data lengths in [0,64]) should be:
			//
			// sha3hash(data)
			//
			from: 1,
			to:   2,
			expected: func(d []byte) []byte {
				data := make([]byte, 64)
				copy(data, d)
				return sha3hash(t, data)
			},
		}, {
			// all segment counts in [3,4] should be:
			//
			// sha3hash(
			// 	sha3hash(data[:64])
			// 	sha3hash(data[64:])
			// )
			//
			from: 3,
			to:   4,
			expected: func(d []byte) []byte {
				data := make([]byte, 128)
				copy(data, d)
				return sha3hash(t, sha3hash(t, data[:64]), sha3hash(t, data[64:]))
			},
		}, {
			// all segment counts in [5,8] should be:
			//
			// sha3hash(
			// 	sha3hash(
			// 		sha3hash(data[:64])
			// 		sha3hash(data[64:128])
			// 	)
			// 	sha3hash(
			// 		sha3hash(data[128:192])
			// 		sha3hash(data[192:])
			// 	)
			// )
			//
			from: 5,
			to:   8,
			expected: func(d []byte) []byte {
				data := make([]byte, 256)
				copy(data, d)
				return sha3hash(t, sha3hash(t, sha3hash(t, data[:64]), sha3hash(t, data[64:128])), sha3hash(t, sha3hash(t, data[128:192]), sha3hash(t, data[192:])))
			},
		},
	} {
		// exercise every input length up to the segment capacity of each count
		for segCount := x.from; segCount <= x.to; segCount++ {
			for length := 1; length <= segCount*32; length++ {
				t.Run(fmt.Sprintf("%d_segments_%d_bytes", segCount, length), func(t *testing.T) {
					// deterministic pseudo-random input, seeded by the case index
					g := mockbytes.New(i, mockbytes.MockTypeStandard)
					data, err := g.RandomBytes(length)
					if err != nil {
						t.Fatal(err)
					}
					expected := x.expected(data)
					actual, err := reference.NewRefHasher(sha3.NewLegacyKeccak256(), segCount).Hash(data)
					if err != nil {
						t.Fatal(err)
					}
					if !bytes.Equal(actual, expected) {
						t.Fatalf("expected %x, got %x", expected, actual)
					}
				})
			}
		}
	}
}
......@@ -7,9 +7,9 @@
package bmtpool
import (
bmtlegacy "github.com/ethersphere/bee/pkg/bmt/legacy"
"github.com/ethersphere/bee/pkg/bmt/pool"
"github.com/ethersphere/bee/pkg/swarm"
bmtlegacy "github.com/ethersphere/bmt/legacy"
"github.com/ethersphere/bmt/pool"
)
var instance pool.Pooler
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment