Commit 425891bb authored by Peter Mrekaj, committed by GitHub

refactor(kademlia): build topology peer connections concurrently (#1647)

Decreases the time needed to build the topology by making the connection attempts to peers concurrently.
parent f3cb3f2e
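The change replaces the previous one-at-a-time dial loop with a producer/consumer pattern: connectBalanced and connectNeighbours only produce candidate peers, a pool of worker goroutines does the actual dialing, and a WaitGroup lets the manage loop wait for all attempts before falling back to boot-nodes. Below is a minimal, self-contained sketch of that fan-out pattern; the peer type, the dialPeer helper, and the fixed worker count are illustrative stand-ins, not the bee API (the real handler starts one worker per bin and de-duplicates in-flight peers).

package main

import (
	"fmt"
	"sync"
	"time"
)

// peerConnInfo mirrors the struct introduced by this commit: the bin (po)
// and the address of a candidate peer.
type peerConnInfo struct {
	po   uint8
	addr string
}

// dialPeer is a hypothetical stand-in for the real connect logic.
func dialPeer(p *peerConnInfo) {
	time.Sleep(100 * time.Millisecond) // simulate network latency
	fmt.Printf("connected to %s (bin %d)\n", p.addr, p.po)
}

func main() {
	const workers = 8
	var wg sync.WaitGroup
	peerConnChan := make(chan *peerConnInfo)

	// Consumers: a fixed pool of workers dials candidates concurrently.
	for i := 0; i < workers; i++ {
		go func() {
			for peer := range peerConnChan {
				dialPeer(peer)
				wg.Done()
			}
		}()
	}

	// Producer: previously each of these attempts blocked the next one.
	for i := 0; i < 32; i++ {
		wg.Add(1)
		peerConnChan <- &peerConnInfo{po: uint8(i % 4), addr: fmt.Sprintf("peer-%d", i)}
	}

	wg.Wait() // all attempts finished, e.g. before trying boot-nodes
	close(peerConnChan)
}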
@@ -6,6 +6,7 @@ package kademlia
 import (
 	"context"
+	random "crypto/rand"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -15,8 +16,6 @@ import (
 	"sync"
 	"time"
 
-	random "crypto/rand"
-
 	"github.com/ethersphere/bee/pkg/addressbook"
 	"github.com/ethersphere/bee/pkg/discovery"
 	"github.com/ethersphere/bee/pkg/logging"
@@ -35,15 +34,15 @@ const (
 )
 
 var (
-	errMissingAddressBookEntry = errors.New("addressbook underlay entry not found")
-	errOverlayMismatch         = errors.New("overlay mismatch")
-	timeToRetry                = 60 * time.Second
-	shortRetry                 = 30 * time.Second
 	saturationPeers     = 4
 	overSaturationPeers = 16
+	shortRetry          = 30 * time.Second
+	timeToRetry         = 2 * shortRetry
 	broadcastBinSize    = 4
 )
 
+var errOverlayMismatch = errors.New("overlay mismatch")
+
 type binSaturationFunc func(bin uint8, peers, connected *pslice.PSlice) (saturated bool, oversaturated bool)
 type sanctionedPeerFunc func(peer swarm.Address) bool
@@ -75,7 +74,7 @@ type Kad struct {
 	depthMu    sync.RWMutex         // protect depth changes
 	manageC    chan struct{}        // trigger the manage forever loop to connect to new peers
 	waitNext   map[string]retryInfo // sanction connections to a peer, key is overlay string and value is a retry information
-	waitNextMu sync.Mutex           // synchronize map
+	waitNextMu sync.Mutex           // guards waitNext map
 	peerSig    []chan struct{}
 	peerSigMtx sync.Mutex
 	logger     logging.Logger       // logger
@@ -199,98 +198,57 @@ func (k *Kad) generateCommonBinPrefixes() {
 // Clears the bit at pos in n.
 func clearBit(n, pos uint8) uint8 {
 	mask := ^(uint8(1) << pos)
-	n &= mask
-	return n
+	return n & mask
 }
 
 // Sets the bit at pos in the integer n.
 func setBit(n, pos uint8) uint8 {
-	n |= (1 << pos)
-	return n
+	return n | 1<<pos
 }
 
 func hasBit(n, pos uint8) bool {
-	val := n & (1 << pos)
-	return (val > 0)
+	return n&(1<<pos) > 0
 }
 
-// manage is a forever loop that manages the connection to new peers
-// once they get added or once others leave.
-func (k *Kad) manage() {
-	var (
-		peerToRemove swarm.Address
-		start        time.Time
-		spf          = func(peer swarm.Address) bool {
-			k.waitNextMu.Lock()
-			defer k.waitNextMu.Unlock()
-			if next, ok := k.waitNext[peer.String()]; ok && time.Now().Before(next.tryAfter) {
-				return true
-			}
-			return false
-		}
-	)
-
-	defer k.wg.Done()
-	defer close(k.done)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	go func() {
-		<-k.quit
-		cancel()
-	}()
-
-	for {
-		select {
-		case <-k.quit:
-			return
-		case <-time.After(30 * time.Second):
-			// periodically try to connect to new peers
-			select {
-			case k.manageC <- struct{}{}:
-			default:
-			}
-		case <-k.manageC:
-			start = time.Now()
-			select {
-			case <-k.quit:
-				return
-			default:
-			}
-			if k.standalone {
-				continue
-			}
-
-			// attempt balanced connection first
-			err := func() error {
-				// for each bin
-				for i := range k.commonBinPrefixes {
-					// and each pseudo address
-					for j := range k.commonBinPrefixes[i] {
-						pseudoAddr := k.commonBinPrefixes[i][j]
-						closestConnectedPeer, err := closestPeer(k.connectedPeers, pseudoAddr, noopSanctionedPeerFn, swarm.ZeroAddress)
-						if err != nil {
-							if errors.Is(err, topology.ErrNotFound) {
-								break
-							}
-							k.logger.Errorf("closest connected peer: %v", err)
-							continue
-						}
-
-						// check proximity
-						closestConnectedPO := swarm.ExtendedProximity(closestConnectedPeer.Bytes(), pseudoAddr.Bytes())
-
-						if int(closestConnectedPO) < i+k.bitSuffixLength+1 {
-							// connect to closest known peer which we haven't tried connecting
-							// to recently
-							closestKnownPeer, err := closestPeer(k.knownPeers, pseudoAddr, spf, swarm.ZeroAddress)
-							if err != nil {
-								if errors.Is(err, topology.ErrNotFound) {
-									break
-								}
-								k.logger.Errorf("closest known peer: %v", err)
-								continue
-							}
+// peerConnInfo groups necessary fields needed to create a connection.
+type peerConnInfo struct {
+	po   uint8
+	addr swarm.Address
+}
+
+// connectBalanced attempts to connect to the balanced peers first.
+func (k *Kad) connectBalanced(wg *sync.WaitGroup, peerConnChan chan<- *peerConnInfo) {
+	skipPeers := func(peer swarm.Address) bool {
+		k.waitNextMu.Lock()
+		defer k.waitNextMu.Unlock()
+		next, ok := k.waitNext[peer.String()]
+		return ok && time.Now().Before(next.tryAfter)
+	}
+
+	for i := range k.commonBinPrefixes {
+		for j := range k.commonBinPrefixes[i] {
+			pseudoAddr := k.commonBinPrefixes[i][j]
+			closestConnectedPeer, err := closestPeer(k.connectedPeers, pseudoAddr, noopSanctionedPeerFn)
+			if err != nil {
+				if errors.Is(err, topology.ErrNotFound) {
+					break
+				}
+				k.logger.Errorf("closest connected peer: %v", err)
+				continue
+			}
+
+			closestConnectedPO := swarm.ExtendedProximity(closestConnectedPeer.Bytes(), pseudoAddr.Bytes())
+			if int(closestConnectedPO) >= i+k.bitSuffixLength+1 {
+				continue
+			}
+
+			// Connect to closest known peer which we haven't tried connecting to recently.
+			closestKnownPeer, err := closestPeer(k.knownPeers, pseudoAddr, skipPeers)
+			if err != nil {
+				if errors.Is(err, topology.ErrNotFound) {
+					break
+				}
+				k.logger.Errorf("closest known peer: %v", err)
+				continue
+			}
@@ -300,170 +258,211 @@ func (k *Kad) manage() {
 			}
 			closestKnownPeerPO := swarm.ExtendedProximity(closestKnownPeer.Bytes(), pseudoAddr.Bytes())
 			if int(closestKnownPeerPO) < i+k.bitSuffixLength+1 {
 				continue
 			}
-			peer := closestKnownPeer
+			select {
+			case <-k.quit:
+				return
+			default:
+				wg.Add(1)
+				peerConnChan <- &peerConnInfo{
+					po:   swarm.Proximity(k.base.Bytes(), closestKnownPeer.Bytes()),
+					addr: closestKnownPeer,
+				}
+			}
+			break
+		}
+	}
+}
-			bzzAddr, err := k.addressBook.Get(peer)
-			if err != nil {
-				if err == addressbook.ErrNotFound {
-					k.logger.Debugf("failed to get address book entry for peer: %s", peer.String())
-					peerToRemove = peer
-					return errMissingAddressBookEntry
-				}
-				// either a peer is not known in the address book, in which case it
-				// should be removed, or that some severe I/O problem is at hand
-				return err
-			}
-
-			po := swarm.Proximity(k.base.Bytes(), peer.Bytes())
-
-			err = k.connect(ctx, peer, bzzAddr.Underlay, po)
-			if err != nil {
-				if errors.Is(err, errOverlayMismatch) {
-					k.knownPeers.Remove(peer, po)
-					if err := k.addressBook.Remove(peer); err != nil {
-						k.logger.Debugf("could not remove peer from addressbook: %s", peer.String())
-					}
-				}
-				k.logger.Debugf("peer not reachable from kademlia %s: %v", bzzAddr.String(), err)
-				k.logger.Warningf("peer not reachable when attempting to connect")
-
-				k.waitNextMu.Lock()
-				if _, ok := k.waitNext[peer.String()]; !ok {
-					// don't override existing data in the map
-					k.waitNext[peer.String()] = retryInfo{tryAfter: time.Now().Add(timeToRetry)}
-				}
-				k.waitNextMu.Unlock()
-				// continue to next
-				continue
-			}
-
-			k.waitNextMu.Lock()
-			k.waitNext[peer.String()] = retryInfo{tryAfter: time.Now().Add(shortRetry)}
-			k.waitNextMu.Unlock()
-
-			k.connectedPeers.Add(peer, po)
-
-			k.depthMu.Lock()
-			k.depth = recalcDepth(k.connectedPeers, k.radius)
-			k.depthMu.Unlock()
-
-			k.logger.Debugf("connected to peer: %s for bin: %d", peer, i)
-
-			k.notifyPeerSig()
-		}
-	}
-	return nil
-}()
-
-k.logger.Tracef("kademlia balanced connector took %s to finish", time.Since(start))
-if err != nil {
-	if errors.Is(err, errMissingAddressBookEntry) {
-		po := swarm.Proximity(k.base.Bytes(), peerToRemove.Bytes())
-		k.knownPeers.Remove(peerToRemove, po)
-	} else {
-		k.logger.Errorf("kademlia manage loop iterator: %v", err)
-	}
-}
+// connectNeighbours attempts to connect to the neighbours
+// which were not considered by the connectBalanced method.
+func (k *Kad) connectNeighbours(wg *sync.WaitGroup, peerConnChan chan<- *peerConnInfo) {
+	// The topology.EachPeerFunc doesn't return an error
+	// so we ignore the error returned from EachBinRev.
+	_ = k.knownPeers.EachBinRev(func(addr swarm.Address, po uint8) (bool, bool, error) {
+		if k.connectedPeers.Exists(addr) {
+			return false, false, nil
+		}
+
+		k.waitNextMu.Lock()
+		if next, ok := k.waitNext[addr.String()]; ok && time.Now().Before(next.tryAfter) {
+			k.waitNextMu.Unlock()
+			return false, false, nil
+		}
+		k.waitNextMu.Unlock()
+
+		if saturated, _ := k.saturationFunc(po, k.knownPeers, k.connectedPeers); saturated {
+			return false, true, nil // Bin is saturated, skip to next bin.
+		}
+
+		select {
+		case <-k.quit:
+			return true, false, nil
+		default:
+			wg.Add(1)
+			peerConnChan <- &peerConnInfo{
+				po:   po,
+				addr: addr,
+			}
+		}
+
+		// The bin could be saturated or not, so a decision cannot
+		// be made before checking the next peer, so we iterate to next.
+		return false, true, nil
+	})
+}
+
+// connectionAttemptsHandler handles the connection attempts
+// to peers sent by the producers to the peerConnChan.
+func (k *Kad) connectionAttemptsHandler(ctx context.Context, wg *sync.WaitGroup, peerConnChan <-chan *peerConnInfo) {
+	connect := func(peer *peerConnInfo) {
+		bzzAddr, err := k.addressBook.Get(peer.addr)
+		switch {
+		case errors.Is(err, addressbook.ErrNotFound):
+			k.logger.Debugf("empty address book entry for peer %q", peer.addr)
+			po := swarm.Proximity(k.base.Bytes(), peer.addr.Bytes())
+			k.knownPeers.Remove(peer.addr, po)
+			return
+		case err != nil:
+			k.logger.Debugf("failed to get address book entry for peer %q: %v", peer.addr, err)
+			return
+		}
+
+		switch err = k.connect(ctx, peer.addr, bzzAddr.Underlay); {
+		case errors.Is(err, errOverlayMismatch):
+			k.logger.Debugf("overlay mismatch has occurred to an overlay %q with underlay %q", peer.addr, bzzAddr.Underlay)
+			k.waitNextMu.Lock()
+			delete(k.waitNext, peer.addr.String())
+			k.waitNextMu.Unlock()
+			k.knownPeers.Remove(peer.addr, peer.po)
+			if err := k.addressBook.Remove(peer.addr); err != nil {
+				k.logger.Debugf("could not remove peer %q from addressbook", peer.addr)
+			}
+			fallthrough
+		case err != nil:
+			k.logger.Debugf("peer not reachable from kademlia %q: %v", bzzAddr, err)
+			k.logger.Warningf("peer not reachable when attempting to connect")
+			return
+		}
+
+		k.waitNextMu.Lock()
+		k.waitNext[peer.addr.String()] = retryInfo{tryAfter: time.Now().Add(shortRetry)}
+		k.waitNextMu.Unlock()
+
+		k.connectedPeers.Add(peer.addr, peer.po)
+
+		k.depthMu.Lock()
+		k.depth = recalcDepth(k.connectedPeers, k.radius)
+		k.depthMu.Unlock()
+
+		select {
+		case k.manageC <- struct{}{}:
+		default:
+		}
+
+		k.logger.Debugf("connected to peer: %q for bin: %d", peer.addr, peer.po)
+		k.notifyPeerSig()
+	}
-			err = k.knownPeers.EachBinRev(func(peer swarm.Address, po uint8) (bool, bool, error) {
-				if k.connectedPeers.Exists(peer) {
-					return false, false, nil
-				}
-
-				k.waitNextMu.Lock()
-				if next, ok := k.waitNext[peer.String()]; ok && time.Now().Before(next.tryAfter) {
-					k.waitNextMu.Unlock()
-					return false, false, nil
-				}
-				k.waitNextMu.Unlock()
-
-				currentDepth := k.NeighborhoodDepth()
-				if saturated, _ := k.saturationFunc(po, k.knownPeers, k.connectedPeers); saturated {
-					return false, true, nil // bin is saturated, skip to next bin
-				}
-
-				bzzAddr, err := k.addressBook.Get(peer)
-				if err != nil {
-					if err == addressbook.ErrNotFound {
-						k.logger.Debugf("failed to get address book entry for peer: %s", peer.String())
-						peerToRemove = peer
-						return false, false, errMissingAddressBookEntry
-					}
-					// either a peer is not known in the address book, in which case it
-					// should be removed, or that some severe I/O problem is at hand
-					return false, false, err
-				}
-
-				err = k.connect(ctx, peer, bzzAddr.Underlay, po)
-				if err != nil {
-					if errors.Is(err, errOverlayMismatch) {
-						k.knownPeers.Remove(peer, po)
-						if err := k.addressBook.Remove(peer); err != nil {
-							k.logger.Debugf("could not remove peer from addressbook: %s", peer.String())
-						}
-					}
-					k.logger.Debugf("peer not reachable from kademlia %s: %v", bzzAddr.String(), err)
-					k.logger.Warningf("peer not reachable when attempting to connect")
-
-					k.waitNextMu.Lock()
-					if _, ok := k.waitNext[peer.String()]; !ok {
-						// don't override existing data in the map
-						k.waitNext[peer.String()] = retryInfo{tryAfter: time.Now().Add(timeToRetry)}
-					}
-					k.waitNextMu.Unlock()
-					// continue to next
-					return false, false, nil
-				}
+	var (
+		// The inProgress helps to avoid making a connection
+		// to a peer who has the connection already in progress.
+		inProgress   = make(map[string]bool)
+		inProgressMu sync.Mutex
+	)
+	for i := 0; i < int(swarm.MaxBins); i++ {
+		go func() {
+			for {
+				select {
+				case <-k.quit:
+					return
+				case peer := <-peerConnChan:
+					addr := peer.addr.String()
+
+					// Check if the peer was penalized.
+					k.waitNextMu.Lock()
+					next, ok := k.waitNext[addr]
+					if ok && time.Now().Before(next.tryAfter) {
+						k.waitNextMu.Unlock()
+						wg.Done()
+						continue
+					}
+					k.waitNextMu.Unlock()
+
+					inProgressMu.Lock()
+					if !inProgress[addr] {
+						inProgress[addr] = true
+						inProgressMu.Unlock()
+						connect(peer)
+						inProgressMu.Lock()
+						delete(inProgress, addr)
+					}
+					inProgressMu.Unlock()
+					wg.Done()
+				}
+			}
+		}()
+	}
+}
-				k.waitNextMu.Lock()
-				k.waitNext[peer.String()] = retryInfo{tryAfter: time.Now().Add(shortRetry)}
-				k.waitNextMu.Unlock()
-
-				k.connectedPeers.Add(peer, po)
-
-				k.depthMu.Lock()
-				k.depth = recalcDepth(k.connectedPeers, k.radius)
-				k.depthMu.Unlock()
-
-				k.logger.Debugf("connected to peer: %s old depth: %d new depth: %d", peer, currentDepth, k.NeighborhoodDepth())
-
-				k.notifyPeerSig()
-
-				select {
-				case <-k.quit:
-					return true, false, nil
-				default:
-				}
-
-				// the bin could be saturated or not, so a decision cannot
-				// be made before checking the next peer, so we iterate to next
-				return false, false, nil
-			})
-			k.logger.Tracef("kademlia iterator took %s to finish", time.Since(start))
-			if err != nil {
-				if errors.Is(err, errMissingAddressBookEntry) {
-					po := swarm.Proximity(k.base.Bytes(), peerToRemove.Bytes())
-					k.knownPeers.Remove(peerToRemove, po)
-				} else {
-					k.logger.Errorf("kademlia manage loop iterator: %v", err)
-				}
-			}
+// manage is a forever loop that manages the connection to new peers
+// once they get added or once others leave.
+func (k *Kad) manage() {
+	defer k.wg.Done()
+	defer close(k.done)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		<-k.quit
+		cancel()
+	}()
+
+	// The wg makes sure that we wait for all the connection attempts,
+	// spun up by goroutines, to finish before we try the boot-nodes.
+	var wg sync.WaitGroup
+	var peerConnChan = make(chan *peerConnInfo)
+	go k.connectionAttemptsHandler(ctx, &wg, peerConnChan)
+
+	for {
+		select {
+		case <-k.quit:
+			return
+		case <-time.After(30 * time.Second):
+			select {
+			case k.manageC <- struct{}{}:
+			default:
+			}
+		case <-k.manageC:
+			start := time.Now()
+			select {
+			case <-k.quit:
+				return
+			default:
+			}
+			if k.standalone {
+				continue
+			}
+
+			oldDepth := k.NeighborhoodDepth()
+			k.connectBalanced(&wg, peerConnChan)
+			k.connectNeighbours(&wg, peerConnChan)
+			wg.Wait()
+
+			k.logger.Tracef(
+				"kademlia: connector took %s to finish: old depth %d; new depth %d",
+				time.Since(start),
+				oldDepth,
+				k.NeighborhoodDepth(),
+			)
 			if k.connectedPeers.Length() == 0 {
-				k.logger.Debug("kademlia has no connected peers, trying bootnodes")
+				k.logger.Debug("kademlia: no connected peers, trying bootnodes")
 				k.connectBootnodes(ctx)
 			}
 		}
 	}
 }
@@ -614,7 +613,7 @@ func recalcDepth(peers *pslice.PSlice, radius uint8) uint8 {
 // connect connects to a peer and gossips its address to our connected peers,
 // as well as sends the peers we are connected to to the newly connected peer
-func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr, po uint8) error {
+func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr) error {
 	k.logger.Infof("attempting to connect to peer %s", peer)
 
 	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
 	defer cancel()
@@ -821,14 +820,9 @@ func (k *Kad) notifyPeerSig() {
 	}
 }
 
-func closestPeer(peers *pslice.PSlice, addr swarm.Address, spf sanctionedPeerFunc, skipPeers ...swarm.Address) (swarm.Address, error) {
-	closest := swarm.Address{}
+func closestPeer(peers *pslice.PSlice, addr swarm.Address, spf sanctionedPeerFunc) (swarm.Address, error) {
+	closest := swarm.ZeroAddress
 	err := peers.EachBinRev(func(peer swarm.Address, po uint8) (bool, bool, error) {
-		for _, a := range skipPeers {
-			if a.Equal(peer) {
-				return false, false, nil
-			}
-		}
-
 		// check whether peer is sanctioned
 		if spf(peer) {
 			return false, false, nil
@@ -854,12 +848,12 @@ func closestPeer(peers *pslice.PSlice, addr swarm.Address, spf sanctionedPeerFun
 		return false, false, nil
 	})
 	if err != nil {
-		return swarm.Address{}, err
+		return swarm.ZeroAddress, err
 	}
 
 	// check if found
 	if closest.IsZero() {
-		return swarm.Address{}, topology.ErrNotFound
+		return swarm.ZeroAddress, topology.ErrNotFound
 	}
 
 	return closest, nil
@@ -1014,10 +1008,6 @@ func (k *Kad) NeighborhoodDepth() uint8 {
 	k.depthMu.RLock()
 	defer k.depthMu.RUnlock()
 
-	return k.neighborhoodDepth()
-}
-
-func (k *Kad) neighborhoodDepth() uint8 {
 	return k.depth
 }
@@ -1033,7 +1023,7 @@ func (k *Kad) IsBalanced(bin uint8) bool {
 	// for each pseudo address
 	for i := range k.commonBinPrefixes[bin] {
 		pseudoAddr := k.commonBinPrefixes[bin][i]
-		closestConnectedPeer, err := closestPeer(k.connectedPeers, pseudoAddr, noopSanctionedPeerFn, swarm.ZeroAddress)
+		closestConnectedPeer, err := closestPeer(k.connectedPeers, pseudoAddr, noopSanctionedPeerFn)
		if err != nil {
 			return false
 		}
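The connectionAttemptsHandler introduced above guards each dial with an inProgress map so that two workers never dial the same peer at the same time. The following is a minimal standalone sketch of that guard under assumed, illustrative names (dialer, dialOnce); it is not the bee API.

package main

import (
	"fmt"
	"sync"
	"time"
)

type dialer struct {
	mu         sync.Mutex
	inProgress map[string]bool
}

// dialOnce runs fn(addr) unless another goroutine is already dialing addr.
func (d *dialer) dialOnce(addr string, fn func(string)) {
	d.mu.Lock()
	if d.inProgress[addr] {
		d.mu.Unlock()
		return // a connection attempt to this peer is already running
	}
	d.inProgress[addr] = true
	d.mu.Unlock()

	fn(addr) // the potentially slow network call happens outside the lock

	d.mu.Lock()
	delete(d.inProgress, addr)
	d.mu.Unlock()
}

func main() {
	d := &dialer{inProgress: make(map[string]bool)}
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Goroutines that arrive while a dial to peer-a is in flight skip it.
			d.dialOnce("peer-a", func(addr string) {
				time.Sleep(50 * time.Millisecond)
				fmt.Println("dialed", addr)
			})
		}()
	}
	wg.Wait()
}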