Commit 95b8441b authored by Peter Mrekaj's avatar Peter Mrekaj Committed by GitHub

refactor(kademlia,metrics): make metrics persistent writes asynchronous (#1873)

This optimization includes:
- all counters are stored in memory by default in order to minimize disc I/O operations
- counters that need to be persisted are flushed periodically to the persistent store
- switch to sync.Map which is more suitable for the case of disjoint sets of keys
- using lock-free data structures for the critical parts
parent 14ef5446
...@@ -59,6 +59,7 @@ require ( ...@@ -59,6 +59,7 @@ require (
github.com/wealdtech/go-ens/v3 v3.4.4 github.com/wealdtech/go-ens/v3 v3.4.4
gitlab.com/nolash/go-mockbytes v0.0.7 gitlab.com/nolash/go-mockbytes v0.0.7
go.opencensus.io v0.22.5 // indirect go.opencensus.io v0.22.5 // indirect
go.uber.org/atomic v1.7.0
go.uber.org/multierr v1.6.0 // indirect go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.16.0 // indirect go.uber.org/zap v1.16.0 // indirect
golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad
......
...@@ -17,18 +17,15 @@ import ( ...@@ -17,18 +17,15 @@ import (
func snapshot(t *testing.T, mc *metrics.Collector, sst time.Time, addr swarm.Address) *metrics.Snapshot { func snapshot(t *testing.T, mc *metrics.Collector, sst time.Time, addr swarm.Address) *metrics.Snapshot {
t.Helper() t.Helper()
ss, err := mc.Snapshot(sst, addr) ss := mc.Snapshot(sst, addr)
if err != nil {
t.Fatalf("Snapshot(%q, ...): unexpected error: %v", addr, err)
}
if have, want := len(ss), 1; have != want { if have, want := len(ss), 1; have != want {
t.Fatalf("Snapshot(%q, ...): length mismatch: have: %d; want: %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): length mismatch: have: %d; want: %d", addr, have, want)
} }
pm, ok := ss[addr.String()] cs, ok := ss[addr.ByteString()]
if !ok { if !ok {
t.Fatalf("Snapshot(%q, ...): missing peer metrics", addr) t.Fatalf("Snapshot(%q, ...): missing peer metrics", addr)
} }
return pm return cs
} }
func TestPeerMetricsCollector(t *testing.T) { func TestPeerMetricsCollector(t *testing.T) {
...@@ -52,20 +49,14 @@ func TestPeerMetricsCollector(t *testing.T) { ...@@ -52,20 +49,14 @@ func TestPeerMetricsCollector(t *testing.T) {
) )
// Inc session conn retry. // Inc session conn retry.
err = mc.Record(addr, metrics.IncSessionConnectionRetry()) mc.Record(addr, metrics.IncSessionConnectionRetry())
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
ss := snapshot(t, mc, t2, addr) ss := snapshot(t, mc, t2, addr)
if have, want := ss.SessionConnectionRetry, uint(1); have != want { if have, want := ss.SessionConnectionRetry, uint64(1); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
} }
// Login. // Login.
err = mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound)) mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
ss = snapshot(t, mc, t2, addr) ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want { if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
...@@ -81,10 +72,7 @@ func TestPeerMetricsCollector(t *testing.T) { ...@@ -81,10 +72,7 @@ func TestPeerMetricsCollector(t *testing.T) {
} }
// Login when already logged in. // Login when already logged in.
err = mc.Record(addr, metrics.PeerLogIn(t1.Add(1*time.Second), metrics.PeerConnectionDirectionOutbound)) mc.Record(addr, metrics.PeerLogIn(t1.Add(1*time.Second), metrics.PeerConnectionDirectionOutbound))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
ss = snapshot(t, mc, t2, addr) ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want { if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
...@@ -100,20 +88,14 @@ func TestPeerMetricsCollector(t *testing.T) { ...@@ -100,20 +88,14 @@ func TestPeerMetricsCollector(t *testing.T) {
} }
// Inc session conn retry. // Inc session conn retry.
err = mc.Record(addr, metrics.IncSessionConnectionRetry()) mc.Record(addr, metrics.IncSessionConnectionRetry())
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
ss = snapshot(t, mc, t2, addr) ss = snapshot(t, mc, t2, addr)
if have, want := ss.SessionConnectionRetry, uint(2); have != want { if have, want := ss.SessionConnectionRetry, uint64(2); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
} }
// Logout. // Logout.
err = mc.Record(addr, metrics.PeerLogOut(t3)) mc.Record(addr, metrics.PeerLogOut(t3))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
ss = snapshot(t, mc, t2, addr) ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t3.UnixNano(); have != want { if have, want := ss.LastSeenTimestamp, t3.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
...@@ -121,7 +103,7 @@ func TestPeerMetricsCollector(t *testing.T) { ...@@ -121,7 +103,7 @@ func TestPeerMetricsCollector(t *testing.T) {
if have, want := ss.ConnectionTotalDuration, t3.Sub(t1); have != want { if have, want := ss.ConnectionTotalDuration, t3.Sub(t1); have != want {
t.Fatalf("Snapshot(%q, ...): connection total duration counter mismatch: have %s; want %s", addr, have, want) t.Fatalf("Snapshot(%q, ...): connection total duration counter mismatch: have %s; want %s", addr, have, want)
} }
if have, want := ss.SessionConnectionRetry, uint(2); have != want { if have, want := ss.SessionConnectionRetry, uint64(2); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want) t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
} }
if have, want := ss.SessionConnectionDuration, t3.Sub(t1); have != want { if have, want := ss.SessionConnectionDuration, t3.Sub(t1); have != want {
...@@ -129,29 +111,24 @@ func TestPeerMetricsCollector(t *testing.T) { ...@@ -129,29 +111,24 @@ func TestPeerMetricsCollector(t *testing.T) {
} }
// Inspect. // Inspect.
if err := mc.Inspect(addr, func(have *metrics.Snapshot) { mc.Inspect(addr, func(have *metrics.Snapshot) {
want := ss want := ss
if diff := cmp.Diff(have, want); diff != "" { if diff := cmp.Diff(have, want); diff != "" {
t.Fatalf("unexpected snapshot diffrence:\n%s", diff) t.Fatalf("unexpected snapshot diffrence:\n%s", diff)
} }
}); err != nil { })
t.Fatalf("Inspect(%q, ...): unexpected error: %v", addr, err)
// Flush.
if err := mc.Flush(addr); err != nil {
t.Fatalf("Flush(): unexpected error: %v", err)
} }
// Finalize. // Finalize.
err = mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound)) mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
if err != nil { if err := mc.Finalize(t3); err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
err = mc.Finalize(t3)
if err != nil {
t.Fatalf("Finalize(%s): unexpected error: %v", t3, err) t.Fatalf("Finalize(%s): unexpected error: %v", t3, err)
} }
snapshots, err := mc.Snapshot(t2, addr) if have, want := len(mc.Snapshot(t2, addr)), 0; have != want {
if err != nil {
t.Fatalf("Snapshot(%q, ...): unexpected error: %v", addr, err)
}
if have, want := len(snapshots), 0; have != want {
t.Fatalf("Finalize(%s): counters length mismatch: have %d; want %d", t3, have, want) t.Fatalf("Finalize(%s): counters length mismatch: have %d; want %d", t3, have, want)
} }
} }
...@@ -364,12 +364,7 @@ func (k *Kad) connectionAttemptsHandler(ctx context.Context, wg *sync.WaitGroup, ...@@ -364,12 +364,7 @@ func (k *Kad) connectionAttemptsHandler(ctx context.Context, wg *sync.WaitGroup,
k.connectedPeers.Add(peer.addr, peer.po) k.connectedPeers.Add(peer.addr, peer.po)
if err := k.collector.Record( k.collector.Record(peer.addr, metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionOutbound))
peer.addr,
metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionOutbound),
); err != nil {
k.logger.Debugf("kademlia: unable to record login outbound metrics for %q: %v", peer.addr, err)
}
k.depthMu.Lock() k.depthMu.Lock()
k.depth = recalcDepth(k.connectedPeers, k.radius) k.depth = recalcDepth(k.connectedPeers, k.radius)
...@@ -447,6 +442,9 @@ func (k *Kad) manage() { ...@@ -447,6 +442,9 @@ func (k *Kad) manage() {
case <-k.quit: case <-k.quit:
return return
case <-time.After(30 * time.Second): case <-time.After(30 * time.Second):
if err := k.collector.Flush(); err != nil {
k.logger.Debugf("kademlia: unable to flush metrics counters to the persistent store: %v", err)
}
k.notifyManageLoop() k.notifyManageLoop()
case <-k.manageC: case <-k.manageC:
start := time.Now() start := time.Now()
...@@ -663,11 +661,9 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr) ...@@ -663,11 +661,9 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr)
failedAttempts++ failedAttempts++
} }
if err := k.collector.Record(peer, metrics.IncSessionConnectionRetry()); err != nil { k.collector.Record(peer, metrics.IncSessionConnectionRetry())
k.logger.Debugf("kademlia: unable to record session connection retry metrics for %q: %v", peer, err)
}
if err := k.collector.Inspect(peer, func(ss *metrics.Snapshot) { k.collector.Inspect(peer, func(ss *metrics.Snapshot) {
quickPrune := ss == nil || ss.HasAtMaxOneConnectionAttempt() quickPrune := ss == nil || ss.HasAtMaxOneConnectionAttempt()
if (k.connectedPeers.Length() > 0 && quickPrune) || failedAttempts > maxConnAttempts { if (k.connectedPeers.Length() > 0 && quickPrune) || failedAttempts > maxConnAttempts {
...@@ -680,9 +676,7 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr) ...@@ -680,9 +676,7 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr)
} else { } else {
k.waitNext.Set(peer, retryTime, failedAttempts) k.waitNext.Set(peer, retryTime, failedAttempts)
} }
}); err != nil { })
k.logger.Debugf("kademlia: connect: unable to inspect snapshot for %q: %v", peer, err)
}
return err return err
case !i.Overlay.Equal(peer): case !i.Overlay.Equal(peer):
...@@ -802,12 +796,7 @@ func (k *Kad) connected(ctx context.Context, addr swarm.Address) error { ...@@ -802,12 +796,7 @@ func (k *Kad) connected(ctx context.Context, addr swarm.Address) error {
k.knownPeers.Add(addr, po) k.knownPeers.Add(addr, po)
k.connectedPeers.Add(addr, po) k.connectedPeers.Add(addr, po)
if err := k.collector.Record( k.collector.Record(addr, metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionInbound))
addr,
metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionInbound),
); err != nil {
k.logger.Debugf("kademlia: unable to record login inbound metrics for %q: %v", addr, err)
}
k.waitNext.Remove(addr) k.waitNext.Remove(addr)
...@@ -830,12 +819,7 @@ func (k *Kad) Disconnected(peer p2p.Peer) { ...@@ -830,12 +819,7 @@ func (k *Kad) Disconnected(peer p2p.Peer) {
k.waitNext.SetTryAfter(peer.Address, time.Now().Add(timeToRetry)) k.waitNext.SetTryAfter(peer.Address, time.Now().Add(timeToRetry))
if err := k.collector.Record( k.collector.Record(peer.Address, metrics.PeerLogOut(time.Now()))
peer.Address,
metrics.PeerLogOut(time.Now()),
); err != nil {
k.logger.Debugf("kademlia: unable to record logout metrics for %q: %v", peer.Address, err)
}
k.depthMu.Lock() k.depthMu.Lock()
k.depth = recalcDepth(k.connectedPeers, k.radius) k.depth = recalcDepth(k.connectedPeers, k.radius)
...@@ -1097,10 +1081,7 @@ func (k *Kad) Snapshot() *topology.KadParams { ...@@ -1097,10 +1081,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos = append(infos, topology.BinInfo{}) infos = append(infos, topology.BinInfo{})
} }
ss, err := k.collector.Snapshot(time.Now()) ss := k.collector.Snapshot(time.Now())
if err != nil {
k.logger.Debugf("kademlia: unable to take metrics snapshot: %v", err)
}
_ = k.connectedPeers.EachBin(func(addr swarm.Address, po uint8) (bool, bool, error) { _ = k.connectedPeers.EachBin(func(addr swarm.Address, po uint8) (bool, bool, error) {
infos[po].BinConnected++ infos[po].BinConnected++
...@@ -1108,7 +1089,7 @@ func (k *Kad) Snapshot() *topology.KadParams { ...@@ -1108,7 +1089,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos[po].ConnectedPeers, infos[po].ConnectedPeers,
&topology.PeerInfo{ &topology.PeerInfo{
Address: addr, Address: addr,
Metrics: createMetricsSnapshotView(ss[addr.String()]), Metrics: createMetricsSnapshotView(ss[addr.ByteString()]),
}, },
) )
return false, false, nil return false, false, nil
...@@ -1129,7 +1110,7 @@ func (k *Kad) Snapshot() *topology.KadParams { ...@@ -1129,7 +1110,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos[po].DisconnectedPeers, infos[po].DisconnectedPeers,
&topology.PeerInfo{ &topology.PeerInfo{
Address: addr, Address: addr,
Metrics: createMetricsSnapshotView(ss[addr.String()]), Metrics: createMetricsSnapshotView(ss[addr.ByteString()]),
}, },
) )
return false, false, nil return false, false, nil
......
...@@ -73,7 +73,7 @@ type PeerInfo struct { ...@@ -73,7 +73,7 @@ type PeerInfo struct {
// MetricSnapshotView represents snapshot of metrics counters in more human readable form. // MetricSnapshotView represents snapshot of metrics counters in more human readable form.
type MetricSnapshotView struct { type MetricSnapshotView struct {
LastSeenTimestamp int64 `json:"lastSeenTimestamp"` LastSeenTimestamp int64 `json:"lastSeenTimestamp"`
SessionConnectionRetry uint `json:"sessionConnectionRetry"` SessionConnectionRetry uint64 `json:"sessionConnectionRetry"`
ConnectionTotalDuration float64 `json:"connectionTotalDuration"` ConnectionTotalDuration float64 `json:"connectionTotalDuration"`
SessionConnectionDuration float64 `json:"sessionConnectionDuration"` SessionConnectionDuration float64 `json:"sessionConnectionDuration"`
SessionConnectionDirection string `json:"sessionConnectionDirection"` SessionConnectionDirection string `json:"sessionConnectionDirection"`
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment