Commit 95b8441b authored by Peter Mrekaj's avatar Peter Mrekaj Committed by GitHub

refactor(kademlia,metrics): make metrics persistent writes asynchronous (#1873)

This optimization includes:
- all counters are stored in memory by default in order to minimize disk I/O operations
- counters that need to be persisted are flushed periodically to the persistent store
- switch to sync.Map which is more suitable for the case of disjoint sets of keys
- using lock-free data structures for the critical parts
parent 14ef5446
......@@ -59,6 +59,7 @@ require (
github.com/wealdtech/go-ens/v3 v3.4.4
gitlab.com/nolash/go-mockbytes v0.0.7
go.opencensus.io v0.22.5 // indirect
go.uber.org/atomic v1.7.0
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.16.0 // indirect
golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad
......
......@@ -17,18 +17,15 @@ import (
// snapshot is a test helper that takes a metrics snapshot for the given
// address and asserts that exactly one peer entry is present, returning
// that peer's counters. It fails the test immediately on any mismatch.
//
// NOTE(review): the diff extraction had interleaved the pre-change lines
// (error-returning Snapshot, addr.String() key) with the post-change ones;
// this is the coherent post-commit version, which keys the snapshot map by
// addr.ByteString() and uses the error-free Snapshot signature.
func snapshot(t *testing.T, mc *metrics.Collector, sst time.Time, addr swarm.Address) *metrics.Snapshot {
	t.Helper()

	ss := mc.Snapshot(sst, addr)
	if have, want := len(ss), 1; have != want {
		t.Fatalf("Snapshot(%q, ...): length mismatch: have: %d; want: %d", addr, have, want)
	}
	cs, ok := ss[addr.ByteString()]
	if !ok {
		t.Fatalf("Snapshot(%q, ...): missing peer metrics", addr)
	}
	return cs
}
func TestPeerMetricsCollector(t *testing.T) {
......@@ -52,20 +49,14 @@ func TestPeerMetricsCollector(t *testing.T) {
)
// Inc session conn retry.
err = mc.Record(addr, metrics.IncSessionConnectionRetry())
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
mc.Record(addr, metrics.IncSessionConnectionRetry())
ss := snapshot(t, mc, t2, addr)
if have, want := ss.SessionConnectionRetry, uint(1); have != want {
if have, want := ss.SessionConnectionRetry, uint64(1); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
}
// Login.
err = mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
......@@ -81,10 +72,7 @@ func TestPeerMetricsCollector(t *testing.T) {
}
// Login when already logged in.
err = mc.Record(addr, metrics.PeerLogIn(t1.Add(1*time.Second), metrics.PeerConnectionDirectionOutbound))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
mc.Record(addr, metrics.PeerLogIn(t1.Add(1*time.Second), metrics.PeerConnectionDirectionOutbound))
ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t1.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
......@@ -100,20 +88,14 @@ func TestPeerMetricsCollector(t *testing.T) {
}
// Inc session conn retry.
err = mc.Record(addr, metrics.IncSessionConnectionRetry())
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
mc.Record(addr, metrics.IncSessionConnectionRetry())
ss = snapshot(t, mc, t2, addr)
if have, want := ss.SessionConnectionRetry, uint(2); have != want {
if have, want := ss.SessionConnectionRetry, uint64(2); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
}
// Logout.
err = mc.Record(addr, metrics.PeerLogOut(t3))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
mc.Record(addr, metrics.PeerLogOut(t3))
ss = snapshot(t, mc, t2, addr)
if have, want := ss.LastSeenTimestamp, t3.UnixNano(); have != want {
t.Fatalf("Snapshot(%q, ...): last seen counter mismatch: have %d; want %d", addr, have, want)
......@@ -121,7 +103,7 @@ func TestPeerMetricsCollector(t *testing.T) {
if have, want := ss.ConnectionTotalDuration, t3.Sub(t1); have != want {
t.Fatalf("Snapshot(%q, ...): connection total duration counter mismatch: have %s; want %s", addr, have, want)
}
if have, want := ss.SessionConnectionRetry, uint(2); have != want {
if have, want := ss.SessionConnectionRetry, uint64(2); have != want {
t.Fatalf("Snapshot(%q, ...): session connection retry counter mismatch: have %d; want %d", addr, have, want)
}
if have, want := ss.SessionConnectionDuration, t3.Sub(t1); have != want {
......@@ -129,29 +111,24 @@ func TestPeerMetricsCollector(t *testing.T) {
}
// Inspect.
if err := mc.Inspect(addr, func(have *metrics.Snapshot) {
mc.Inspect(addr, func(have *metrics.Snapshot) {
want := ss
if diff := cmp.Diff(have, want); diff != "" {
t.Fatalf("unexpected snapshot diffrence:\n%s", diff)
}
}); err != nil {
t.Fatalf("Inspect(%q, ...): unexpected error: %v", addr, err)
})
// Flush.
if err := mc.Flush(addr); err != nil {
t.Fatalf("Flush(): unexpected error: %v", err)
}
// Finalize.
err = mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
if err != nil {
t.Fatalf("Record(%q, ...): unexpected error: %v", addr, err)
}
err = mc.Finalize(t3)
if err != nil {
mc.Record(addr, metrics.PeerLogIn(t1, metrics.PeerConnectionDirectionInbound))
if err := mc.Finalize(t3); err != nil {
t.Fatalf("Finalize(%s): unexpected error: %v", t3, err)
}
snapshots, err := mc.Snapshot(t2, addr)
if err != nil {
t.Fatalf("Snapshot(%q, ...): unexpected error: %v", addr, err)
}
if have, want := len(snapshots), 0; have != want {
if have, want := len(mc.Snapshot(t2, addr)), 0; have != want {
t.Fatalf("Finalize(%s): counters length mismatch: have %d; want %d", t3, have, want)
}
}
......@@ -364,12 +364,7 @@ func (k *Kad) connectionAttemptsHandler(ctx context.Context, wg *sync.WaitGroup,
k.connectedPeers.Add(peer.addr, peer.po)
if err := k.collector.Record(
peer.addr,
metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionOutbound),
); err != nil {
k.logger.Debugf("kademlia: unable to record login outbound metrics for %q: %v", peer.addr, err)
}
k.collector.Record(peer.addr, metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionOutbound))
k.depthMu.Lock()
k.depth = recalcDepth(k.connectedPeers, k.radius)
......@@ -447,6 +442,9 @@ func (k *Kad) manage() {
case <-k.quit:
return
case <-time.After(30 * time.Second):
if err := k.collector.Flush(); err != nil {
k.logger.Debugf("kademlia: unable to flush metrics counters to the persistent store: %v", err)
}
k.notifyManageLoop()
case <-k.manageC:
start := time.Now()
......@@ -663,11 +661,9 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr)
failedAttempts++
}
if err := k.collector.Record(peer, metrics.IncSessionConnectionRetry()); err != nil {
k.logger.Debugf("kademlia: unable to record session connection retry metrics for %q: %v", peer, err)
}
k.collector.Record(peer, metrics.IncSessionConnectionRetry())
if err := k.collector.Inspect(peer, func(ss *metrics.Snapshot) {
k.collector.Inspect(peer, func(ss *metrics.Snapshot) {
quickPrune := ss == nil || ss.HasAtMaxOneConnectionAttempt()
if (k.connectedPeers.Length() > 0 && quickPrune) || failedAttempts > maxConnAttempts {
......@@ -680,9 +676,7 @@ func (k *Kad) connect(ctx context.Context, peer swarm.Address, ma ma.Multiaddr)
} else {
k.waitNext.Set(peer, retryTime, failedAttempts)
}
}); err != nil {
k.logger.Debugf("kademlia: connect: unable to inspect snapshot for %q: %v", peer, err)
}
})
return err
case !i.Overlay.Equal(peer):
......@@ -802,12 +796,7 @@ func (k *Kad) connected(ctx context.Context, addr swarm.Address) error {
k.knownPeers.Add(addr, po)
k.connectedPeers.Add(addr, po)
if err := k.collector.Record(
addr,
metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionInbound),
); err != nil {
k.logger.Debugf("kademlia: unable to record login inbound metrics for %q: %v", addr, err)
}
k.collector.Record(addr, metrics.PeerLogIn(time.Now(), metrics.PeerConnectionDirectionInbound))
k.waitNext.Remove(addr)
......@@ -830,12 +819,7 @@ func (k *Kad) Disconnected(peer p2p.Peer) {
k.waitNext.SetTryAfter(peer.Address, time.Now().Add(timeToRetry))
if err := k.collector.Record(
peer.Address,
metrics.PeerLogOut(time.Now()),
); err != nil {
k.logger.Debugf("kademlia: unable to record logout metrics for %q: %v", peer.Address, err)
}
k.collector.Record(peer.Address, metrics.PeerLogOut(time.Now()))
k.depthMu.Lock()
k.depth = recalcDepth(k.connectedPeers, k.radius)
......@@ -1097,10 +1081,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos = append(infos, topology.BinInfo{})
}
ss, err := k.collector.Snapshot(time.Now())
if err != nil {
k.logger.Debugf("kademlia: unable to take metrics snapshot: %v", err)
}
ss := k.collector.Snapshot(time.Now())
_ = k.connectedPeers.EachBin(func(addr swarm.Address, po uint8) (bool, bool, error) {
infos[po].BinConnected++
......@@ -1108,7 +1089,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos[po].ConnectedPeers,
&topology.PeerInfo{
Address: addr,
Metrics: createMetricsSnapshotView(ss[addr.String()]),
Metrics: createMetricsSnapshotView(ss[addr.ByteString()]),
},
)
return false, false, nil
......@@ -1129,7 +1110,7 @@ func (k *Kad) Snapshot() *topology.KadParams {
infos[po].DisconnectedPeers,
&topology.PeerInfo{
Address: addr,
Metrics: createMetricsSnapshotView(ss[addr.String()]),
Metrics: createMetricsSnapshotView(ss[addr.ByteString()]),
},
)
return false, false, nil
......
......@@ -73,7 +73,7 @@ type PeerInfo struct {
// MetricSnapshotView represents snapshot of metrics counters in more human readable form.
type MetricSnapshotView struct {
LastSeenTimestamp int64 `json:"lastSeenTimestamp"`
SessionConnectionRetry uint `json:"sessionConnectionRetry"`
SessionConnectionRetry uint64 `json:"sessionConnectionRetry"`
ConnectionTotalDuration float64 `json:"connectionTotalDuration"`
SessionConnectionDuration float64 `json:"sessionConnectionDuration"`
SessionConnectionDirection string `json:"sessionConnectionDirection"`
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment