Commit 93891de2 authored by angel-ding-cb, committed by GitHub

[Sequencer HA] Add clearer logs for debugging (#10562)

* add clearer logs for debugging

* fix small things based on the comment

* restart cli process

* small fix on the naming convention

* remove .String()
parent 7d92f417
......@@ -291,6 +291,10 @@ func (s *state) Equal(other *state) bool {
return s.leader == other.leader && s.healthy == other.healthy && s.active == other.active
}
// String returns a one-line textual rendering of the state, listing its
// leader, healthy and active fields in that order, each formatted with %t.
func (s *state) String() string {
	const layout = "leader: %t, healthy: %t, active: %t"
	isLeader, isHealthy, isActive := s.leader, s.healthy, s.active
	return fmt.Sprintf(layout, isLeader, isHealthy, isActive)
}
var _ cliapp.Lifecycle = (*OpConductor)(nil)
// Start implements cliapp.Lifecycle.
......@@ -366,6 +370,7 @@ func (oc *OpConductor) Pause(ctx context.Context) error {
select {
case oc.pauseCh <- struct{}{}:
<-oc.pauseDoneCh
oc.log.Info("OpConductor has been paused")
return nil
case <-ctx.Done():
return ErrPauseTimeout
......@@ -382,6 +387,7 @@ func (oc *OpConductor) Resume(ctx context.Context) error {
select {
case oc.resumeCh <- struct{}{}:
<-oc.resumeDoneCh
oc.log.Info("OpConductor has been resumed")
return nil
case <-ctx.Done():
return ErrResumeTimeout
......@@ -604,7 +610,7 @@ func (oc *OpConductor) action() {
oc.log.Debug("exiting action with status and error", "status", status, "err", err)
if err != nil {
oc.log.Error("failed to execute step, queueing another one to retry", "err", err)
oc.log.Error("failed to execute step, queueing another one to retry", "err", err, "status", status)
// randomly sleep for 0-200ms to avoid excessive retry
time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond)
oc.queueAction()
......
......@@ -156,17 +156,22 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
"last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval,
"time_diff", timeDiff,
"block_diff", blockDiff,
"expected_blocks", expectedBlocks,
)
return ErrSequencerNotHealthy
}
if calculateTimeDiff(now, status.UnsafeL2.Time) > hm.unsafeInterval {
curUnsafeTimeDiff := calculateTimeDiff(now, status.UnsafeL2.Time)
if curUnsafeTimeDiff > hm.unsafeInterval {
hm.log.Error(
"unsafe head is not progressing as expected",
"unsafe head is falling behind the unsafe interval",
"now", now,
"unsafe_head_num", status.UnsafeL2.Number,
"unsafe_head_time", status.UnsafeL2.Time,
"unsafe_interval", hm.unsafeInterval,
"cur_unsafe_time_diff", curUnsafeTimeDiff,
)
return ErrSequencerNotHealthy
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment