Commit 93891de2, authored by angel-ding-cb and committed by GitHub

[Sequencer HA] Add clearer logs for debugging (#10562)

* add clearer logs for debugging

* fix small things based on the comment

* restart cli process

* small fix on the naming convention

* remove .String()
parent 7d92f417
...@@ -291,6 +291,10 @@ func (s *state) Equal(other *state) bool { ...@@ -291,6 +291,10 @@ func (s *state) Equal(other *state) bool {
return s.leader == other.leader && s.healthy == other.healthy && s.active == other.active return s.leader == other.leader && s.healthy == other.healthy && s.active == other.active
} }
// String implements fmt.Stringer for state. It renders the leader, healthy
// and active fields, in that order, on a single line using the %t verb.
func (s *state) String() string {
	const layout = "leader: %t, healthy: %t, active: %t"
	return fmt.Sprintf(layout, s.leader, s.healthy, s.active)
}
var _ cliapp.Lifecycle = (*OpConductor)(nil) var _ cliapp.Lifecycle = (*OpConductor)(nil)
// Start implements cliapp.Lifecycle. // Start implements cliapp.Lifecycle.
...@@ -366,6 +370,7 @@ func (oc *OpConductor) Pause(ctx context.Context) error { ...@@ -366,6 +370,7 @@ func (oc *OpConductor) Pause(ctx context.Context) error {
select { select {
case oc.pauseCh <- struct{}{}: case oc.pauseCh <- struct{}{}:
<-oc.pauseDoneCh <-oc.pauseDoneCh
oc.log.Info("OpConductor has been paused")
return nil return nil
case <-ctx.Done(): case <-ctx.Done():
return ErrPauseTimeout return ErrPauseTimeout
...@@ -382,6 +387,7 @@ func (oc *OpConductor) Resume(ctx context.Context) error { ...@@ -382,6 +387,7 @@ func (oc *OpConductor) Resume(ctx context.Context) error {
select { select {
case oc.resumeCh <- struct{}{}: case oc.resumeCh <- struct{}{}:
<-oc.resumeDoneCh <-oc.resumeDoneCh
oc.log.Info("OpConductor has been resumed")
return nil return nil
case <-ctx.Done(): case <-ctx.Done():
return ErrResumeTimeout return ErrResumeTimeout
...@@ -604,7 +610,7 @@ func (oc *OpConductor) action() { ...@@ -604,7 +610,7 @@ func (oc *OpConductor) action() {
oc.log.Debug("exiting action with status and error", "status", status, "err", err) oc.log.Debug("exiting action with status and error", "status", status, "err", err)
if err != nil { if err != nil {
oc.log.Error("failed to execute step, queueing another one to retry", "err", err) oc.log.Error("failed to execute step, queueing another one to retry", "err", err, "status", status)
// randomly sleep for 0-200ms to avoid excessive retry // randomly sleep for 0-200ms to avoid excessive retry
time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond) time.Sleep(time.Duration(rand.Intn(200)) * time.Millisecond)
oc.queueAction() oc.queueAction()
......
...@@ -156,17 +156,22 @@ func (hm *SequencerHealthMonitor) healthCheck() error { ...@@ -156,17 +156,22 @@ func (hm *SequencerHealthMonitor) healthCheck() error {
"last_seen_unsafe_num", hm.lastSeenUnsafeNum, "last_seen_unsafe_num", hm.lastSeenUnsafeNum,
"last_seen_unsafe_time", hm.lastSeenUnsafeTime, "last_seen_unsafe_time", hm.lastSeenUnsafeTime,
"unsafe_interval", hm.unsafeInterval, "unsafe_interval", hm.unsafeInterval,
"time_diff", timeDiff,
"block_diff", blockDiff,
"expected_blocks", expectedBlocks,
) )
return ErrSequencerNotHealthy return ErrSequencerNotHealthy
} }
if calculateTimeDiff(now, status.UnsafeL2.Time) > hm.unsafeInterval { curUnsafeTimeDiff := calculateTimeDiff(now, status.UnsafeL2.Time)
if curUnsafeTimeDiff > hm.unsafeInterval {
hm.log.Error( hm.log.Error(
"unsafe head is not progressing as expected", "unsafe head is falling behind the unsafe interval",
"now", now, "now", now,
"unsafe_head_num", status.UnsafeL2.Number, "unsafe_head_num", status.UnsafeL2.Number,
"unsafe_head_time", status.UnsafeL2.Time, "unsafe_head_time", status.UnsafeL2.Time,
"unsafe_interval", hm.unsafeInterval, "unsafe_interval", hm.unsafeInterval,
"cur_unsafe_time_diff", curUnsafeTimeDiff,
) )
return ErrSequencerNotHealthy return ErrSequencerNotHealthy
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment