Commit 86a56249 authored by vicotor's avatar vicotor

add dispatch timeout

parent 3ade7f32
......@@ -2,6 +2,7 @@ endpoint="127.0.0.1:10002"
metrics_port = 28012
routines = 1
max_nm_update_ex = 40
dispatch_timeout = 30
[redis]
addr="127.0.0.1:6379"
......
......@@ -30,6 +30,7 @@ type Config struct {
MetricPort int `json:"metrics_port" toml:"metrics_port"`
Routines int `json:"routines" toml:"routines"`
MaxNmUpdateEx int `json:"max_nm_update_ex" toml:"max_nm_update_ex"`
DispatchTimeout int `json:"dispatch_timeout" toml:"dispatch_timeout"`
Redis RedisConfig `json:"redis" toml:"redis"`
Kafka KafkaConfig `json:"kafka" toml:"kafka"`
DbConfig MysqlConfig `json:"mysql" toml:"mysql"`
......
......@@ -169,15 +169,14 @@ func (n *Node) Loop(idx int) {
receipt.TaskProfitAccount = ""
receipt.TaskWorkerAccount = ""
switch err {
case ErrNoWorker:
case ErrNoWorker, ErrTimeout:
receipt.TaskResult = err.Error()
case ErrDispatchFailed:
receipt.TaskResult = err.Error()
default:
receipt.TaskResult = "internal error"
}
utils.FireTaskReceipt(n.kafkaProducer, receipt, config.GetConfig().Kafka.ReceiptTopic)
return nil
return utils.FireTaskReceipt(n.kafkaProducer, receipt, config.GetConfig().Kafka.ReceiptTopic)
}
for {
......@@ -188,13 +187,14 @@ func (n *Node) Loop(idx int) {
return
case t := <-taskCh:
go func(task *odysseus.TaskContent) {
fctx, _ := context.WithTimeout(context.Background(), time.Second*time.Duration(config.GetConfig().DispatchTimeout))
go func(ctx context.Context, task *odysseus.TaskContent) {
l := log.WithField("task-id", task.TaskId)
l.WithField("task", task).Info("get task")
// todo: add parameter for re-dispatch count.
for {
worker, err := PopWorker(n.rdb)
if err == ErrNoWorker {
worker, err := PopWorker(ctx, n.rdb)
if err == ErrNoWorker || err == ErrTimeout {
result := &odysseus.TaskResponse{
TaskId: task.TaskId,
TaskUid: task.TaskUid,
......@@ -203,10 +203,11 @@ func (n *Node) Loop(idx int) {
TaskError: err.Error(),
}
l.WithError(err).Error("pop worker failed")
postReceipt(task, result, err)
err = postResult(task, result)
if err != nil {
l.WithError(err).Error("post task result failed")
if e := postReceipt(task, result, err); e != nil {
l.WithError(e).Error("post task receipt failed")
}
if e := postResult(task, result); e != nil {
l.WithError(e).Error("post task result failed")
}
break
}
......@@ -215,7 +216,7 @@ func (n *Node) Loop(idx int) {
l.WithError(err).Error("pop worker failed")
continue
}
err = n.DispatchTask(n.rdb, worker, task)
err = n.DispatchTask(ctx, worker, task)
if err != nil {
l.WithError(err).Error("dispatch task failed")
continue
......@@ -224,7 +225,7 @@ func (n *Node) Loop(idx int) {
break
}
}
}(t)
}(fctx, t)
}
}
......
......@@ -22,6 +22,7 @@ var (
var (
ErrNoWorker = errors.New("no worker")
ErrTimeout = errors.New("timeout")
ErrDispatchFailed = errors.New("dispatch to nodemanager failed")
)
......@@ -33,9 +34,14 @@ type Worker struct {
managers []string
}
func PopWorker(rdb *redis.Client) (Worker, error) {
func PopWorker(ctx context.Context, rdb *redis.Client) (Worker, error) {
for i := 0; i < maxPriority; i++ {
for {
if ctx.Err() != nil {
return Worker{}, ErrTimeout
}
elem, err := rdb.LPop(context.Background(), config.WORKER_QUEUE_PREFIX+strconv.Itoa(i)).Result()
if err != nil {
log.WithError(err).Error("lPop worker failed")
......@@ -67,6 +73,9 @@ func PopWorker(rdb *redis.Client) (Worker, error) {
func workerStatusKey(wid string) string {
return fmt.Sprintf("%s_%s", config.WORKER_STATUS_PREFIX, wid)
}
func workerId(w Worker) string {
return fmt.Sprintf("%s_%d", w.addr, w.nonce)
}
func newManagerClient(endpoint string) (omanager.NodeManagerServiceClient, error) {
client, err := grpc.Dial(endpoint,
......@@ -105,12 +114,21 @@ func parseWorkerNmValue(nmValue string) (string, int64) {
return "", 0
}
func (n *Node) DispatchTask(rdb *redis.Client, worker Worker, task *odysseus.TaskContent) error {
func (n *Node) DispatchTask(ctx context.Context, worker Worker, task *odysseus.TaskContent) error {
l := log.WithField("task-id", task.TaskId)
l.WithFields(log.Fields{
"worker": worker.workerid,
"managerList": worker.managers,
}).Debug("dispatch task to worker")
var shouldAddBack = false
defer func(w Worker) {
if shouldAddBack {
// add worker back to redis queue.
n.rdb.LPush(context.Background(), config.WORKER_QUEUE_PREFIX+strconv.Itoa(w.priority), workerId(w))
l.WithField("worker", worker.workerid).Debug("add worker back to queue")
}
}(worker)
for _, manager := range worker.managers {
endpoint, updateTime := parseWorkerNmValue(manager)
if time.Now().Unix()-updateTime > int64(config.GetConfig().MaxNmUpdateEx) {
......@@ -126,7 +144,7 @@ func (n *Node) DispatchTask(rdb *redis.Client, worker Worker, task *odysseus.Tas
}).Error("connect to manager failed")
continue
}
_, err = client.DispatchTask(context.Background(), &omanager.DispatchTaskRequest{
_, err = client.DispatchTask(ctx, &omanager.DispatchTaskRequest{
Miner: worker.workerid,
TaskData: task,
})
......@@ -135,7 +153,10 @@ func (n *Node) DispatchTask(rdb *redis.Client, worker Worker, task *odysseus.Tas
"manager": endpoint,
"error": err,
}).Error("dispatch to manager failed")
if strings.HasSuffix(err.Error(), "deadline exceeded") {
shouldAddBack = true
return ErrTimeout
}
continue
}
return nil
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment