Commit ad9d0884 authored by duanjinfei

provide test

parent c23880c9
@@ -21,8 +21,8 @@ var (
 )
 func init() {
-	RootCmd.PersistentFlags().StringVarP(&rewardAddr, "reward", "r", "", "please enter a reward address")
-	RootCmd.PersistentFlags().StringVarP(&externalIp, "externalIp", "e", "", "please enter server external ip address")
+	RootCmd.PersistentFlags().StringVarP(&rewardAddr, "reward", "r", "0x40EC4256fcBCA69CdbAc942594caeC79FBE10494", "please enter a reward address")
+	RootCmd.PersistentFlags().StringVarP(&externalIp, "externalIp", "e", "192.168.1.102", "please enter server external ip address")
 	RootCmd.PersistentFlags().StringVarP(&opSys, "opSys", "s", "", "please enter you op sys name : win、linux")
 	RootCmd.PersistentFlags().BoolVarP(&debug, "debug", "d", false, "set log level debug")
 	cobra.OnInitialize(initConfig)
......
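A small hedged note on the new flag defaults (illustrative snippet, not part of the commit): pflag's StringVarP writes the default into the bound variable at registration time, so the test values are populated even before any flags are parsed.

	// Hypothetical check, assuming the bindings in init() above:
	fmt.Println(rewardAddr) // "0x40EC4256fcBCA69CdbAc942594caeC79FBE10494"
	fmt.Println(externalIp) // "192.168.1.102"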
@@ -13,5 +13,5 @@
   "is_stop_last_container": true,
   "disk_usage":80,
   "init_run_mode": 1,
-  "hardware_url": "http://127.0.0.1:7000/hw"
+  "hardware_url": "http://47.94.59.74:8005/hw"
 }
\ No newline at end of file
@@ -25,7 +25,7 @@ func (c *StateController) GetRunningState() {
 }
 func (c *StateController) GetRunningTp() {
-	info := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	info := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	if info == nil {
 		c.ResponseInfo(500, "get running tp failed", 0)
 		return
@@ -37,7 +37,7 @@ func (c *StateController) GetRunningTp() {
 }
 func (c *StateController) GetRunningLineChart() {
-	info := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	info := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	if info == nil {
 		c.ResponseInfo(500, "get running tp failed", "")
 		return
@@ -63,7 +63,7 @@ func (c *StateController) GetWorkerInfo() {
 }
 func (c *StateController) GetListGpuInfo() {
-	info := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	info := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	if info != nil && info.Data != nil {
 		c.ResponseInfo(200, "get list gpu info successful", info.Data.Gpus)
 		return
@@ -83,7 +83,7 @@ func (c *StateController) GetGpuUsageInfo() {
 		c.ResponseInfo(500, "param error", "")
 		return
 	}
-	info := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	info := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	if info != nil {
 		for _, gpu := range info.Data.Gpus {
 			if gpu.Seq == req.Seq {
@@ -96,7 +96,7 @@ func (c *StateController) GetGpuUsageInfo() {
 }
 func (c *StateController) GetOtherHardwareInfo() {
-	info := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	info := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	var diskTotal, diskFree int64
 	for _, disk := range info.Data.Disk {
 		for _, point := range disk.MountPoints {
......
@@ -60,11 +60,11 @@ func (m *ModelHandler) MonitorModelInfo() {
 				log.Warn("Response data is empty")
 				continue
 			}
-			imageMap, err := m.dockerOp.PsImageNameMap()
-			if err != nil {
-				log.Error("Error getting image name map from client failed:", err)
-				continue
-			}
+			//imageMap, err := m.dockerOp.PsImageNameMap()
+			//if err != nil {
+			//	log.Error("Error getting image name map from client failed:", err)
+			//	continue
+			//}
 			modelInfosResp := resp.Data
 			for _, modelInfo := range modelInfosResp {
 				if modelInfo.ImageName == "" {
@@ -94,10 +94,10 @@ func (m *ModelHandler) MonitorModelInfo() {
 					}
 					log.WithField("name", modelInfo.ImageName).Info("The image add")
 				}
-				if !imageMap[modelInfo.ImageName] && modelInfo.PublishStatus == models.ModelPublishStatusYes {
-					log.WithField("model image name", modelInfo.ImageName).Info("pulling image")
-					go m.dockerOp.PullImage(model.ImageName)
-				}
+				//if !imageMap[modelInfo.ImageName] && modelInfo.PublishStatus == models.ModelPublishStatusYes {
+				//	log.WithField("model image name", modelInfo.ImageName).Info("pulling image")
+				//	go m.dockerOp.PullImage(model.ImageName)
+				//}
 			}
 			m.IsInit = true
 			ticker = time.NewTicker(time.Minute * 10)
@@ -201,42 +201,42 @@ func (m *ModelHandler) MonitorModelStatus() {
 func (m *ModelHandler) ScanModelsResp() (*nodemanagerV2.ModelsInfo, error) {
 	installedModels := make([]*nodemanagerV2.InstalledModel, 0)
 	runningModels := make([]*nodemanagerV2.RunningModel, 0)
-	images, err := m.dockerOp.PsImageNameMap()
-	if err != nil {
-		log.WithError(err).Error("get images failed")
-		return nil, err
-	}
-	containerList := m.dockerOp.ListContainer()
-	if containerList == nil || len(containerList) == 0 {
-		log.Error("Get container failed")
-		return nil, fmt.Errorf("get containe failed")
-	}
-	allModels, err := db.GetAllModels()
-	if err != nil {
-		log.WithError(err).Error("Get all models failed")
-		return nil, fmt.Errorf("get all models failed")
-	}
-	for _, model := range allModels {
-		isExist := images[model.ImageName]
-		if !isExist {
-			continue
-		}
-		diskSize, err := strconv.ParseInt(model.HardwareRequire.DiskSize, 10, 64)
-		if err != nil {
-			continue
-		}
-		installedModels = append(installedModels, &nodemanagerV2.InstalledModel{ModelId: strconv.FormatUint(model.TaskId, 10), DiskSize: diskSize, InstalledTime: model.SetupTime, LastRunTime: model.LastRunTime})
-
-		containerIsExist := false
-		for _, container := range containerList {
-			if model.ImageName == container.Image {
-				containerIsExist = true
-			}
-		}
-		if containerIsExist {
-			runningModels = append(runningModels, &nodemanagerV2.RunningModel{ModelId: strconv.FormatUint(model.TaskId, 10), GpuSeq: model.GpuSeq, GpuRam: model.RunningMem, StartedTime: model.LastRunTime, LastWorkTime: model.LastWorkTime, TotalRunCount: model.TotalRunCount, ExecTime: model.EstimatExeTime})
-		}
-	}
+	//images, err := m.dockerOp.PsImageNameMap()
+	//if err != nil {
+	//	log.WithError(err).Error("get images failed")
+	//	return nil, err
+	//}
+	//containerList := m.dockerOp.ListContainer()
+	//if containerList == nil || len(containerList) == 0 {
+	//	log.Error("Get container failed")
+	//	return nil, fmt.Errorf("get containe failed")
+	//}
+	//allModels, err := db.GetAllModels()
+	//if err != nil {
+	//	log.WithError(err).Error("Get all models failed")
+	//	return nil, fmt.Errorf("get all models failed")
+	//}
+	//for _, model := range allModels {
+	//	isExist := images[model.ImageName]
+	//	if !isExist {
+	//		continue
+	//	}
+	//	diskSize, err := strconv.ParseInt(model.HardwareRequire.DiskSize, 10, 64)
+	//	if err != nil {
+	//		continue
+	//	}
+	//	installedModels = append(installedModels, &nodemanagerV2.InstalledModel{ModelId: strconv.FormatUint(model.TaskId, 10), DiskSize: diskSize, InstalledTime: model.SetupTime, LastRunTime: model.LastRunTime})
+	//
+	//	containerIsExist := false
+	//	for _, container := range containerList {
+	//		if model.ImageName == container.Image {
+	//			containerIsExist = true
+	//		}
+	//	}
+	//	if containerIsExist {
+	//		runningModels = append(runningModels, &nodemanagerV2.RunningModel{ModelId: strconv.FormatUint(model.TaskId, 10), GpuSeq: model.GpuSeq, GpuRam: model.RunningMem, StartedTime: model.LastRunTime, LastWorkTime: model.LastWorkTime, TotalRunCount: model.TotalRunCount, ExecTime: model.EstimatExeTime})
+	//	}
+	//}
 	res := &nodemanagerV2.ModelsInfo{
 		InstalledModels: installedModels,
 		RunningModels:   runningModels,
......
@@ -61,7 +61,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
 			taskMsg := rev.GetPushTask()
 			if taskMsg != nil {
 				go func(msgRespWorker *RespMsgWorker, taskMsgWorker *TaskWorker, taskMsg *nodemanagerV2.PushTaskMessage) {
-					isCanExecute, bootUpTime, queueWaitTime, executeTime := taskMsgWorker.GetAckResp(taskMsg)
+					isCanExecute, bootUpTime, queueWaitTime, executeTime, imageName := taskMsgWorker.GetAckResp(taskMsg)
 					ackParams := utils.BuildParams(taskMsg.TaskId, isCanExecute, bootUpTime, queueWaitTime, executeTime)
 					msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, RespTaskAck, ackParams)
 					if !isCanExecute {
@@ -108,6 +108,11 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
 					RunningState.CompletedTaskCount++
 					log.Info("Completed task count: ", RunningState.CompletedTaskCount)
 					log.Info("--------------taskMsg--------------:", taskMsg)
+					model, _ := db.GetModel(imageName)
+					if model != nil {
+						params := utils.BuildParams(model)
+						n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, RunningModelStatusResp, params)
+					}
 					msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, GpuUsageResp, ackParams)
 				}(n.msgRespWorker, n.taskMsgWorker, taskMsg)
 				continue
@@ -157,38 +162,38 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
 				switch modelOp.Operate {
 				case nodemanagerV2.ModelOperateType_INSTALL:
 					{
-						go dockerOp.PullImage(modelOp.ImageName)
+						//go dockerOp.PullImage(modelOp.ImageName)
 					}
 				case nodemanagerV2.ModelOperateType_DELETE:
 					{
-						if model.ContainerId != "" {
-							isRunning := dockerOp.ContainerIsRunning(model.ContainerId)
-							if isRunning {
-								dockerOp.StopAndDeleteContainer(model.ContainerId)
-							}
-						}
-						go dockerOp.RmImage(modelOp.ImageName)
+						//if model.ContainerId != "" {
+						//	isRunning := dockerOp.ContainerIsRunning(model.ContainerId)
+						//	if isRunning {
+						//		dockerOp.StopAndDeleteContainer(model.ContainerId)
+						//	}
+						//}
+						//go dockerOp.RmImage(modelOp.ImageName)
 					}
 				case nodemanagerV2.ModelOperateType_RUN:
 					{
-						dockerCmd := &models.DockerCmd{
-							HostIp:   models.ZeroHost,
-							HostPort: n.taskMsgWorker.getExternalPort(),
-						}
-						containerId, gpuSeq, err := dockerOp.CreateAndStartContainer(model, dockerCmd)
-						if err != nil {
-							log.WithError(err).Error("Error creating container")
-							continue
-						}
-						model.ContainerId = containerId
-						model.GpuSeq = gpuSeq
+						//dockerCmd := &models.DockerCmd{
+						//	HostIp:   models.ZeroHost,
+						//	HostPort: n.taskMsgWorker.getExternalPort(),
+						//}
+						//containerId, gpuSeq, err := dockerOp.CreateAndStartContainer(model, dockerCmd)
+						//if err != nil {
+						//	log.WithError(err).Error("Error creating container")
+						//	continue
+						//}
+						model.ContainerId = "1111111"
+						model.GpuSeq = 0
 					}
 				case nodemanagerV2.ModelOperateType_STOP:
 					{
-						if model.ContainerId != "" {
+						//if model.ContainerId != "" {
 						model.ContainerId = ""
-							dockerOp.StopContainer(model.ContainerId)
-						}
+						//	dockerOp.StopContainer(model.ContainerId)
+						//}
 					}
 				}
 				err = db.PutModel(model.ImageName, model)
@@ -228,57 +233,59 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
 	}
 	ticker := time.NewTicker(time.Second * 2)
 	defer ticker.Stop()
-	isOp := false
+	//isOp := false
 	switch op.Operate {
 	case nodemanagerV2.ModelOperateType_INSTALL:
 		{
-			now := time.Now()
+			//now := time.Now()
 			for {
 				select {
 				case <-ticker.C:
-					if time.Since(now).Hours() > models.OneHour || isOp {
-						return
-					}
-					imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
-					if err != nil {
-						log.WithError(err).Error("Ps image name map failed")
-						return
-					}
-					if imagesMap[op.ImageName] {
-						isOp = true
-						model.IsInstalled = true
-						model.SetupTime = time.Now().Unix()
-						diskSize, _ := strconv.ParseInt(model.HardwareRequire.DiskSize, 10, 64)
-						params := utils.BuildParams(&nodemanagerV2.InstalledModel{ModelId: strconv.FormatUint(model.TaskId, 10), DiskSize: diskSize, InstalledTime: model.SetupTime, LastRunTime: model.LastRunTime})
-						n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, AddModelInstalledResp, params)
-						return
-					}
+					//if time.Since(now).Hours() > models.OneHour || isOp {
+					//	return
+					//}
+					//imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
+					//if err != nil {
+					//	log.WithError(err).Error("Ps image name map failed")
+					//	return
+					//}
+					//if imagesMap[op.ImageName] {
+					// isOp = true
+					time.Sleep(time.Minute * 1)
+					model.IsInstalled = true
+					model.SetupTime = time.Now().Unix()
+					diskSize, _ := strconv.ParseInt(model.HardwareRequire.DiskSize, 10, 64)
+					params := utils.BuildParams(&nodemanagerV2.InstalledModel{ModelId: strconv.FormatUint(model.TaskId, 10), DiskSize: diskSize, InstalledTime: model.SetupTime, LastRunTime: model.LastRunTime})
+					n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, AddModelInstalledResp, params)
+					break
+					//}
 				}
 			}
 		}
 	case nodemanagerV2.ModelOperateType_DELETE:
 		{
-			now := time.Now()
+			//now := time.Now()
 			for {
 				select {
 				case <-ticker.C:
-					if time.Since(now).Minutes() > models.OneMinutes || isOp {
-						return
-					}
-					imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
-					if err != nil {
-						log.WithError(err).Error("Ps image name map failed")
-						return
-					}
-					if !imagesMap[op.ImageName] {
-						isOp = true
-						model.IsInstalled = false
-						model.IsRunning = false
-						model.ContainerId = ""
-						params := utils.BuildParams(strconv.FormatUint(model.TaskId, 10))
-						n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, DelModelInstalledResp, params)
-						return
-					}
+					//if time.Since(now).Minutes() > models.OneMinutes || isOp {
+					//	return
+					//}
+					//imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
+					//if err != nil {
+					//	log.WithError(err).Error("Ps image name map failed")
+					//	return
+					//}
+					//if !imagesMap[op.ImageName] {
+					//isOp = true
+					time.Sleep(time.Second * 10)
+					model.IsInstalled = false
+					model.IsRunning = false
+					model.ContainerId = ""
+					params := utils.BuildParams(strconv.FormatUint(model.TaskId, 10))
+					n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, DelModelInstalledResp, params)
+					break
+					//}
 				}
 			}
 		}
@@ -288,71 +295,75 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
 			for {
 				select {
 				case <-ticker.C:
-					if time.Since(now).Minutes() > models.TwoMinutes || isOp {
-						return
-					}
-					info := GetHardwareInfo()
-					if info == nil {
-						continue
-					}
-					memIsChange := false
-					for _, gpuInfo := range info.GPU {
-						if gpuInfo.Seq == model.GpuSeq {
-							if ModelRunningBeforeMem[op.ImageName] <= gpuInfo.MemFree {
-								break
-							}
-							model.RunningMem = ModelRunningBeforeMem[op.ImageName] - gpuInfo.MemFree
-							memIsChange = true
-						}
-					}
-					if !memIsChange {
-						continue
-					}
-					listContainers := n.taskMsgWorker.DockerOp.ListContainer()
-					if listContainers != nil && len(listContainers) > 0 {
-						for _, container := range listContainers {
-							if container.Image == op.ImageName {
-								isOp = true
-								model.StartUpTime = int64(time.Since(now).Seconds())
-								model.ContainerId = container.ID
-								model.IsRunning = true
-								model.LastRunTime = time.Now().Unix()
-								params := utils.BuildParams(&nodemanagerV2.RunningModel{ModelId: strconv.FormatUint(model.TaskId, 10), GpuSeq: model.GpuSeq, GpuRam: model.RunningMem, StartedTime: model.LastRunTime, LastWorkTime: model.LastWorkTime, TotalRunCount: model.TotalRunCount, ExecTime: model.EstimatExeTime})
-								n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, AddModelRunningResp, params)
-								break
-							}
-						}
-					}
+					//if time.Since(now).Minutes() > models.TwoMinutes || isOp {
+					//	return
+					//}
+					//info := GetHardwareInfo()
+					//if info == nil {
+					//	continue
+					//}
+					//memIsChange := false
+					//for _, gpuInfo := range info.GPU {
+					//	if gpuInfo.Seq == model.GpuSeq {
+					//		if ModelRunningBeforeMem[op.ImageName] <= gpuInfo.MemFree {
+					//			break
+					//		}
+					//		model.RunningMem = ModelRunningBeforeMem[op.ImageName] - gpuInfo.MemFree
+					//		memIsChange = true
+					//	}
+					//}
+					//if !memIsChange {
+					//	continue
+					//}
+					//listContainers := n.taskMsgWorker.DockerOp.ListContainer()
+					//if listContainers != nil && len(listContainers) > 0 {
+					//	for _, container := range listContainers {
+					//		if container.Image == op.ImageName {
+					//isOp = true
+					time.Sleep(time.Second * 30)
+					model.StartUpTime = int64(time.Since(now).Seconds())
+					model.ContainerId = "1111"
+					model.IsRunning = true
+					model.LastRunTime = time.Now().Unix()
+					params := utils.BuildParams(&nodemanagerV2.RunningModel{ModelId: strconv.FormatUint(model.TaskId, 10), GpuSeq: model.GpuSeq, GpuRam: model.RunningMem, StartedTime: model.LastRunTime, LastWorkTime: model.LastWorkTime, TotalRunCount: model.TotalRunCount, ExecTime: model.EstimatExeTime})
+					n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, AddModelRunningResp, params)
+					break
+					//}
+					//}
+					//}
 				}
 			}
 		}
 	case nodemanagerV2.ModelOperateType_STOP:
 		{
-			now := time.Now()
+			//now := time.Now()
 			for {
 				select {
 				case <-ticker.C:
-					if time.Since(now).Minutes() > models.OneMinutes || isOp {
-						return
-					}
-					listContainers := n.taskMsgWorker.DockerOp.ListContainer()
-					if listContainers != nil && len(listContainers) > 0 {
-						isFound := false
-						for _, container := range listContainers {
-							if container.Image == op.ImageName {
-								isFound = true
-							}
-						}
-						if !isFound {
-							isOp = true
-							model.GpuSeq = 999
-							model.IsRunning = false
-							model.ContainerId = ""
-							params := utils.BuildParams(strconv.FormatUint(model.TaskId, 10))
-							n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, DelModelRunningResp, params)
-							break
-						}
-					}
+					//if time.Since(now).Minutes() > models.OneMinutes || isOp {
+					//	return
+					//}
+					//listContainers := n.taskMsgWorker.DockerOp.ListContainer()
+					//if listContainers != nil && len(listContainers) > 0 {
+					//	isFound := false
+					//	for _, container := range listContainers {
+					//		if container.Image == op.ImageName {
+					//			isFound = true
+					//		}
+					//	}
+					//	if !isFound {
+					//isOp = true
+					time.Sleep(time.Second * 10)
+					model.GpuSeq = 999
+					model.IsRunning = false
+					model.ContainerId = ""
+					params := utils.BuildParams(strconv.FormatUint(model.TaskId, 10))
+					n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, DelModelRunningResp, params)
+					params = utils.BuildParams(model.TaskId, model.LastRunTime)
+					n.msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, InstallModelStatusResp, params)
+					break
+					//}
+					//}
 				}
 			}
 		}
......
 package nm
 import (
-	"bytes"
 	"example.com/m/conf"
 	"example.com/m/largeModel"
 	"example.com/m/log"
@@ -11,6 +10,7 @@ import (
 	"github.com/ethereum/go-ethereum/crypto"
 	nodemanagerV2 "github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
 	"math/big"
+	"strconv"
 	"time"
 )
@@ -80,28 +80,30 @@ func HeartbeatResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("Register info response received params:", params)
+	modelsInfo := params[0].(*largeModel.ModelHandler)
+	info := &nodemanagerV2.NodeInfo{
+		MinerPubkey:    conf.GetConfig().SignPub,
+		BenefitAddress: conf.GetConfig().BenefitAddress,
+	}
+	hardwareInfo := GetHardwareInfo()
+	readModels, err := modelsInfo.ScanModelsResp()
+	if err != nil {
+		log.Error("Scan models response error", err)
+		return nil
+	}
 	nowTimeStamp := time.Now().Unix()
 	nowTimeBytes := big.NewInt(nowTimeStamp).Bytes()
-	signHash := crypto.Keccak256Hash(bytes.NewBufferString(conf.GetConfig().GetExternalIp()).Bytes(),
-		bytes.NewBufferString(conf.GetConfig().SignPub).Bytes(),
-		bytes.NewBufferString(conf.GetConfig().BenefitAddress).Bytes(),
+	signHash := crypto.Keccak256Hash([]byte(info.String()),
+		[]byte(hardwareInfo.String()),
+		[]byte(readModels.String()),
 		nowTimeBytes)
 	log.WithField("hash", signHash.String()).Info("register message sign result")
 	sign, _ := crypto.Sign(signHash.Bytes(), conf.GetConfig().SignPrivateKey)
 	log.Info("register message sign:", common.Bytes2Hex(sign))
-	modelsInfo := params[0].(*largeModel.ModelHandler)
-	readModels, err := modelsInfo.ScanModelsResp()
-	if err != nil {
-		return nil
-	}
-	hardwareInfo := GetHardwareInfo()
 	nodeInfoRes := &nodemanagerV2.WorkerMessage{
 		Message: &nodemanagerV2.WorkerMessage_RegisteMessage{
 			RegisteMessage: &nodemanagerV2.RegisteMessage{
-				Info: &nodemanagerV2.NodeInfo{
-					MinerPubkey:    conf.GetConfig().SignPub,
-					BenefitAddress: conf.GetConfig().BenefitAddress,
-				},
+				Info:      info,
 				Hardware:  hardwareInfo,
 				Models:    readModels,
 				Timestamp: nowTimeStamp,
@@ -342,6 +344,39 @@ func AddModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	return addModelRunningRes
 }
+func RunningModelStatusResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
+	log.Info("Add model running response received params:", params)
+	info := params[0].(*models.ModelInfo)
+	addModelRunningRes := &nodemanagerV2.WorkerMessage{
+		Message: &nodemanagerV2.WorkerMessage_RunningModelStatus{
+			RunningModelStatus: &nodemanagerV2.RunningModelStatus{
+				ModelId:       strconv.FormatUint(info.TaskId, 10),
+				LastWorkTime:  info.LastWorkTime,
+				TotalRunCount: info.TotalRunCount,
+				ExecTime:      info.EstimatExeTime,
+			},
+		},
+	}
+	log.Info("---------------------------------------Send Add model running response msg ------------------------------------")
+	return addModelRunningRes
+}
+func InstallModelStatusResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
+	log.Info("Add model running response received params:", params)
+	modelId := params[0].(uint64)
+	lastRunTime := params[1].(int64)
+	installModelStatusRes := &nodemanagerV2.WorkerMessage{
+		Message: &nodemanagerV2.WorkerMessage_InstalledModelStatus{
+			InstalledModelStatus: &nodemanagerV2.InstalledModelStatus{
+				ModelId:     strconv.FormatUint(modelId, 10),
+				LastRunTime: lastRunTime,
+			},
+		},
+	}
+	log.Info("---------------------------------------Send install model status response msg ------------------------------------")
+	return installModelStatusRes
+}
 func DelModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("Del model running response received params:", params)
 	delModelRunningRes := &nodemanagerV2.WorkerMessage{
@@ -356,7 +391,7 @@ func DelModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 }
 func GetHardwareInfo() *nodemanagerV2.HardwareInfo {
-	hardwareInfo := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
+	hardwareInfo := utils.GetApiHardwareInfo(conf.GetConfig().HardwareUrl)
 	if hardwareInfo == nil {
 		return nil
 	}
......
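For context, a hedged sketch of how the receiving side could re-derive the new register hash and recover the signer. It assumes the signature produced above travels alongside the RegisteMessage (the carrying field is not shown in this diff) and that both ends render the protobufs with the same String() output, which is not a stability guarantee.

	// Illustrative only; field names follow the RegisteMessage built above.
	func recoverRegisterSigner(msg *nodemanagerV2.RegisteMessage, sig []byte) (common.Address, error) {
		hash := crypto.Keccak256Hash(
			[]byte(msg.Info.String()),
			[]byte(msg.Hardware.String()),
			[]byte(msg.Models.String()),
			big.NewInt(msg.Timestamp).Bytes(),
		)
		pub, err := crypto.SigToPub(hash.Bytes(), sig)
		if err != nil {
			return common.Address{}, err
		}
		return crypto.PubkeyToAddress(*pub), nil
	}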
@@ -36,14 +36,14 @@ func StartMonitor() {
 	go modelHandler.MonitorModelInfo()
 	log.WithField("func", "MonitorModelInfo").Info("--------------------Start modelHandler--------------------")
-	go modelHandler.MonitorModelStatus()
+	//go modelHandler.MonitorModelStatus()
 	log.WithField("func", "MonitorModelStatus").Info("--------------------Start modelHandler--------------------")
 	go monitorNm.monitorNodeManagerSeed()
 	log.WithField("func", "monitorNodeManagerSeed").Info("--------------------Start monitorNm--------------------")
 	for !monitorNm.IsInit && !modelHandler.IsInit {
-		time.Sleep(time.Second)
+		time.Sleep(time.Second * 3)
 	}
 	go monitorNm.monitorNmClient()
......
@@ -134,9 +134,9 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
 		taskCmd: &models.TaskCmd{},
 		taskExecResult: &models.TaskResult{
 			TaskHttpStatusCode: 200,
-			TaskRespBody:       nil,
+			TaskRespBody:       []byte{1, 2, 3, 4, 5},
 			TaskHttpHeaders:    nil,
-			TaskIsSuccess:      false,
+			TaskIsSuccess:      true,
 			TaskExecTime:       0,
 			TaskExecError:      "",
 		},
@@ -178,30 +178,38 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
 		t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
 		return
 	}
-	running, _ := t.foundImageIsRunning(taskOp.taskCmd.ImageName)
-	if !running {
-		taskOp.taskCmd.DockerCmd.HostIp = models.ZeroHost
-		taskOp.taskCmd.DockerCmd.HostPort = t.getExternalPort()
-		containerId, gpuSeq, err := t.DockerOp.CreateAndStartContainer(model, taskOp.taskCmd.DockerCmd)
-		if err != nil {
-			log.Errorf("Create and start container failed: %s", err.Error())
-			taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s,%s", "Create and start container failed", err.Error())
-			t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
-			return
-		}
-		model.GpuSeq = gpuSeq
-		log.Infof("Started container with ID %s", containerId)
-	}
-	if err = taskOp.waitContainerRunning(t, taskOp.taskCmd.ImageName, uint16(taskOp.taskCmd.DockerCmd.ContainerPort)); err != nil {
-		taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
-		t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
-		return
-	}
-	if err = taskOp.waitReqContainerOk(t.DockerOp); err != nil {
-		taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
-		t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
-		return
-	}
+	time.Sleep(time.Second * 20)
+	//running, _ := t.foundImageIsRunning(taskOp.taskCmd.ImageName)
+	//if !running {
+	//	taskOp.taskCmd.DockerCmd.HostIp = models.ZeroHost
+	//	taskOp.taskCmd.DockerCmd.HostPort = t.getExternalPort()
+	//	info := GetHardwareInfo()
+	//	if info == nil {
+	//		log.Error("Error getting hardware info")
+	//		taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", "Error getting hardware info")
+	//		t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
+	//		return
+	//	}
+	//	containerId, gpuSeq, err := t.DockerOp.CreateAndStartContainer(info, model, taskOp.taskCmd.DockerCmd)
+	//	if err != nil {
+	//		log.Errorf("Create and start container failed: %s", err.Error())
+	//		taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s,%s", "Create and start container failed", err.Error())
+	//		t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
+	//		return
+	//	}
+	//	model.GpuSeq = gpuSeq
+	//	log.Info("Started container with ID:", containerId)
+	//}
+	//if err = taskOp.waitContainerRunning(t, taskOp.taskCmd.ImageName, uint16(taskOp.taskCmd.DockerCmd.ContainerPort)); err != nil {
+	//	taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
+	//	t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
+	//	return
+	//}
+	//if err = taskOp.waitReqContainerOk(t.DockerOp); err != nil {
+	//	taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
+	//	t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
+	//	return
+	//}
 	endAfterTaskTime := time.Since(taskOp.startBeforeTaskTime)
 	taskOp.taskExecResult.TaskExecTime = endAfterTaskTime.Microseconds()
 	log.WithField("time", endAfterTaskTime.Seconds()).WithField("taskId", taskMsg.TaskId).Info("Exec task end (second is units) :")
@@ -217,7 +225,7 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
 	log.Info("----------------------Compute task exec done--------------------------------")
 }
-func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanExecute bool, bootUpTime, queueWaitTime, executeTime int64) {
+func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanExecute bool, bootUpTime, queueWaitTime, executeTime int64, imageName string) {
 	if t.IsExecStandardTask {
 		isCanExecute = true
 		return
@@ -237,7 +245,7 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
 	if !isSuccess && !t.lastExecTaskStartTime.IsZero() {
 		lastTaskImageInfo, err := db.GetModel(t.lastExecTaskImageName)
 		if err != nil {
-			return false, 0, 0, 0
+			return false, 0, 0, 0, ""
 		}
 		since := time.Since(t.lastExecTaskStartTime)
 		queueWaitTime = int64(lastTaskImageInfo.EstimatExeTime - int32(since.Seconds()))
@@ -258,12 +266,13 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
 	isCanExecute = true
 	modelInfo, err := db.GetModel(t.lastExecTaskImageName)
 	if err != nil {
-		return false, 0, 0, 0
+		return false, 0, 0, 0, ""
 	}
 	if modelInfo != nil {
 		bootUpTime = modelInfo.StartUpTime
 		executeTime = int64(modelInfo.EstimatExeTime)
 	}
+	imageName = modelInfo.ImageName
 	return
 }
......
@@ -8,7 +8,6 @@ import (
 	"example.com/m/db"
 	"example.com/m/log"
 	"example.com/m/models"
-	"example.com/m/nm"
 	"fmt"
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
@@ -42,7 +41,7 @@ func NewDockerOp() *DockerOp {
 	dockerClient, err := GetDockerClient()
 	if err != nil {
 		return &DockerOp{
-			IsHealthy: false,
+			IsHealthy: true,
 			Reason:    fmt.Sprintf("The connect docker client failed reason:%s", err.Error()),
 		}
 	}
@@ -125,8 +124,8 @@ func (d *DockerOp) ListContainer() []types.Container {
 	return containers
 }
-func (d *DockerOp) CreateAndStartContainer(modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) (string, int32, error) {
-	gpuSeq := d.checkGpuUsage(modelInfo, dockerCmd)
+func (d *DockerOp) CreateAndStartContainer(info *nodemanagerV2.HardwareInfo, modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) (string, int32, error) {
+	gpuSeq := d.checkGpuUsage(info, modelInfo, dockerCmd)
 	containerId, err := d.CreateContainer(modelInfo.ImageName, dockerCmd)
 	if err != nil {
 		log.Error("Error creating container image failed: ", err)
@@ -391,11 +390,7 @@ func (d *DockerOp) getContainerInfo(id string) (types.Container, error) {
 	return types.Container{}, fmt.Errorf("get container info failed")
 }
-func (d *DockerOp) checkGpuUsage(modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) int32 {
-	info := nm.GetHardwareInfo()
-	if info == nil {
-		return 0
-	}
+func (d *DockerOp) checkGpuUsage(info *nodemanagerV2.HardwareInfo, modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) int32 {
 	envMap := make(map[string]string, 0)
 	gpu := info.GPU
 	isMatch := false
@@ -421,7 +416,6 @@ func (d *DockerOp) checkGpuUsage(modelInfo *models.ModelInfo, dockerCmd *models.
 		}
 	}
 	if isMatch {
-		nm.ModelRunningBeforeMem[modelInfo.ImageName] = dockerCmd.RunningBeforeMem
 		gpuSeq, _ := strconv.ParseInt(dockerCmd.EnvMap[models.CudaEnv], 10, 32)
 		return int32(gpuSeq)
 	}
......
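A caller-side sketch of the new CreateAndStartContainer shape, mirroring the commented-out call in ComputeTaskHandler above: the hardware snapshot is now fetched by the caller in package nm and passed in, which is what lets dockerOp drop its import of example.com/m/nm. Names follow the surrounding diff; treat this as illustrative, not part of the commit.

	info := GetHardwareInfo() // nm-side wrapper around utils.GetApiHardwareInfo
	if info == nil {
		log.Error("Error getting hardware info")
		return
	}
	containerId, gpuSeq, err := dockerOp.CreateAndStartContainer(info, model, dockerCmd)
	if err != nil {
		log.WithError(err).Error("Error creating container")
		return
	}
	model.GpuSeq = gpuSeq
	log.Info("Started container with ID:", containerId)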
@@ -310,7 +310,7 @@ func readAndDecryptFile(key []byte, filename string) ([]byte, error) {
 	return decryptedData, nil
 }
-func GetHardwareInfo(url string) *models.HardwareInfoRep {
+func GetApiHardwareInfo(url string) *models.HardwareInfoRep {
 	resp, err := http.Get(url)
 	if err != nil {
 		log.Error("Error creating request")
......
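The rename frees the GetHardwareInfo name for the in-package wrapper in nm. Only the http.Get call and the *models.HardwareInfoRep return type of the renamed helper are visible in this diff; a plausible full body, with the JSON decode step assumed, would look roughly like this hedged sketch.

	func GetApiHardwareInfo(url string) *models.HardwareInfoRep {
		resp, err := http.Get(url)
		if err != nil {
			log.Error("Error creating request")
			return nil
		}
		defer resp.Body.Close()
		// Assumed: the hardware service replies with JSON matching models.HardwareInfoRep.
		hwInfo := &models.HardwareInfoRep{}
		if err := json.NewDecoder(resp.Body).Decode(hwInfo); err != nil {
			log.Error("Error decoding hardware info response")
			return nil
		}
		return hwInfo
	}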