Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
power-node
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Odysseus
power-node
Commits
ad9d0884
Commit
ad9d0884
authored
May 28, 2024
by
duanjinfei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
provide test
parent
c23880c9
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
277 additions
and
228 deletions
+277
-228
rootcmd.go
cmd/rootcmd.go
+2
-2
config.json
config.json
+1
-1
StateController.go
controllers/StateController.go
+5
-5
model_handler.go
largeModel/model_handler.go
+45
-45
msg_handler.go
nm/msg_handler.go
+129
-118
msg_resp.go
nm/msg_resp.go
+50
-15
start.go
nm/start.go
+2
-2
task_handler.go
nm/task_handler.go
+38
-29
docker.go
operate/docker.go
+4
-10
util.go
utils/util.go
+1
-1
No files found.
cmd/rootcmd.go
View file @
ad9d0884
...
@@ -21,8 +21,8 @@ var (
...
@@ -21,8 +21,8 @@ var (
)
)
func
init
()
{
func
init
()
{
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
rewardAddr
,
"reward"
,
"r"
,
""
,
"please enter a reward address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
rewardAddr
,
"reward"
,
"r"
,
"
0x40EC4256fcBCA69CdbAc942594caeC79FBE10494
"
,
"please enter a reward address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
externalIp
,
"externalIp"
,
"e"
,
""
,
"please enter server external ip address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
externalIp
,
"externalIp"
,
"e"
,
"
192.168.1.102
"
,
"please enter server external ip address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
opSys
,
"opSys"
,
"s"
,
""
,
"please enter you op sys name : win、linux"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
opSys
,
"opSys"
,
"s"
,
""
,
"please enter you op sys name : win、linux"
)
RootCmd
.
PersistentFlags
()
.
BoolVarP
(
&
debug
,
"debug"
,
"d"
,
false
,
"set log level debug"
)
RootCmd
.
PersistentFlags
()
.
BoolVarP
(
&
debug
,
"debug"
,
"d"
,
false
,
"set log level debug"
)
cobra
.
OnInitialize
(
initConfig
)
cobra
.
OnInitialize
(
initConfig
)
...
...
config.json
View file @
ad9d0884
...
@@ -13,5 +13,5 @@
...
@@ -13,5 +13,5 @@
"is_stop_last_container"
:
true
,
"is_stop_last_container"
:
true
,
"disk_usage"
:
80
,
"disk_usage"
:
80
,
"init_run_mode"
:
1
,
"init_run_mode"
:
1
,
"hardware_url"
:
"http://
127.0.0.1:7000
/hw"
"hardware_url"
:
"http://
47.94.59.74:8005
/hw"
}
}
\ No newline at end of file
controllers/StateController.go
View file @
ad9d0884
...
@@ -25,7 +25,7 @@ func (c *StateController) GetRunningState() {
...
@@ -25,7 +25,7 @@ func (c *StateController) GetRunningState() {
}
}
func
(
c
*
StateController
)
GetRunningTp
()
{
func
(
c
*
StateController
)
GetRunningTp
()
{
info
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
info
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
if
info
==
nil
{
if
info
==
nil
{
c
.
ResponseInfo
(
500
,
"get running tp failed"
,
0
)
c
.
ResponseInfo
(
500
,
"get running tp failed"
,
0
)
return
return
...
@@ -37,7 +37,7 @@ func (c *StateController) GetRunningTp() {
...
@@ -37,7 +37,7 @@ func (c *StateController) GetRunningTp() {
}
}
func
(
c
*
StateController
)
GetRunningLineChart
()
{
func
(
c
*
StateController
)
GetRunningLineChart
()
{
info
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
info
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
if
info
==
nil
{
if
info
==
nil
{
c
.
ResponseInfo
(
500
,
"get running tp failed"
,
""
)
c
.
ResponseInfo
(
500
,
"get running tp failed"
,
""
)
return
return
...
@@ -63,7 +63,7 @@ func (c *StateController) GetWorkerInfo() {
...
@@ -63,7 +63,7 @@ func (c *StateController) GetWorkerInfo() {
}
}
func
(
c
*
StateController
)
GetListGpuInfo
()
{
func
(
c
*
StateController
)
GetListGpuInfo
()
{
info
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
info
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
if
info
!=
nil
&&
info
.
Data
!=
nil
{
if
info
!=
nil
&&
info
.
Data
!=
nil
{
c
.
ResponseInfo
(
200
,
"get list gpu info successful"
,
info
.
Data
.
Gpus
)
c
.
ResponseInfo
(
200
,
"get list gpu info successful"
,
info
.
Data
.
Gpus
)
return
return
...
@@ -83,7 +83,7 @@ func (c *StateController) GetGpuUsageInfo() {
...
@@ -83,7 +83,7 @@ func (c *StateController) GetGpuUsageInfo() {
c
.
ResponseInfo
(
500
,
"param error"
,
""
)
c
.
ResponseInfo
(
500
,
"param error"
,
""
)
return
return
}
}
info
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
info
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
if
info
!=
nil
{
if
info
!=
nil
{
for
_
,
gpu
:=
range
info
.
Data
.
Gpus
{
for
_
,
gpu
:=
range
info
.
Data
.
Gpus
{
if
gpu
.
Seq
==
req
.
Seq
{
if
gpu
.
Seq
==
req
.
Seq
{
...
@@ -96,7 +96,7 @@ func (c *StateController) GetGpuUsageInfo() {
...
@@ -96,7 +96,7 @@ func (c *StateController) GetGpuUsageInfo() {
}
}
func
(
c
*
StateController
)
GetOtherHardwareInfo
()
{
func
(
c
*
StateController
)
GetOtherHardwareInfo
()
{
info
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
info
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
var
diskTotal
,
diskFree
int64
var
diskTotal
,
diskFree
int64
for
_
,
disk
:=
range
info
.
Data
.
Disk
{
for
_
,
disk
:=
range
info
.
Data
.
Disk
{
for
_
,
point
:=
range
disk
.
MountPoints
{
for
_
,
point
:=
range
disk
.
MountPoints
{
...
...
largeModel/model_handler.go
View file @
ad9d0884
...
@@ -60,11 +60,11 @@ func (m *ModelHandler) MonitorModelInfo() {
...
@@ -60,11 +60,11 @@ func (m *ModelHandler) MonitorModelInfo() {
log
.
Warn
(
"Response data is empty"
)
log
.
Warn
(
"Response data is empty"
)
continue
continue
}
}
imageMap
,
err
:=
m
.
dockerOp
.
PsImageNameMap
()
//
imageMap, err := m.dockerOp.PsImageNameMap()
if
err
!=
nil
{
//
if err != nil {
log
.
Error
(
"Error getting image name map from client failed:"
,
err
)
//
log.Error("Error getting image name map from client failed:", err)
continue
//
continue
}
//
}
modelInfosResp
:=
resp
.
Data
modelInfosResp
:=
resp
.
Data
for
_
,
modelInfo
:=
range
modelInfosResp
{
for
_
,
modelInfo
:=
range
modelInfosResp
{
if
modelInfo
.
ImageName
==
""
{
if
modelInfo
.
ImageName
==
""
{
...
@@ -94,10 +94,10 @@ func (m *ModelHandler) MonitorModelInfo() {
...
@@ -94,10 +94,10 @@ func (m *ModelHandler) MonitorModelInfo() {
}
}
log
.
WithField
(
"name"
,
modelInfo
.
ImageName
)
.
Info
(
"The image add"
)
log
.
WithField
(
"name"
,
modelInfo
.
ImageName
)
.
Info
(
"The image add"
)
}
}
if
!
imageMap
[
modelInfo
.
ImageName
]
&&
modelInfo
.
PublishStatus
==
models
.
ModelPublishStatusYes
{
//
if !imageMap[modelInfo.ImageName] && modelInfo.PublishStatus == models.ModelPublishStatusYes {
log
.
WithField
(
"model image name"
,
modelInfo
.
ImageName
)
.
Info
(
"pulling image"
)
//
log.WithField("model image name", modelInfo.ImageName).Info("pulling image")
go
m
.
dockerOp
.
PullImage
(
model
.
ImageName
)
//
go m.dockerOp.PullImage(model.ImageName)
}
//
}
}
}
m
.
IsInit
=
true
m
.
IsInit
=
true
ticker
=
time
.
NewTicker
(
time
.
Minute
*
10
)
ticker
=
time
.
NewTicker
(
time
.
Minute
*
10
)
...
@@ -201,42 +201,42 @@ func (m *ModelHandler) MonitorModelStatus() {
...
@@ -201,42 +201,42 @@ func (m *ModelHandler) MonitorModelStatus() {
func
(
m
*
ModelHandler
)
ScanModelsResp
()
(
*
nodemanagerV2
.
ModelsInfo
,
error
)
{
func
(
m
*
ModelHandler
)
ScanModelsResp
()
(
*
nodemanagerV2
.
ModelsInfo
,
error
)
{
installedModels
:=
make
([]
*
nodemanagerV2
.
InstalledModel
,
0
)
installedModels
:=
make
([]
*
nodemanagerV2
.
InstalledModel
,
0
)
runningModels
:=
make
([]
*
nodemanagerV2
.
RunningModel
,
0
)
runningModels
:=
make
([]
*
nodemanagerV2
.
RunningModel
,
0
)
images
,
err
:=
m
.
dockerOp
.
PsImageNameMap
()
//
images, err := m.dockerOp.PsImageNameMap()
if
err
!=
nil
{
//
if err != nil {
log
.
WithError
(
err
)
.
Error
(
"get images failed"
)
//
log.WithError(err).Error("get images failed")
return
nil
,
err
//
return nil, err
}
//
}
containerList
:=
m
.
dockerOp
.
ListContainer
()
//
containerList := m.dockerOp.ListContainer()
if
containerList
==
nil
||
len
(
containerList
)
==
0
{
//
if containerList == nil || len(containerList) == 0 {
log
.
Error
(
"Get container failed"
)
//
log.Error("Get container failed")
return
nil
,
fmt
.
Errorf
(
"get containe failed"
)
//
return nil, fmt.Errorf("get containe failed")
}
//
}
allModels
,
err
:=
db
.
GetAllModels
()
//
allModels, err := db.GetAllModels()
if
err
!=
nil
{
//
if err != nil {
log
.
WithError
(
err
)
.
Error
(
"Get all models failed"
)
//
log.WithError(err).Error("Get all models failed")
return
nil
,
fmt
.
Errorf
(
"get all models failed"
)
//
return nil, fmt.Errorf("get all models failed")
}
//
}
for
_
,
model
:=
range
allModels
{
//
for _, model := range allModels {
isExist
:=
images
[
model
.
ImageName
]
//
isExist := images[model.ImageName]
if
!
isExist
{
//
if !isExist {
continue
//
continue
}
//
}
diskSize
,
err
:=
strconv
.
ParseInt
(
model
.
HardwareRequire
.
DiskSize
,
10
,
64
)
//
diskSize, err := strconv.ParseInt(model.HardwareRequire.DiskSize, 10, 64)
if
err
!=
nil
{
//
if err != nil {
continue
//
continue
}
//
}
installedModels
=
append
(
installedModels
,
&
nodemanagerV2
.
InstalledModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
DiskSize
:
diskSize
,
InstalledTime
:
model
.
SetupTime
,
LastRunTime
:
model
.
LastRunTime
})
//
installedModels = append(installedModels, &nodemanagerV2.InstalledModel{ModelId: strconv.FormatUint(model.TaskId, 10), DiskSize: diskSize, InstalledTime: model.SetupTime, LastRunTime: model.LastRunTime})
//
containerIsExist
:=
false
//
containerIsExist := false
for
_
,
container
:=
range
containerList
{
//
for _, container := range containerList {
if
model
.
ImageName
==
container
.
Image
{
//
if model.ImageName == container.Image {
containerIsExist
=
true
//
containerIsExist = true
}
//
}
}
//
}
if
containerIsExist
{
//
if containerIsExist {
runningModels
=
append
(
runningModels
,
&
nodemanagerV2
.
RunningModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
GpuSeq
:
model
.
GpuSeq
,
GpuRam
:
model
.
RunningMem
,
StartedTime
:
model
.
LastRunTime
,
LastWorkTime
:
model
.
LastWorkTime
,
TotalRunCount
:
model
.
TotalRunCount
,
ExecTime
:
model
.
EstimatExeTime
})
//
runningModels = append(runningModels, &nodemanagerV2.RunningModel{ModelId: strconv.FormatUint(model.TaskId, 10), GpuSeq: model.GpuSeq, GpuRam: model.RunningMem, StartedTime: model.LastRunTime, LastWorkTime: model.LastWorkTime, TotalRunCount: model.TotalRunCount, ExecTime: model.EstimatExeTime})
}
//
}
}
//
}
res
:=
&
nodemanagerV2
.
ModelsInfo
{
res
:=
&
nodemanagerV2
.
ModelsInfo
{
InstalledModels
:
installedModels
,
InstalledModels
:
installedModels
,
RunningModels
:
runningModels
,
RunningModels
:
runningModels
,
...
...
nm/msg_handler.go
View file @
ad9d0884
...
@@ -61,7 +61,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
...
@@ -61,7 +61,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
taskMsg
:=
rev
.
GetPushTask
()
taskMsg
:=
rev
.
GetPushTask
()
if
taskMsg
!=
nil
{
if
taskMsg
!=
nil
{
go
func
(
msgRespWorker
*
RespMsgWorker
,
taskMsgWorker
*
TaskWorker
,
taskMsg
*
nodemanagerV2
.
PushTaskMessage
)
{
go
func
(
msgRespWorker
*
RespMsgWorker
,
taskMsgWorker
*
TaskWorker
,
taskMsg
*
nodemanagerV2
.
PushTaskMessage
)
{
isCanExecute
,
bootUpTime
,
queueWaitTime
,
executeTime
:=
taskMsgWorker
.
GetAckResp
(
taskMsg
)
isCanExecute
,
bootUpTime
,
queueWaitTime
,
executeTime
,
imageName
:=
taskMsgWorker
.
GetAckResp
(
taskMsg
)
ackParams
:=
utils
.
BuildParams
(
taskMsg
.
TaskId
,
isCanExecute
,
bootUpTime
,
queueWaitTime
,
executeTime
)
ackParams
:=
utils
.
BuildParams
(
taskMsg
.
TaskId
,
isCanExecute
,
bootUpTime
,
queueWaitTime
,
executeTime
)
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
RespTaskAck
,
ackParams
)
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
RespTaskAck
,
ackParams
)
if
!
isCanExecute
{
if
!
isCanExecute
{
...
@@ -108,6 +108,11 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
...
@@ -108,6 +108,11 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
RunningState
.
CompletedTaskCount
++
RunningState
.
CompletedTaskCount
++
log
.
Info
(
"Completed task count: "
,
RunningState
.
CompletedTaskCount
)
log
.
Info
(
"Completed task count: "
,
RunningState
.
CompletedTaskCount
)
log
.
Info
(
"--------------taskMsg--------------:"
,
taskMsg
)
log
.
Info
(
"--------------taskMsg--------------:"
,
taskMsg
)
model
,
_
:=
db
.
GetModel
(
imageName
)
if
model
!=
nil
{
params
:=
utils
.
BuildParams
(
model
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
RunningModelStatusResp
,
params
)
}
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
GpuUsageResp
,
ackParams
)
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
GpuUsageResp
,
ackParams
)
}(
n
.
msgRespWorker
,
n
.
taskMsgWorker
,
taskMsg
)
}(
n
.
msgRespWorker
,
n
.
taskMsgWorker
,
taskMsg
)
continue
continue
...
@@ -157,38 +162,38 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
...
@@ -157,38 +162,38 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
switch
modelOp
.
Operate
{
switch
modelOp
.
Operate
{
case
nodemanagerV2
.
ModelOperateType_INSTALL
:
case
nodemanagerV2
.
ModelOperateType_INSTALL
:
{
{
go
dockerOp
.
PullImage
(
modelOp
.
ImageName
)
//
go dockerOp.PullImage(modelOp.ImageName)
}
}
case
nodemanagerV2
.
ModelOperateType_DELETE
:
case
nodemanagerV2
.
ModelOperateType_DELETE
:
{
{
if
model
.
ContainerId
!=
""
{
//
if model.ContainerId != "" {
isRunning
:=
dockerOp
.
ContainerIsRunning
(
model
.
ContainerId
)
//
isRunning := dockerOp.ContainerIsRunning(model.ContainerId)
if
isRunning
{
//
if isRunning {
dockerOp
.
StopAndDeleteContainer
(
model
.
ContainerId
)
//
dockerOp.StopAndDeleteContainer(model.ContainerId)
}
//
}
}
//
}
go
dockerOp
.
RmImage
(
modelOp
.
ImageName
)
//
go dockerOp.RmImage(modelOp.ImageName)
}
}
case
nodemanagerV2
.
ModelOperateType_RUN
:
case
nodemanagerV2
.
ModelOperateType_RUN
:
{
{
dockerCmd
:=
&
models
.
DockerCmd
{
//
dockerCmd := &models.DockerCmd{
HostIp
:
models
.
ZeroHost
,
//
HostIp: models.ZeroHost,
HostPort
:
n
.
taskMsgWorker
.
getExternalPort
(),
//
HostPort: n.taskMsgWorker.getExternalPort(),
}
//
}
containerId
,
gpuSeq
,
err
:=
dockerOp
.
CreateAndStartContainer
(
model
,
dockerCmd
)
//
containerId, gpuSeq, err := dockerOp.CreateAndStartContainer(model, dockerCmd)
if
err
!=
nil
{
//
if err != nil {
log
.
WithError
(
err
)
.
Error
(
"Error creating container"
)
//
log.WithError(err).Error("Error creating container")
continue
//
continue
}
//
}
model
.
ContainerId
=
containerId
model
.
ContainerId
=
"1111111"
model
.
GpuSeq
=
gpuSeq
model
.
GpuSeq
=
0
}
}
case
nodemanagerV2
.
ModelOperateType_STOP
:
case
nodemanagerV2
.
ModelOperateType_STOP
:
{
{
if
model
.
ContainerId
!=
""
{
//
if model.ContainerId != "" {
model
.
ContainerId
=
""
model
.
ContainerId
=
""
dockerOp
.
StopContainer
(
model
.
ContainerId
)
//
dockerOp.StopContainer(model.ContainerId)
}
//
}
}
}
}
}
err
=
db
.
PutModel
(
model
.
ImageName
,
model
)
err
=
db
.
PutModel
(
model
.
ImageName
,
model
)
...
@@ -228,57 +233,59 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
...
@@ -228,57 +233,59 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
}
}
ticker
:=
time
.
NewTicker
(
time
.
Second
*
2
)
ticker
:=
time
.
NewTicker
(
time
.
Second
*
2
)
defer
ticker
.
Stop
()
defer
ticker
.
Stop
()
isOp
:=
false
//
isOp := false
switch
op
.
Operate
{
switch
op
.
Operate
{
case
nodemanagerV2
.
ModelOperateType_INSTALL
:
case
nodemanagerV2
.
ModelOperateType_INSTALL
:
{
{
now
:=
time
.
Now
()
//
now := time.Now()
for
{
for
{
select
{
select
{
case
<-
ticker
.
C
:
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Hours
()
>
models
.
OneHour
||
isOp
{
//if time.Since(now).Hours() > models.OneHour || isOp {
return
// return
}
//}
imagesMap
,
err
:=
n
.
taskMsgWorker
.
DockerOp
.
PsImageNameMap
()
//imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
if
err
!=
nil
{
//if err != nil {
log
.
WithError
(
err
)
.
Error
(
"Ps image name map failed"
)
// log.WithError(err).Error("Ps image name map failed")
return
// return
}
//}
if
imagesMap
[
op
.
ImageName
]
{
//if imagesMap[op.ImageName] {
isOp
=
true
// isOp = true
time
.
Sleep
(
time
.
Minute
*
1
)
model
.
IsInstalled
=
true
model
.
IsInstalled
=
true
model
.
SetupTime
=
time
.
Now
()
.
Unix
()
model
.
SetupTime
=
time
.
Now
()
.
Unix
()
diskSize
,
_
:=
strconv
.
ParseInt
(
model
.
HardwareRequire
.
DiskSize
,
10
,
64
)
diskSize
,
_
:=
strconv
.
ParseInt
(
model
.
HardwareRequire
.
DiskSize
,
10
,
64
)
params
:=
utils
.
BuildParams
(
&
nodemanagerV2
.
InstalledModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
DiskSize
:
diskSize
,
InstalledTime
:
model
.
SetupTime
,
LastRunTime
:
model
.
LastRunTime
})
params
:=
utils
.
BuildParams
(
&
nodemanagerV2
.
InstalledModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
DiskSize
:
diskSize
,
InstalledTime
:
model
.
SetupTime
,
LastRunTime
:
model
.
LastRunTime
})
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
AddModelInstalledResp
,
params
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
AddModelInstalledResp
,
params
)
return
break
}
//
}
}
}
}
}
}
}
case
nodemanagerV2
.
ModelOperateType_DELETE
:
case
nodemanagerV2
.
ModelOperateType_DELETE
:
{
{
now
:=
time
.
Now
()
//
now := time.Now()
for
{
for
{
select
{
select
{
case
<-
ticker
.
C
:
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
OneMinutes
||
isOp
{
//if time.Since(now).Minutes() > models.OneMinutes || isOp {
return
// return
}
//}
imagesMap
,
err
:=
n
.
taskMsgWorker
.
DockerOp
.
PsImageNameMap
()
//imagesMap, err := n.taskMsgWorker.DockerOp.PsImageNameMap()
if
err
!=
nil
{
//if err != nil {
log
.
WithError
(
err
)
.
Error
(
"Ps image name map failed"
)
// log.WithError(err).Error("Ps image name map failed")
return
// return
}
//}
if
!
imagesMap
[
op
.
ImageName
]
{
//if !imagesMap[op.ImageName] {
isOp
=
true
//isOp = true
time
.
Sleep
(
time
.
Second
*
10
)
model
.
IsInstalled
=
false
model
.
IsInstalled
=
false
model
.
IsRunning
=
false
model
.
IsRunning
=
false
model
.
ContainerId
=
""
model
.
ContainerId
=
""
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
))
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
))
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
DelModelInstalledResp
,
params
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
DelModelInstalledResp
,
params
)
return
break
}
//
}
}
}
}
}
}
}
...
@@ -288,71 +295,75 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
...
@@ -288,71 +295,75 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
for
{
for
{
select
{
select
{
case
<-
ticker
.
C
:
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
TwoMinutes
||
isOp
{
//if time.Since(now).Minutes() > models.TwoMinutes || isOp {
return
// return
}
//}
info
:=
GetHardwareInfo
()
//info := GetHardwareInfo()
if
info
==
nil
{
//if info == nil {
continue
// continue
}
//}
memIsChange
:=
false
//memIsChange := false
for
_
,
gpuInfo
:=
range
info
.
GPU
{
//for _, gpuInfo := range info.GPU {
if
gpuInfo
.
Seq
==
model
.
GpuSeq
{
// if gpuInfo.Seq == model.GpuSeq {
if
ModelRunningBeforeMem
[
op
.
ImageName
]
<=
gpuInfo
.
MemFree
{
// if ModelRunningBeforeMem[op.ImageName] <= gpuInfo.MemFree {
break
// break
}
// }
model
.
RunningMem
=
ModelRunningBeforeMem
[
op
.
ImageName
]
-
gpuInfo
.
MemFree
// model.RunningMem = ModelRunningBeforeMem[op.ImageName] - gpuInfo.MemFree
memIsChange
=
true
// memIsChange = true
}
// }
}
//}
if
!
memIsChange
{
//if !memIsChange {
continue
// continue
}
//}
listContainers
:=
n
.
taskMsgWorker
.
DockerOp
.
ListContainer
()
//listContainers := n.taskMsgWorker.DockerOp.ListContainer()
if
listContainers
!=
nil
&&
len
(
listContainers
)
>
0
{
//if listContainers != nil && len(listContainers) > 0 {
for
_
,
container
:=
range
listContainers
{
// for _, container := range listContainers {
if
container
.
Image
==
op
.
ImageName
{
// if container.Image == op.ImageName {
isOp
=
true
//isOp = true
time
.
Sleep
(
time
.
Second
*
30
)
model
.
StartUpTime
=
int64
(
time
.
Since
(
now
)
.
Seconds
())
model
.
StartUpTime
=
int64
(
time
.
Since
(
now
)
.
Seconds
())
model
.
ContainerId
=
container
.
ID
model
.
ContainerId
=
"1111"
model
.
IsRunning
=
true
model
.
IsRunning
=
true
model
.
LastRunTime
=
time
.
Now
()
.
Unix
()
model
.
LastRunTime
=
time
.
Now
()
.
Unix
()
params
:=
utils
.
BuildParams
(
&
nodemanagerV2
.
RunningModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
GpuSeq
:
model
.
GpuSeq
,
GpuRam
:
model
.
RunningMem
,
StartedTime
:
model
.
LastRunTime
,
LastWorkTime
:
model
.
LastWorkTime
,
TotalRunCount
:
model
.
TotalRunCount
,
ExecTime
:
model
.
EstimatExeTime
})
params
:=
utils
.
BuildParams
(
&
nodemanagerV2
.
RunningModel
{
ModelId
:
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
GpuSeq
:
model
.
GpuSeq
,
GpuRam
:
model
.
RunningMem
,
StartedTime
:
model
.
LastRunTime
,
LastWorkTime
:
model
.
LastWorkTime
,
TotalRunCount
:
model
.
TotalRunCount
,
ExecTime
:
model
.
EstimatExeTime
})
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
AddModelRunningResp
,
params
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
AddModelRunningResp
,
params
)
break
break
}
//
}
}
//
}
}
//
}
}
}
}
}
}
}
case
nodemanagerV2
.
ModelOperateType_STOP
:
case
nodemanagerV2
.
ModelOperateType_STOP
:
{
{
now
:=
time
.
Now
()
//
now := time.Now()
for
{
for
{
select
{
select
{
case
<-
ticker
.
C
:
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
OneMinutes
||
isOp
{
//if time.Since(now).Minutes() > models.OneMinutes || isOp {
return
// return
}
//}
listContainers
:=
n
.
taskMsgWorker
.
DockerOp
.
ListContainer
()
//listContainers := n.taskMsgWorker.DockerOp.ListContainer()
if
listContainers
!=
nil
&&
len
(
listContainers
)
>
0
{
//if listContainers != nil && len(listContainers) > 0 {
isFound
:=
false
// isFound := false
for
_
,
container
:=
range
listContainers
{
// for _, container := range listContainers {
if
container
.
Image
==
op
.
ImageName
{
// if container.Image == op.ImageName {
isFound
=
true
// isFound = true
}
// }
}
// }
if
!
isFound
{
// if !isFound {
isOp
=
true
//isOp = true
time
.
Sleep
(
time
.
Second
*
10
)
model
.
GpuSeq
=
999
model
.
GpuSeq
=
999
model
.
IsRunning
=
false
model
.
IsRunning
=
false
model
.
ContainerId
=
""
model
.
ContainerId
=
""
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
))
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
))
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
DelModelRunningResp
,
params
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
DelModelRunningResp
,
params
)
params
=
utils
.
BuildParams
(
model
.
TaskId
,
model
.
LastRunTime
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
InstallModelStatusResp
,
params
)
break
break
}
//
}
}
//
}
}
}
}
}
}
}
...
...
nm/msg_resp.go
View file @
ad9d0884
package
nm
package
nm
import
(
import
(
"bytes"
"example.com/m/conf"
"example.com/m/conf"
"example.com/m/largeModel"
"example.com/m/largeModel"
"example.com/m/log"
"example.com/m/log"
...
@@ -11,6 +10,7 @@ import (
...
@@ -11,6 +10,7 @@ import (
"github.com/ethereum/go-ethereum/crypto"
"github.com/ethereum/go-ethereum/crypto"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
"math/big"
"math/big"
"strconv"
"time"
"time"
)
)
...
@@ -80,28 +80,30 @@ func HeartbeatResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
...
@@ -80,28 +80,30 @@ func HeartbeatResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
func
RegisterInfoResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
func
RegisterInfoResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Register info response received params:"
,
params
)
log
.
Info
(
"Register info response received params:"
,
params
)
modelsInfo
:=
params
[
0
]
.
(
*
largeModel
.
ModelHandler
)
info
:=
&
nodemanagerV2
.
NodeInfo
{
MinerPubkey
:
conf
.
GetConfig
()
.
SignPub
,
BenefitAddress
:
conf
.
GetConfig
()
.
BenefitAddress
,
}
hardwareInfo
:=
GetHardwareInfo
()
readModels
,
err
:=
modelsInfo
.
ScanModelsResp
()
if
err
!=
nil
{
log
.
Error
(
"Scan models response error"
,
err
)
return
nil
}
nowTimeStamp
:=
time
.
Now
()
.
Unix
()
nowTimeStamp
:=
time
.
Now
()
.
Unix
()
nowTimeBytes
:=
big
.
NewInt
(
nowTimeStamp
)
.
Bytes
()
nowTimeBytes
:=
big
.
NewInt
(
nowTimeStamp
)
.
Bytes
()
signHash
:=
crypto
.
Keccak256Hash
(
bytes
.
NewBufferString
(
conf
.
GetConfig
()
.
GetExternalIp
())
.
Bytes
(
),
signHash
:=
crypto
.
Keccak256Hash
(
[]
byte
(
info
.
String
()
),
bytes
.
NewBufferString
(
conf
.
GetConfig
()
.
SignPub
)
.
Bytes
(
),
[]
byte
(
hardwareInfo
.
String
()
),
bytes
.
NewBufferString
(
conf
.
GetConfig
()
.
BenefitAddress
)
.
Bytes
(
),
[]
byte
(
readModels
.
String
()
),
nowTimeBytes
)
nowTimeBytes
)
log
.
WithField
(
"hash"
,
signHash
.
String
())
.
Info
(
"register message sign result"
)
log
.
WithField
(
"hash"
,
signHash
.
String
())
.
Info
(
"register message sign result"
)
sign
,
_
:=
crypto
.
Sign
(
signHash
.
Bytes
(),
conf
.
GetConfig
()
.
SignPrivateKey
)
sign
,
_
:=
crypto
.
Sign
(
signHash
.
Bytes
(),
conf
.
GetConfig
()
.
SignPrivateKey
)
log
.
Info
(
"register message sign:"
,
common
.
Bytes2Hex
(
sign
))
log
.
Info
(
"register message sign:"
,
common
.
Bytes2Hex
(
sign
))
modelsInfo
:=
params
[
0
]
.
(
*
largeModel
.
ModelHandler
)
readModels
,
err
:=
modelsInfo
.
ScanModelsResp
()
if
err
!=
nil
{
return
nil
}
hardwareInfo
:=
GetHardwareInfo
()
nodeInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
nodeInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_RegisteMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_RegisteMessage
{
RegisteMessage
:
&
nodemanagerV2
.
RegisteMessage
{
RegisteMessage
:
&
nodemanagerV2
.
RegisteMessage
{
Info
:
&
nodemanagerV2
.
NodeInfo
{
Info
:
info
,
MinerPubkey
:
conf
.
GetConfig
()
.
SignPub
,
BenefitAddress
:
conf
.
GetConfig
()
.
BenefitAddress
,
},
Hardware
:
hardwareInfo
,
Hardware
:
hardwareInfo
,
Models
:
readModels
,
Models
:
readModels
,
Timestamp
:
nowTimeStamp
,
Timestamp
:
nowTimeStamp
,
...
@@ -342,6 +344,39 @@ func AddModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
...
@@ -342,6 +344,39 @@ func AddModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
return
addModelRunningRes
return
addModelRunningRes
}
}
func
RunningModelStatusResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Add model running response received params:"
,
params
)
info
:=
params
[
0
]
.
(
*
models
.
ModelInfo
)
addModelRunningRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_RunningModelStatus
{
RunningModelStatus
:
&
nodemanagerV2
.
RunningModelStatus
{
ModelId
:
strconv
.
FormatUint
(
info
.
TaskId
,
10
),
LastWorkTime
:
info
.
LastWorkTime
,
TotalRunCount
:
info
.
TotalRunCount
,
ExecTime
:
info
.
EstimatExeTime
,
},
},
}
log
.
Info
(
"---------------------------------------Send Add model running response msg ------------------------------------"
)
return
addModelRunningRes
}
func
InstallModelStatusResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Add model running response received params:"
,
params
)
modelId
:=
params
[
0
]
.
(
uint64
)
lastRunTime
:=
params
[
1
]
.
(
int64
)
installModelStatusRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_InstalledModelStatus
{
InstalledModelStatus
:
&
nodemanagerV2
.
InstalledModelStatus
{
ModelId
:
strconv
.
FormatUint
(
modelId
,
10
),
LastRunTime
:
lastRunTime
,
},
},
}
log
.
Info
(
"---------------------------------------Send install model status response msg ------------------------------------"
)
return
installModelStatusRes
}
func
DelModelRunningResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
func
DelModelRunningResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Del model running response received params:"
,
params
)
log
.
Info
(
"Del model running response received params:"
,
params
)
delModelRunningRes
:=
&
nodemanagerV2
.
WorkerMessage
{
delModelRunningRes
:=
&
nodemanagerV2
.
WorkerMessage
{
...
@@ -356,7 +391,7 @@ func DelModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
...
@@ -356,7 +391,7 @@ func DelModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
}
}
func
GetHardwareInfo
()
*
nodemanagerV2
.
HardwareInfo
{
func
GetHardwareInfo
()
*
nodemanagerV2
.
HardwareInfo
{
hardwareInfo
:=
utils
.
GetHardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
hardwareInfo
:=
utils
.
Get
Api
HardwareInfo
(
conf
.
GetConfig
()
.
HardwareUrl
)
if
hardwareInfo
==
nil
{
if
hardwareInfo
==
nil
{
return
nil
return
nil
}
}
...
...
nm/start.go
View file @
ad9d0884
...
@@ -36,14 +36,14 @@ func StartMonitor() {
...
@@ -36,14 +36,14 @@ func StartMonitor() {
go
modelHandler
.
MonitorModelInfo
()
go
modelHandler
.
MonitorModelInfo
()
log
.
WithField
(
"func"
,
"MonitorModelInfo"
)
.
Info
(
"--------------------Start modelHandler--------------------"
)
log
.
WithField
(
"func"
,
"MonitorModelInfo"
)
.
Info
(
"--------------------Start modelHandler--------------------"
)
go
modelHandler
.
MonitorModelStatus
()
//
go modelHandler.MonitorModelStatus()
log
.
WithField
(
"func"
,
"MonitorModelStatus"
)
.
Info
(
"--------------------Start modelHandler--------------------"
)
log
.
WithField
(
"func"
,
"MonitorModelStatus"
)
.
Info
(
"--------------------Start modelHandler--------------------"
)
go
monitorNm
.
monitorNodeManagerSeed
()
go
monitorNm
.
monitorNodeManagerSeed
()
log
.
WithField
(
"func"
,
"monitorNodeManagerSeed"
)
.
Info
(
"--------------------Start monitorNm--------------------"
)
log
.
WithField
(
"func"
,
"monitorNodeManagerSeed"
)
.
Info
(
"--------------------Start monitorNm--------------------"
)
for
!
monitorNm
.
IsInit
&&
!
modelHandler
.
IsInit
{
for
!
monitorNm
.
IsInit
&&
!
modelHandler
.
IsInit
{
time
.
Sleep
(
time
.
Second
)
time
.
Sleep
(
time
.
Second
*
3
)
}
}
go
monitorNm
.
monitorNmClient
()
go
monitorNm
.
monitorNmClient
()
...
...
nm/task_handler.go
View file @
ad9d0884
...
@@ -134,9 +134,9 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
...
@@ -134,9 +134,9 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
taskCmd
:
&
models
.
TaskCmd
{},
taskCmd
:
&
models
.
TaskCmd
{},
taskExecResult
:
&
models
.
TaskResult
{
taskExecResult
:
&
models
.
TaskResult
{
TaskHttpStatusCode
:
200
,
TaskHttpStatusCode
:
200
,
TaskRespBody
:
nil
,
TaskRespBody
:
[]
byte
{
1
,
2
,
3
,
4
,
5
}
,
TaskHttpHeaders
:
nil
,
TaskHttpHeaders
:
nil
,
TaskIsSuccess
:
fals
e
,
TaskIsSuccess
:
tru
e
,
TaskExecTime
:
0
,
TaskExecTime
:
0
,
TaskExecError
:
""
,
TaskExecError
:
""
,
},
},
...
@@ -178,30 +178,38 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
...
@@ -178,30 +178,38 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
return
return
}
}
running
,
_
:=
t
.
foundImageIsRunning
(
taskOp
.
taskCmd
.
ImageName
)
time
.
Sleep
(
time
.
Second
*
20
)
if
!
running
{
//running, _ := t.foundImageIsRunning(taskOp.taskCmd.ImageName)
taskOp
.
taskCmd
.
DockerCmd
.
HostIp
=
models
.
ZeroHost
//if !running {
taskOp
.
taskCmd
.
DockerCmd
.
HostPort
=
t
.
getExternalPort
()
// taskOp.taskCmd.DockerCmd.HostIp = models.ZeroHost
containerId
,
gpuSeq
,
err
:=
t
.
DockerOp
.
CreateAndStartContainer
(
model
,
taskOp
.
taskCmd
.
DockerCmd
)
// taskOp.taskCmd.DockerCmd.HostPort = t.getExternalPort()
if
err
!=
nil
{
// info := GetHardwareInfo()
log
.
Errorf
(
"Create and start container failed: %s"
,
err
.
Error
())
// if info == nil {
taskOp
.
taskExecResult
.
TaskExecError
=
fmt
.
Sprintf
(
"%s,%s"
,
"Create and start container failed"
,
err
.
Error
())
// log.Error("Error getting hardware info")
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
// taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", "Error getting hardware info")
return
// t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
}
// return
model
.
GpuSeq
=
gpuSeq
// }
log
.
Infof
(
"Started container with ID %s"
,
containerId
)
// containerId, gpuSeq, err := t.DockerOp.CreateAndStartContainer(info, model, taskOp.taskCmd.DockerCmd)
}
// if err != nil {
if
err
=
taskOp
.
waitContainerRunning
(
t
,
taskOp
.
taskCmd
.
ImageName
,
uint16
(
taskOp
.
taskCmd
.
DockerCmd
.
ContainerPort
));
err
!=
nil
{
// log.Errorf("Create and start container failed: %s", err.Error())
taskOp
.
taskExecResult
.
TaskExecError
=
fmt
.
Sprintf
(
"%s"
,
err
.
Error
())
// taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s,%s", "Create and start container failed", err.Error())
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
// t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
return
// return
}
// }
if
err
=
taskOp
.
waitReqContainerOk
(
t
.
DockerOp
);
err
!=
nil
{
// model.GpuSeq = gpuSeq
taskOp
.
taskExecResult
.
TaskExecError
=
fmt
.
Sprintf
(
"%s"
,
err
.
Error
())
// log.Info("Started container with ID:", containerId)
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
//}
return
//if err = taskOp.waitContainerRunning(t, taskOp.taskCmd.ImageName, uint16(taskOp.taskCmd.DockerCmd.ContainerPort)); err != nil {
}
// taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
// t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
// return
//}
//if err = taskOp.waitReqContainerOk(t.DockerOp); err != nil {
// taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s", err.Error())
// t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
// return
//}
endAfterTaskTime
:=
time
.
Since
(
taskOp
.
startBeforeTaskTime
)
endAfterTaskTime
:=
time
.
Since
(
taskOp
.
startBeforeTaskTime
)
taskOp
.
taskExecResult
.
TaskExecTime
=
endAfterTaskTime
.
Microseconds
()
taskOp
.
taskExecResult
.
TaskExecTime
=
endAfterTaskTime
.
Microseconds
()
log
.
WithField
(
"time"
,
endAfterTaskTime
.
Seconds
())
.
WithField
(
"taskId"
,
taskMsg
.
TaskId
)
.
Info
(
"Exec task end (second is units) :"
)
log
.
WithField
(
"time"
,
endAfterTaskTime
.
Seconds
())
.
WithField
(
"taskId"
,
taskMsg
.
TaskId
)
.
Info
(
"Exec task end (second is units) :"
)
...
@@ -217,7 +225,7 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
...
@@ -217,7 +225,7 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
log
.
Info
(
"----------------------Compute task exec done--------------------------------"
)
log
.
Info
(
"----------------------Compute task exec done--------------------------------"
)
}
}
func
(
t
*
TaskWorker
)
GetAckResp
(
taskMsg
*
nodemanagerV2
.
PushTaskMessage
)
(
isCanExecute
bool
,
bootUpTime
,
queueWaitTime
,
executeTime
int64
)
{
func
(
t
*
TaskWorker
)
GetAckResp
(
taskMsg
*
nodemanagerV2
.
PushTaskMessage
)
(
isCanExecute
bool
,
bootUpTime
,
queueWaitTime
,
executeTime
int64
,
imageName
string
)
{
if
t
.
IsExecStandardTask
{
if
t
.
IsExecStandardTask
{
isCanExecute
=
true
isCanExecute
=
true
return
return
...
@@ -237,7 +245,7 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
...
@@ -237,7 +245,7 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
if
!
isSuccess
&&
!
t
.
lastExecTaskStartTime
.
IsZero
()
{
if
!
isSuccess
&&
!
t
.
lastExecTaskStartTime
.
IsZero
()
{
lastTaskImageInfo
,
err
:=
db
.
GetModel
(
t
.
lastExecTaskImageName
)
lastTaskImageInfo
,
err
:=
db
.
GetModel
(
t
.
lastExecTaskImageName
)
if
err
!=
nil
{
if
err
!=
nil
{
return
false
,
0
,
0
,
0
return
false
,
0
,
0
,
0
,
""
}
}
since
:=
time
.
Since
(
t
.
lastExecTaskStartTime
)
since
:=
time
.
Since
(
t
.
lastExecTaskStartTime
)
queueWaitTime
=
int64
(
lastTaskImageInfo
.
EstimatExeTime
-
int32
(
since
.
Seconds
()))
queueWaitTime
=
int64
(
lastTaskImageInfo
.
EstimatExeTime
-
int32
(
since
.
Seconds
()))
...
@@ -258,12 +266,13 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
...
@@ -258,12 +266,13 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
isCanExecute
=
true
isCanExecute
=
true
modelInfo
,
err
:=
db
.
GetModel
(
t
.
lastExecTaskImageName
)
modelInfo
,
err
:=
db
.
GetModel
(
t
.
lastExecTaskImageName
)
if
err
!=
nil
{
if
err
!=
nil
{
return
false
,
0
,
0
,
0
return
false
,
0
,
0
,
0
,
""
}
}
if
modelInfo
!=
nil
{
if
modelInfo
!=
nil
{
bootUpTime
=
modelInfo
.
StartUpTime
bootUpTime
=
modelInfo
.
StartUpTime
executeTime
=
int64
(
modelInfo
.
EstimatExeTime
)
executeTime
=
int64
(
modelInfo
.
EstimatExeTime
)
}
}
imageName
=
modelInfo
.
ImageName
return
return
}
}
...
...
operate/docker.go
View file @
ad9d0884
...
@@ -8,7 +8,6 @@ import (
...
@@ -8,7 +8,6 @@ import (
"example.com/m/db"
"example.com/m/db"
"example.com/m/log"
"example.com/m/log"
"example.com/m/models"
"example.com/m/models"
"example.com/m/nm"
"fmt"
"fmt"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/container"
...
@@ -42,7 +41,7 @@ func NewDockerOp() *DockerOp {
...
@@ -42,7 +41,7 @@ func NewDockerOp() *DockerOp {
dockerClient
,
err
:=
GetDockerClient
()
dockerClient
,
err
:=
GetDockerClient
()
if
err
!=
nil
{
if
err
!=
nil
{
return
&
DockerOp
{
return
&
DockerOp
{
IsHealthy
:
fals
e
,
IsHealthy
:
tru
e
,
Reason
:
fmt
.
Sprintf
(
"The connect docker client failed reason:%s"
,
err
.
Error
()),
Reason
:
fmt
.
Sprintf
(
"The connect docker client failed reason:%s"
,
err
.
Error
()),
}
}
}
}
...
@@ -125,8 +124,8 @@ func (d *DockerOp) ListContainer() []types.Container {
...
@@ -125,8 +124,8 @@ func (d *DockerOp) ListContainer() []types.Container {
return
containers
return
containers
}
}
func
(
d
*
DockerOp
)
CreateAndStartContainer
(
modelInfo
*
models
.
ModelInfo
,
dockerCmd
*
models
.
DockerCmd
)
(
string
,
int32
,
error
)
{
func
(
d
*
DockerOp
)
CreateAndStartContainer
(
info
*
nodemanagerV2
.
HardwareInfo
,
modelInfo
*
models
.
ModelInfo
,
dockerCmd
*
models
.
DockerCmd
)
(
string
,
int32
,
error
)
{
gpuSeq
:=
d
.
checkGpuUsage
(
modelInfo
,
dockerCmd
)
gpuSeq
:=
d
.
checkGpuUsage
(
info
,
modelInfo
,
dockerCmd
)
containerId
,
err
:=
d
.
CreateContainer
(
modelInfo
.
ImageName
,
dockerCmd
)
containerId
,
err
:=
d
.
CreateContainer
(
modelInfo
.
ImageName
,
dockerCmd
)
if
err
!=
nil
{
if
err
!=
nil
{
log
.
Error
(
"Error creating container image failed: "
,
err
)
log
.
Error
(
"Error creating container image failed: "
,
err
)
...
@@ -391,11 +390,7 @@ func (d *DockerOp) getContainerInfo(id string) (types.Container, error) {
...
@@ -391,11 +390,7 @@ func (d *DockerOp) getContainerInfo(id string) (types.Container, error) {
return
types
.
Container
{},
fmt
.
Errorf
(
"get container info failed"
)
return
types
.
Container
{},
fmt
.
Errorf
(
"get container info failed"
)
}
}
func
(
d
*
DockerOp
)
checkGpuUsage
(
modelInfo
*
models
.
ModelInfo
,
dockerCmd
*
models
.
DockerCmd
)
int32
{
func
(
d
*
DockerOp
)
checkGpuUsage
(
info
*
nodemanagerV2
.
HardwareInfo
,
modelInfo
*
models
.
ModelInfo
,
dockerCmd
*
models
.
DockerCmd
)
int32
{
info
:=
nm
.
GetHardwareInfo
()
if
info
==
nil
{
return
0
}
envMap
:=
make
(
map
[
string
]
string
,
0
)
envMap
:=
make
(
map
[
string
]
string
,
0
)
gpu
:=
info
.
GPU
gpu
:=
info
.
GPU
isMatch
:=
false
isMatch
:=
false
...
@@ -421,7 +416,6 @@ func (d *DockerOp) checkGpuUsage(modelInfo *models.ModelInfo, dockerCmd *models.
...
@@ -421,7 +416,6 @@ func (d *DockerOp) checkGpuUsage(modelInfo *models.ModelInfo, dockerCmd *models.
}
}
}
}
if
isMatch
{
if
isMatch
{
nm
.
ModelRunningBeforeMem
[
modelInfo
.
ImageName
]
=
dockerCmd
.
RunningBeforeMem
gpuSeq
,
_
:=
strconv
.
ParseInt
(
dockerCmd
.
EnvMap
[
models
.
CudaEnv
],
10
,
32
)
gpuSeq
,
_
:=
strconv
.
ParseInt
(
dockerCmd
.
EnvMap
[
models
.
CudaEnv
],
10
,
32
)
return
int32
(
gpuSeq
)
return
int32
(
gpuSeq
)
}
}
...
...
utils/util.go
View file @
ad9d0884
...
@@ -310,7 +310,7 @@ func readAndDecryptFile(key []byte, filename string) ([]byte, error) {
...
@@ -310,7 +310,7 @@ func readAndDecryptFile(key []byte, filename string) ([]byte, error) {
return
decryptedData
,
nil
return
decryptedData
,
nil
}
}
func
GetHardwareInfo
(
url
string
)
*
models
.
HardwareInfoRep
{
func
Get
Api
HardwareInfo
(
url
string
)
*
models
.
HardwareInfoRep
{
resp
,
err
:=
http
.
Get
(
url
)
resp
,
err
:=
http
.
Get
(
url
)
if
err
!=
nil
{
if
err
!=
nil
{
log
.
Error
(
"Error creating request"
)
log
.
Error
(
"Error creating request"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment