Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
power-node
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Odysseus
power-node
Commits
e42dfefb
Commit
e42dfefb
authored
May 22, 2024
by
duanjinfei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update model info
parent
e7ea8956
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
82 additions
and
40 deletions
+82
-40
rootcmd.go
cmd/rootcmd.go
+1
-1
model_handler.go
largeModel/model_handler.go
+21
-20
const.go
models/const.go
+3
-0
node_manager.go
models/node_manager.go
+5
-6
msg_handler.go
nm/msg_handler.go
+41
-13
task_handler.go
nm/task_handler.go
+11
-0
No files found.
cmd/rootcmd.go
View file @
e42dfefb
...
...
@@ -20,7 +20,7 @@ var (
)
func
init
()
{
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
rewardAddr
,
"reward"
,
"r"
,
"0x
0Fb196385c8826e3806ebA2cA2cb78B26E08fEEe
"
,
"please enter a reward address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
rewardAddr
,
"reward"
,
"r"
,
"0x
2E60C056fBAf4bf27945516c9364B037D5D31CC2
"
,
"please enter a reward address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
externalIp
,
"externalIp"
,
"e"
,
"192.168.1.120"
,
"please enter server external ip address"
)
RootCmd
.
PersistentFlags
()
.
StringVarP
(
&
opSys
,
"opSys"
,
"s"
,
""
,
"please enter you op sys name : win、linux"
)
RootCmd
.
PersistentFlags
()
.
BoolVarP
(
&
debug
,
"debug"
,
"d"
,
false
,
"set log level debug"
)
...
...
largeModel/model_handler.go
View file @
e42dfefb
...
...
@@ -145,6 +145,7 @@ func (m *ModelHandler) MonitorModelStatus() {
for
_
,
key
:=
range
keys
{
model
,
_
:=
db
.
GetModel
(
key
)
if
model
!=
nil
&&
!
model
.
IsInstalled
{
model
.
ImageId
=
image
.
ID
model
.
SetupTime
=
time
.
Now
()
.
Unix
()
model
.
IsInstalled
=
true
err
:=
db
.
PutModel
(
key
,
model
)
...
...
@@ -155,26 +156,26 @@ func (m *ModelHandler) MonitorModelStatus() {
}
}
}
//
containerList := m.dockerOp.ListContainer()
//
if containerList != nil && len(containerList) > 0 {
//
for _, container := range containerList {
//
key := container.Image
//
model, err := db.GetModel(key)
//
if err != nil || model == nil {
//
continue
//
}
//
if container.State == "running" && !model.IsRunning {
//
model.ContainerId = container.ID
//
model.LastRunTime = time.Now().Unix()
//
model.IsRunning = true
//
err = db.PutModel(key, model)
//
if err != nil {
//
continue
//
}
//
}
//
//
}
//}
containerList
:=
m
.
dockerOp
.
ListContainer
()
if
containerList
!=
nil
&&
len
(
containerList
)
>
0
{
for
_
,
container
:=
range
containerList
{
key
:=
container
.
Image
model
,
err
:=
db
.
GetModel
(
key
)
if
err
!=
nil
||
model
==
nil
{
continue
}
if
container
.
State
==
"running"
&&
!
model
.
IsRunning
{
model
.
ContainerId
=
container
.
ID
model
.
LastRunTime
=
time
.
Now
()
.
Unix
()
model
.
IsRunning
=
true
err
=
db
.
PutModel
(
key
,
model
)
if
err
!=
nil
{
continue
}
}
}
}
ticker
=
time
.
NewTicker
(
time
.
Minute
*
10
)
}
}
}
...
...
models/const.go
View file @
e42dfefb
...
...
@@ -27,4 +27,7 @@ const (
BasicMode
=
1
HealthMode
=
2
SaveMode
=
3
OneHour
=
1
OneMinutes
=
1
TwoMinutes
=
2
)
models/node_manager.go
View file @
e42dfefb
...
...
@@ -18,10 +18,11 @@ type TaskCmd struct {
}
// DockerCmd holds the container settings carried with a task command.
//
// Only ContainerPort has a JSON tag; the remaining fields have none.
// Each field is declared exactly once: the earlier text listed
// ContainerPort, EnvMap, HostIp and HostPort twice, which Go rejects as
// duplicate field names.
type DockerCmd struct {
	// ContainerPort is read from / written to the "container_port" JSON key.
	ContainerPort int64 `json:"container_port"`
	// RunningBeforeMem is an int64 memory reading.
	// NOTE(review): presumably free GPU memory sampled before the container
	// starts; the unit is not visible here — confirm against the caller.
	RunningBeforeMem int64
	// EnvMap holds string key/value pairs.
	// NOTE(review): presumably the container's environment variables — confirm.
	EnvMap map[string]string
	// HostIp and HostPort are kept as strings.
	HostIp   string
	HostPort string
}
type
TaskReq
struct
{
...
...
@@ -147,7 +148,6 @@ type ModelInfo struct {
EstimatExeTime
int32
`json:"estimat_exe_time"`
StartUpTime
int64
`json:"start_up_time"`
RunningMem
int64
`json:"running_mem"`
OpTime
int64
SetupTime
int64
LastRunTime
int64
ImageId
string
...
...
@@ -155,7 +155,6 @@ type ModelInfo struct {
IsInstalled
bool
IsRunning
bool
GpuSeq
int32
GpuRam
int64
LastWorkTime
int64
TotalRunCount
int32
}
...
...
nm/msg_handler.go
View file @
e42dfefb
...
...
@@ -14,6 +14,12 @@ import (
"time"
)
// modelRunningBeoforeMem maps a model image name to the RunningBeforeMem
// value recorded when a start of that model was matched to a GPU. It is
// later compared with the GPU's free memory to derive model.RunningMem.
//
// The map is allocated at declaration, so it is usable before any init()
// in the package runs and no separate init function is needed.
//
// NOTE(review): the map is written in DistributionMsgWorker and read in
// MonitorImageOp; if those run on different goroutines it needs a mutex —
// confirm.
var modelRunningBeoforeMem = make(map[string]int64)
type
NodeManagerHandler
struct
{
nodeManager
*
models
.
NodeManagerClient
worker
nodemanagerV2
.
NodeManagerService_RegisterWorkerClient
...
...
@@ -174,7 +180,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
gpu
:=
info
.
GPU
isMatch
:=
false
for
_
,
gpuInfo
:=
range
gpu
{
if
gpuInfo
.
MemFree
>
model
.
GpuRa
m
{
if
gpuInfo
.
MemFree
>
model
.
RunningMe
m
{
envMap
[
models
.
CudaEnv
]
=
strconv
.
FormatInt
(
int64
(
gpuInfo
.
Seq
),
10
)
isMatch
=
true
break
...
...
@@ -186,7 +192,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
continue
}
for
_
,
modelInfo
:=
range
runningModel
{
if
modelInfo
.
RunningMem
>
model
.
GpuRa
m
{
if
modelInfo
.
RunningMem
>
model
.
RunningMe
m
{
isMatch
=
true
dockerOp
.
StopContainer
(
model
.
ContainerId
)
envMap
[
models
.
CudaEnv
]
=
strconv
.
FormatInt
(
int64
(
modelInfo
.
GpuSeq
),
10
)
...
...
@@ -195,6 +201,9 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
}
}
if
isMatch
{
modelRunningBeoforeMem
[
model
.
ImageName
]
=
dockerCmd
.
RunningBeforeMem
gpuSeq
,
_
:=
strconv
.
ParseInt
(
dockerCmd
.
EnvMap
[
models
.
CudaEnv
],
10
,
32
)
model
.
GpuSeq
=
int32
(
gpuSeq
)
_
,
err
:=
dockerOp
.
CreateAndStartContainer
(
model
.
ImageName
,
dockerCmd
)
if
err
!=
nil
{
log
.
WithError
(
err
)
.
Error
(
"Error creating container"
)
...
...
@@ -205,10 +214,15 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
case
nodemanagerV2
.
ModelOperateType_STOP
:
{
if
model
.
ContainerId
!=
""
{
model
.
ContainerId
=
""
dockerOp
.
StopContainer
(
model
.
ContainerId
)
}
}
}
err
=
db
.
PutModel
(
model
.
ImageName
,
model
)
if
err
!=
nil
{
log
.
WithError
(
err
)
.
Error
(
"Db put model failed"
)
}
}
}(
modelOpMsg
,
n
.
taskMsgWorker
.
DockerOp
)
continue
...
...
@@ -240,7 +254,6 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
log
.
WithError
(
err
)
.
Error
(
"Op model - get model error"
)
return
}
model
.
OpTime
=
time
.
Now
()
.
Unix
()
ticker
:=
time
.
NewTicker
(
time
.
Second
*
2
)
defer
ticker
.
Stop
()
isOp
:=
false
...
...
@@ -251,8 +264,8 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
for
{
select
{
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Seconds
()
>
36000
||
isOp
{
break
if
time
.
Since
(
now
)
.
Hours
()
>
models
.
OneHour
||
isOp
{
return
}
imagesMap
,
err
:=
n
.
taskMsgWorker
.
DockerOp
.
PsImageNameMap
()
if
err
!=
nil
{
...
...
@@ -277,8 +290,8 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
for
{
select
{
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Seconds
()
>
36000
||
isOp
{
break
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
OneMinutes
||
isOp
{
return
}
imagesMap
,
err
:=
n
.
taskMsgWorker
.
DockerOp
.
PsImageNameMap
()
if
err
!=
nil
{
...
...
@@ -303,18 +316,33 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
for
{
select
{
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Seconds
()
>
360
||
isOp
{
break
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
TwoMinutes
||
isOp
{
return
}
info
:=
getHardwareInfo
()
memIsChange
:=
false
for
_
,
gpuInfo
:=
range
info
.
GPU
{
if
gpuInfo
.
Seq
==
model
.
GpuSeq
{
if
modelRunningBeoforeMem
[
op
.
ImageName
]
<=
gpuInfo
.
MemFree
{
break
}
model
.
RunningMem
=
modelRunningBeoforeMem
[
op
.
ImageName
]
-
gpuInfo
.
MemFree
memIsChange
=
true
}
}
if
!
memIsChange
{
continue
}
listContainers
:=
n
.
taskMsgWorker
.
DockerOp
.
ListContainer
()
if
listContainers
!=
nil
&&
len
(
listContainers
)
>
0
{
for
_
,
container
:=
range
listContainers
{
if
container
.
Image
==
op
.
ImageName
{
isOp
=
true
model
.
ContainerId
=
""
model
.
StartUpTime
=
int64
(
time
.
Since
(
now
)
.
Seconds
())
model
.
ContainerId
=
container
.
ID
model
.
IsRunning
=
true
model
.
LastRunTime
=
time
.
Now
()
.
Unix
()
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
model
.
GpuSeq
,
model
.
GpuRa
m
,
model
.
LastRunTime
,
model
.
LastWorkTime
,
model
.
TotalRunCount
,
model
.
EstimatExeTime
)
params
:=
utils
.
BuildParams
(
strconv
.
FormatUint
(
model
.
TaskId
,
10
),
model
.
GpuSeq
,
model
.
RunningMe
m
,
model
.
LastRunTime
,
model
.
LastWorkTime
,
model
.
TotalRunCount
,
model
.
EstimatExeTime
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
AddModelRunningResp
,
params
)
break
}
...
...
@@ -329,8 +357,8 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
for
{
select
{
case
<-
ticker
.
C
:
if
time
.
Since
(
now
)
.
Seconds
()
>
360
||
isOp
{
break
if
time
.
Since
(
now
)
.
Minutes
()
>
models
.
OneMinutes
||
isOp
{
return
}
listContainers
:=
n
.
taskMsgWorker
.
DockerOp
.
ListContainer
()
if
listContainers
!=
nil
&&
len
(
listContainers
)
>
0
{
...
...
nm/task_handler.go
View file @
e42dfefb
...
...
@@ -155,6 +155,15 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
return
}
model
,
err
:=
db
.
GetModel
(
taskOp
.
taskCmd
.
ImageName
)
if
err
!=
nil
{
log
.
Errorf
(
"failed to unmarshal task cmd: %s"
,
err
.
Error
())
taskOp
.
taskExecResult
.
TaskExecError
=
fmt
.
Sprintf
(
"%s,%s"
,
"Not found location model info: %s"
,
err
.
Error
())
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
return
}
model
.
LastWorkTime
=
time
.
Now
()
.
Unix
()
model
.
TotalRunCount
++
taskOp
.
taskCmd
.
ImageName
=
fmt
.
Sprintf
(
"%s-%s"
,
taskOp
.
taskCmd
.
ImageName
,
conf
.
GetConfig
()
.
OpSys
)
log
.
Info
(
"received task cmd :"
,
taskOp
.
taskCmd
)
log
.
WithField
(
"t.lastExecTaskImageName"
,
t
.
lastExecTaskImageName
)
.
WithField
(
"newTaskImageName"
,
taskOp
.
taskCmd
.
ImageName
)
.
Info
(
"task image info"
)
...
...
@@ -200,7 +209,9 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
}
else
if
taskMsg
.
TaskKind
==
baseV1
.
TaskKind_StandardTask
{
t
.
IsExecStandardTask
=
false
}
model
.
EstimatExeTime
=
int32
(
endAfterTaskTime
.
Seconds
())
t
.
ExecTaskIdIsFinished
.
Store
(
taskMsg
.
TaskId
,
true
)
_
=
db
.
PutModel
(
taskOp
.
taskCmd
.
ImageName
,
model
)
//log.WithField("result", taskExecResult).Info("lru cache storage task result")
log
.
Info
(
"----------------------Compute task exec done--------------------------------"
)
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment