Odysseus / power-node / Commits

Commit 26212fdb, authored May 28, 2024 by duanjinfei

update msg resp

parent ef936217

Showing 10 changed files with 124 additions and 58 deletions (+124 -58)
controllers/NodeController.go   +5   -1
go.mod                          +2   -0
go.sum                          +6   -0
nm/api.go                       +1   -1
nm/monitor.go                   +18  -0
nm/msg_handler.go               +11  -42
nm/msg_resp.go                  +6   -6
nm/start.go                     +1   -2
nm/task_handler.go              +2   -1
operate/docker.go               +72  -5
controllers/NodeController.go

@@ -132,7 +132,7 @@ func (c *NodeController) UpdateRecvStatus() {
 		c.ResponseInfo(500, "The task current is recv status , don't need setting", "")
 		return
 	}
-	if req.IsRecv && !nm.IsRunning {
+	if req.IsRecv && len(conf.GetConfig().BenefitAddress) >= 1 {
 		go nm.StartMonitor()
 	}
 	if !nm.IsRecvTask && req.IsRecv {
@@ -206,6 +206,10 @@ func (c *NodeController) DelBenefitAddress() {
 		c.ResponseInfo(500, "Don't del current benefit address", "")
 		return
 	}
+	if len(nm.HistoryBenefitAcc) == 1 {
+		c.ResponseInfo(500, "Don't del current benefit address", "")
+		return
+	}
 	isExist := false
 	for _, s := range nm.HistoryBenefitAcc {
 		if strings.ToLower(s.Address) == strings.ToLower(req.Address) {
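The first hunk only launches nm.StartMonitor() once at least one benefit address is configured, and the second hunk refuses to delete the last remaining address. A minimal standalone sketch of that validation order, assuming the first visible guard protects the address currently in use and using plain strings in place of nm.HistoryBenefitAcc:

package main

import (
	"fmt"
	"strings"
)

// delBenefitAddress sketches the checks DelBenefitAddress performs after this
// commit: reject the address currently in use, reject the last remaining
// address, then remove the first case-insensitive match from the history.
func delBenefitAddress(history []string, currentAddr, reqAddr string) ([]string, error) {
	if strings.EqualFold(reqAddr, currentAddr) {
		return history, fmt.Errorf("don't del current benefit address")
	}
	if len(history) == 1 {
		return history, fmt.Errorf("don't del current benefit address")
	}
	for i, addr := range history {
		if strings.EqualFold(addr, reqAddr) {
			return append(history[:i], history[i+1:]...), nil
		}
	}
	return history, fmt.Errorf("benefit address not found")
}

func main() {
	left, err := delBenefitAddress([]string{"0xAAA", "0xBBB"}, "0xAAA", "0xbbb")
	fmt.Println(left, err)
}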
go.mod

@@ -42,6 +42,8 @@ require (
 	github.com/golang/protobuf v1.5.3 // indirect
 	github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb // indirect
 	github.com/google/uuid v1.4.0 // indirect
+	github.com/hashicorp/go-immutable-radix v1.3.1 // indirect
+	github.com/hashicorp/go-memdb v1.3.4 // indirect
 	github.com/hashicorp/golang-lru v0.5.4 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
 	github.com/holiman/uint256 v1.2.4 // indirect
go.sum

@@ -218,6 +218,12 @@ github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4=
 github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
 github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
+github.com/hashicorp/go-immutable-radix v1.3.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
+github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc=
+github.com/hashicorp/go-immutable-radix v1.3.1/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
+github.com/hashicorp/go-memdb v1.3.4 h1:XSL3NR682X/cVk2IeV0d70N4DZ9ljI885xAEU8IoK3c=
+github.com/hashicorp/go-memdb v1.3.4/go.mod h1:uBTr1oQbtuMgd1SSGoR8YV27eT3sBHbYiNm53bMpgSg=
+github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
 github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc=
nm/api.go

@@ -19,7 +19,7 @@ var (
 )

 func init() {
-	IsRecvTask = true
+	IsRecvTask = false
 	HistoryBenefitAcc = make([]*models.BenefitAddressStruct, 0)
 	RunningState = &models.RunningState{
 		RunningTime: time.Now().Unix(),
nm/monitor.go

@@ -65,6 +65,10 @@ func (m *MonitorNm) monitorNmClient() {
 			nodeManagerHandler := NewNodeManagerHandler(nodeManager, worker, msgRespWorker, taskMsgWorker)
 			log.Info("Report model info started")
+			if nodeManager.IsSelected {
+				go m.monitorGpuUsage(msgRespWorker, nodeManager, worker)
+			}
 			go nodeManagerHandler.MonitorStandardTaskWorker()
 			log.Info("Monitor standard task worker started")
@@ -135,3 +139,17 @@ func (m *MonitorNm) monitorNodeManagerSeed() {
 		}
 	}
 }
+
+func (m *MonitorNm) monitorGpuUsage(msgRespWorker *RespMsgWorker, nodeManager *models.NodeManagerClient, worker nodemanagerV2.NodeManagerService_RegisterWorkerClient) {
+	tick := time.NewTicker(time.Millisecond)
+	defer tick.Stop()
+	for {
+		select {
+		case <-tick.C:
+			{
+				msgRespWorker.RegisterMsgResp(nodeManager, worker, GpuUsageResp, nil)
+				tick = time.NewTicker(time.Minute * 10)
+			}
+		}
+	}
+}
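The new monitorGpuUsage sends the first GpuUsageResp almost immediately (a one-millisecond ticker) and then falls back to a ten-minute interval. A minimal standalone sketch of that immediate-then-periodic shape, using Ticker.Reset to re-arm the same ticker instead of allocating a new one each cycle; the report callback stands in for msgRespWorker.RegisterMsgResp:

package main

import (
	"fmt"
	"time"
)

// reportLoop fires report once almost immediately, then every interval.
func reportLoop(interval time.Duration, report func()) {
	tick := time.NewTicker(time.Millisecond) // first tick arrives right away
	defer tick.Stop()
	for range tick.C {
		report()
		tick.Reset(interval) // steady-state period after the first report
	}
}

func main() {
	reportLoop(10*time.Minute, func() { fmt.Println("GPU usage reported") })
}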
nm/msg_handler.go

@@ -14,10 +14,10 @@ import (
 	"time"
 )

-var modelRunningBeoforeMem map[string]int64
+var ModelRunningBeforeMem map[string]int64

 func init() {
-	modelRunningBeoforeMem = make(map[string]int64, 0)
+	ModelRunningBeforeMem = make(map[string]int64, 0)
 }

 type NodeManagerHandler struct {
@@ -108,6 +108,7 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
 					RunningState.CompletedTaskCount++
 					log.Info("Completed task count: ", RunningState.CompletedTaskCount)
 					log.Info("--------------taskMsg--------------:", taskMsg)
+					msgRespWorker.RegisterMsgResp(n.nodeManager, n.worker, GpuUsageResp, ackParams)
 				}(n.msgRespWorker, n.taskMsgWorker, taskMsg)
 				continue
 			}
@@ -170,49 +171,17 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
 				}
 			case nodemanagerV2.ModelOperateType_RUN:
 				{
-					envMap := make(map[string]string, 0)
 					dockerCmd := &models.DockerCmd{
-						EnvMap:   envMap,
 						HostIp:   models.ZeroHost,
 						HostPort: n.taskMsgWorker.getExternalPort(),
 					}
-					info := getHardwareInfo()
-					if info == nil {
+					containerId, gpuSeq, err := dockerOp.CreateAndStartContainer(model, dockerCmd)
+					if err != nil {
+						log.WithError(err).Error("Error creating container")
 						continue
 					}
-					gpu := info.GPU
-					isMatch := false
-					for _, gpuInfo := range gpu {
-						if gpuInfo.MemFree > model.RunningMem {
-							envMap[models.CudaEnv] = strconv.FormatInt(int64(gpuInfo.Seq), 10)
-							isMatch = true
-							break
-						}
-					}
-					if !isMatch {
-						runningModel := db.GetRunningModel()
-						if len(runningModel) == 0 {
-							continue
-						}
-						for _, modelInfo := range runningModel {
-							if modelInfo.RunningMem > model.RunningMem {
-								isMatch = true
-								dockerOp.StopContainer(model.ContainerId)
-								envMap[models.CudaEnv] = strconv.FormatInt(int64(modelInfo.GpuSeq), 10)
-								break
-							}
-						}
-					}
-					if isMatch {
-						modelRunningBeoforeMem[model.ImageName] = dockerCmd.RunningBeforeMem
-						gpuSeq, _ := strconv.ParseInt(dockerCmd.EnvMap[models.CudaEnv], 10, 32)
-						model.GpuSeq = int32(gpuSeq)
-						_, err := dockerOp.CreateAndStartContainer(model.ImageName, dockerCmd)
-						if err != nil {
-							log.WithError(err).Error("Error creating container")
-							continue
-						}
-					}
+					model.ContainerId = containerId
+					model.GpuSeq = gpuSeq
 				}
 			case nodemanagerV2.ModelOperateType_STOP:
 				{
@@ -322,17 +291,17 @@ func (n *NodeManagerHandler) MonitorImageOp(op *nodemanagerV2.ModelOperate) {
 		if time.Since(now).Minutes() > models.TwoMinutes || isOp {
 			return
 		}
-		info := getHardwareInfo()
+		info := GetHardwareInfo()
 		if info == nil {
 			continue
 		}
 		memIsChange := false
 		for _, gpuInfo := range info.GPU {
 			if gpuInfo.Seq == model.GpuSeq {
-				if modelRunningBeoforeMem[op.ImageName] <= gpuInfo.MemFree {
+				if ModelRunningBeforeMem[op.ImageName] <= gpuInfo.MemFree {
 					break
 				}
-				model.RunningMem = modelRunningBeoforeMem[op.ImageName] - gpuInfo.MemFree
+				model.RunningMem = ModelRunningBeforeMem[op.ImageName] - gpuInfo.MemFree
 				memIsChange = true
 			}
 		}
nm/msg_resp.go

@@ -94,7 +94,7 @@ func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	if err != nil {
 		return nil
 	}
-	hardwareInfo := getHardwareInfo()
+	hardwareInfo := GetHardwareInfo()
 	nodeInfoRes := &nodemanagerV2.WorkerMessage{
 		Message: &nodemanagerV2.WorkerMessage_RegisteMessage{
 			RegisteMessage: &nodemanagerV2.RegisteMessage{
@@ -115,7 +115,7 @@ func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 func NodeInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("Node info response received params:", params)
-	hardwareInfo := getHardwareInfo()
+	hardwareInfo := GetHardwareInfo()
 	modelsInfo := params[0].(*largeModel.ModelHandler)
 	readModels, err := modelsInfo.GetRpcModelsResp()
 	if err != nil {
@@ -140,7 +140,7 @@ func NodeInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 func DeviceInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("Device info response received params:", params)
-	hardwareInfo := getHardwareInfo()
+	hardwareInfo := GetHardwareInfo()
 	deviceInfoRes := &nodemanagerV2.WorkerMessage{
 		Message: &nodemanagerV2.WorkerMessage_DeviceInfo{
 			DeviceInfo: &nodemanagerV2.DeviceInfoMessage{
@@ -155,7 +155,7 @@ func DeviceInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 func DeviceUsageResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("DeviceUsageResp params :", params)
-	hardwareInfo := getHardwareInfo()
+	hardwareInfo := GetHardwareInfo()
 	ramUsage := int32((1 - float64(hardwareInfo.RAM.Total)/float64(hardwareInfo.RAM.Free)) * 100)
 	diskUsage := int32((1 - float64(hardwareInfo.DISK.Total)/float64(hardwareInfo.DISK.Free)) * 100)
 	deviceInfoRes := &nodemanagerV2.WorkerMessage{
@@ -176,7 +176,7 @@ func DeviceUsageResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 func GpuUsageResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	log.Info("DeviceUsageResp params :", params)
-	hardwareInfo := getHardwareInfo()
+	hardwareInfo := GetHardwareInfo()
 	gpusUsage := make([]*nodemanagerV2.GPUUsage, 0)
 	for _, gpuInfo := range hardwareInfo.GPU {
 		usage := &nodemanagerV2.GPUUsage{
@@ -355,7 +355,7 @@ func DelModelRunningResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
 	return delModelRunningRes
 }

-func getHardwareInfo() *nodemanagerV2.HardwareInfo {
+func GetHardwareInfo() *nodemanagerV2.HardwareInfo {
 	hardwareInfo := utils.GetHardwareInfo(conf.GetConfig().HardwareUrl)
 	if hardwareInfo == nil {
 		return nil
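Every hunk in this file is the same rename: getHardwareInfo becomes GetHardwareInfo. Capitalizing the first letter exports the function, so other packages can call it (operate/docker.go below uses nm.GetHardwareInfo). A toy illustration of that visibility rule, using an invented package:

// Package hw is an invented example; only capitalized identifiers are
// visible to code that imports it.
package hw

type Info struct{ GpuFreeMem int64 }

// GetInfo is exported: other packages call it as hw.GetInfo().
func GetInfo() *Info { return probe() }

// probe is unexported and can only be called from inside package hw.
func probe() *Info { return &Info{GpuFreeMem: 1 << 30} }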
nm/start.go

@@ -14,7 +14,6 @@ import (
 var (
 	nodeManagerArr        []*NodeManager
 	usedNodeManagerClient []*models.NodeManagerClient
-	IsRunning             bool
 )

 func init() {
@@ -23,7 +22,7 @@ func init() {
 }

 func StartMonitor() {
-	IsRunning = true
+	IsRecvTask = true
 	dockerOp := operate.NewDockerOp()
 	if !dockerOp.IsHealthy {
 		log.Error("Docker operation is not healthy reason:", dockerOp.Reason)
nm/task_handler.go

@@ -182,13 +182,14 @@ func (t *TaskWorker) ComputeTaskHandler(taskMsg *nodemanagerV2.PushTaskMessage)
 	if !running {
 		taskOp.taskCmd.DockerCmd.HostIp = models.ZeroHost
 		taskOp.taskCmd.DockerCmd.HostPort = t.getExternalPort()
-		containerId, err := t.DockerOp.CreateAndStartContainer(taskOp.taskCmd.ImageName, taskOp.taskCmd.DockerCmd)
+		containerId, gpuSeq, err := t.DockerOp.CreateAndStartContainer(model, taskOp.taskCmd.DockerCmd)
 		if err != nil {
 			log.Errorf("Create and start container failed: %s", err.Error())
 			taskOp.taskExecResult.TaskExecError = fmt.Sprintf("%s,%s", "Create and start container failed", err.Error())
 			t.ExecTaskIdIsFinished.Store(taskMsg.TaskId, true)
 			return
 		}
+		model.GpuSeq = gpuSeq
 		log.Infof("Started container with ID %s", containerId)
 	}
 	if err = taskOp.waitContainerRunning(t, taskOp.taskCmd.ImageName, uint16(taskOp.taskCmd.DockerCmd.ContainerPort)); err != nil {
operate/docker.go

@@ -5,8 +5,10 @@ import (
 	"context"
 	"encoding/json"
 	"example.com/m/conf"
+	"example.com/m/db"
 	"example.com/m/log"
 	"example.com/m/models"
+	"example.com/m/nm"
 	"fmt"
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
@@ -123,18 +125,19 @@ func (d *DockerOp) ListContainer() []types.Container {
 	return containers
 }

-func (d *DockerOp) CreateAndStartContainer(imageName string, dockerCmd *models.DockerCmd) (string, error) {
-	containerId, err := d.CreateContainer(imageName, dockerCmd)
+func (d *DockerOp) CreateAndStartContainer(modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) (string, int32, error) {
+	gpuSeq := d.checkGpuUsage(modelInfo, dockerCmd)
+	containerId, err := d.CreateContainer(modelInfo.ImageName, dockerCmd)
 	if err != nil {
 		log.Error("Error creating container image failed: ", err)
-		return "", err
+		return "", gpuSeq, err
 	}
 	// start the container
 	startContainerIsSuccess := d.StartContainer(containerId)
 	if !startContainerIsSuccess {
 		log.Error("start container failed:", startContainerIsSuccess)
-		return "", fmt.Errorf("start container failed")
+		return "", gpuSeq, fmt.Errorf("start container failed")
 	}
 	//ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
@@ -160,7 +163,7 @@ func (d *DockerOp) CreateAndStartContainer(imageName string, dockerCmd *models.D
 	//	return "", err
 	//}
-	return containerId, nil
+	return containerId, gpuSeq, nil
 }

 func (d *DockerOp) CreateContainer(imageName string, dockerCmd *models.DockerCmd) (string, error) {
@@ -217,6 +220,17 @@ func (d *DockerOp) CreateContainer(imageName string, dockerCmd *models.DockerCmd
 func (d *DockerOp) StartContainer(containerID string) bool {
 	ctx, cancel := context.WithTimeout(context.Background(), time.Minute*20)
 	defer cancel()
+	info, err := d.getContainerInfo(containerID)
+	if err == nil {
+		for _, port := range info.Ports {
+			d.UsedExternalPort[int64(port.PublicPort)] = true
+		}
+	}
+	mounts := info.Mounts
+	for _, mount := range mounts {
+		if mount.Destination == "/path/to/gpu/memory" {
+		}
+	}
 	// start the container
 	if err := d.dockerClient.ContainerStart(ctx, containerID, types.ContainerStartOptions{}); err != nil {
 		log.Error("Start container failed:", err)
@@ -230,6 +244,12 @@ func (d *DockerOp) StartContainer(containerID string) bool {
 func (d *DockerOp) StopContainer(containerID string) bool {
 	ctx, cancel := context.WithTimeout(context.Background(), time.Minute*20)
 	defer cancel()
+	info, err := d.getContainerInfo(containerID)
+	if err == nil {
+		for _, port := range info.Ports {
+			d.UsedExternalPort[int64(port.PublicPort)] = false
+		}
+	}
 	// stop the container (if it is running)
 	if err := d.dockerClient.ContainerStop(ctx, containerID, container.StopOptions{}); err != nil {
 		// the container may already be stopped or no longer exist
@@ -360,3 +380,50 @@ func (d *DockerOp) GetDockerInfo() (int64, int64, int64, int64, error) {
 	}
 	return 0, 0, 0, 0, fmt.Errorf("get disk size failed")
 }
+
+func (d *DockerOp) getContainerInfo(id string) (types.Container, error) {
+	listContainer := d.ListContainer()
+	for _, containerInfo := range listContainer {
+		if containerInfo.ID == id {
+			return containerInfo, nil
+		}
+	}
+	return types.Container{}, fmt.Errorf("get container info failed")
+}
+
+func (d *DockerOp) checkGpuUsage(modelInfo *models.ModelInfo, dockerCmd *models.DockerCmd) int32 {
+	info := nm.GetHardwareInfo()
+	if info == nil {
+		return 0
+	}
+	envMap := make(map[string]string, 0)
+	gpu := info.GPU
+	isMatch := false
+	for _, gpuInfo := range gpu {
+		if gpuInfo.MemFree > modelInfo.RunningMem {
+			envMap[models.CudaEnv] = strconv.FormatInt(int64(gpuInfo.Seq), 10)
+			isMatch = true
+			break
+		}
+	}
+	if !isMatch {
+		runningModel := db.GetRunningModel()
+		if len(runningModel) == 0 {
+			return 0
+		}
+		for _, modelInfo := range runningModel {
+			if modelInfo.RunningMem > modelInfo.RunningMem {
+				isMatch = true
+				d.StopContainer(modelInfo.ContainerId)
+				envMap[models.CudaEnv] = strconv.FormatInt(int64(modelInfo.GpuSeq), 10)
+				break
+			}
+		}
+	}
+	if isMatch {
+		nm.ModelRunningBeforeMem[modelInfo.ImageName] = dockerCmd.RunningBeforeMem
+		gpuSeq, _ := strconv.ParseInt(dockerCmd.EnvMap[models.CudaEnv], 10, 32)
+		return int32(gpuSeq)
+	}
+	return 0
+}
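checkGpuUsage centralizes the GPU-selection policy that msg_handler.go previously inlined: prefer a GPU with enough free memory, otherwise evict a running model and reuse its GPU. A condensed standalone restatement of that policy; the types and the stop callback are simplified stand-ins, and the eviction comparison assumes the intent is to weigh each running model's memory against the incoming model's:

package main

import "fmt"

type GPU struct {
	Seq     int32
	MemFree int64
}

type Model struct {
	GpuSeq     int32
	RunningMem int64
}

// pickGpu returns the first GPU with enough free memory; otherwise it evicts a
// running model that uses more memory than the incoming one and reuses its GPU.
func pickGpu(gpus []GPU, running []Model, incoming Model, stop func(Model)) (int32, bool) {
	for _, g := range gpus {
		if g.MemFree > incoming.RunningMem {
			return g.Seq, true
		}
	}
	for _, r := range running {
		if r.RunningMem > incoming.RunningMem {
			stop(r)
			return r.GpuSeq, true
		}
	}
	return 0, false
}

func main() {
	gpus := []GPU{{Seq: 0, MemFree: 1 << 30}}
	running := []Model{{GpuSeq: 0, RunningMem: 4 << 30}}
	seq, ok := pickGpu(gpus, running, Model{RunningMem: 2 << 30}, func(Model) {})
	fmt.Println(seq, ok)
}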