Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
power-node
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Odysseus
power-node
Commits
59f14f68
Commit
59f14f68
authored
May 11, 2024
by
duanjinfei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update msg resp
parent
94e34c67
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
146 additions
and
190 deletions
+146
-190
NodeController.go
controllers/NodeController.go
+6
-2
StateController.go
controllers/StateController.go
+18
-3
db.go
db/db.go
+0
-17
model_handler.go
largeModel/model_handler.go
+23
-20
node_manager.go
models/node_manager.go
+9
-4
req_resp.go
models/req_resp.go
+9
-18
api.go
nm/api.go
+1
-0
monitor.go
nm/monitor.go
+15
-18
msg_handler.go
nm/msg_handler.go
+16
-33
msg_resp.go
nm/msg_resp.go
+28
-63
start.go
nm/start.go
+8
-5
task_handler.go
nm/task_handler.go
+6
-1
docker.go
operate/docker.go
+1
-1
router.go
routers/router.go
+1
-0
task_msg_test.go
test/task_msg_test.go
+5
-5
No files found.
controllers/NodeController.go
View file @
59f14f68
...
...
@@ -7,7 +7,7 @@ import (
"example.com/m/nm"
"example.com/m/operate"
"example.com/m/utils"
node
ManagerV1
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v1
"
node
managerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2
"
"io"
)
...
...
@@ -84,7 +84,7 @@ func (c *NodeController) AddNodeManager() {
c
.
ResponseInfo
(
500
,
"param error"
,
""
)
return
}
nodeManager
:=
&
node
ManagerV1
.
NodeManagerInfo
{
nodeManager
:=
&
node
managerV2
.
NodeManagerInfo
{
Publickey
:
req
.
PublicKey
,
Endpoint
:
req
.
EndPoint
,
}
...
...
@@ -130,3 +130,7 @@ func (c *NodeController) GetRecvStatus() {
func
(
c
*
NodeController
)
GetConfigInfo
()
{
c
.
ResponseInfo
(
200
,
"get config successful"
,
conf
.
GetConfig
())
}
func
(
c
*
NodeController
)
GetBenefit
()
{
c
.
ResponseInfo
(
200
,
"get benefit address successful"
,
conf
.
GetConfig
()
.
BenefitAddress
)
}
controllers/StateController.go
View file @
59f14f68
...
...
@@ -61,10 +61,25 @@ func (c *StateController) GetGpuUsageInfo() {
func
(
c
*
StateController
)
GetOtherHardwareInfo
()
{
info
:=
utils
.
GetHardwareInfo
()
var
diskTotal
,
diskFree
int64
for
_
,
disk
:=
range
info
.
Data
.
Disk
{
for
_
,
point
:=
range
disk
.
MountPoints
{
if
point
==
"/"
{
diskTotal
+=
disk
.
SizeBytes
diskFree
+=
disk
.
FreeBytes
}
}
}
diskUsage
:=
int32
((
1
-
diskFree
/
diskTotal
)
*
100
)
res
:=
&
models
.
OtherHardwareInfoResp
{
CpuTemp
:
info
.
Data
.
Cpus
.
Usage
,
RamUsage
:
info
.
Data
.
Mem
.
Total
,
DiskUsage
:
info
.
Data
.
Disk
[
0
]
.
Total
,
NodeID
:
conf
.
GetConfig
()
.
SignPublicAddress
.
Hex
(),
CpuName
:
info
.
Data
.
Cpus
.
Model
,
CpuUsage
:
info
.
Data
.
Cpus
.
Usage
,
CpuFrequency
:
info
.
Data
.
Cpus
.
Frequency
,
RamSize
:
info
.
Data
.
Mem
.
Total
,
RamUsage
:
info
.
Data
.
Mem
.
MemUtil
,
DiskSize
:
diskTotal
,
DiskUsage
:
diskUsage
,
}
c
.
ResponseInfo
(
200
,
"get hardware info successful"
,
res
)
}
db/db.go
View file @
59f14f68
...
...
@@ -16,23 +16,6 @@ func init() {
if
err
!=
nil
{
log
.
Error
(
"Leveldb open file failed: "
,
err
)
}
// 遍历数据库,删除所有数据
iter
:=
dbInstance
.
NewIterator
(
nil
,
nil
)
for
iter
.
Next
()
{
key
:=
iter
.
Key
()
// 删除 key 对应的数据
if
err
:=
dbInstance
.
Delete
(
key
,
nil
);
err
!=
nil
{
log
.
Error
(
"Leveldb delete failed: "
,
err
)
}
}
iter
.
Release
()
//defer func(dbInstance *leveldb.DB) {
// err := dbInstance.Close()
// if err != nil {
// log.Error("Leveldb close file failed: ", err)
// }
//}(dbInstance)
}
func
Put
(
key
string
,
value
[]
byte
)
error
{
...
...
largeModel/model_handler.go
View file @
59f14f68
...
...
@@ -7,6 +7,7 @@ import (
"example.com/m/models"
"example.com/m/operate"
"fmt"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
"io"
"net/http"
"os"
...
...
@@ -61,11 +62,11 @@ func (m *ModelHandler) MonitorModelInfo() {
continue
}
modelInfosResp
:=
resp
.
Data
imageNameMap
,
err
:=
m
.
dockerOp
.
PsImageNameMap
()
if
err
!=
nil
{
log
.
Error
(
"Docker op ps images failed:"
,
err
)
continue
}
//
imageNameMap, err := m.dockerOp.PsImageNameMap()
//
if err != nil {
//
log.Error("Docker op ps images failed:", err)
//
continue
//
}
reportTaskIds
:=
make
([]
uint64
,
0
)
for
_
,
modelInfo
:=
range
modelInfosResp
{
if
modelInfo
.
ImageName
==
""
{
...
...
@@ -76,19 +77,22 @@ func (m *ModelHandler) MonitorModelInfo() {
if
len
(
split
)
!=
2
{
continue
}
if
!
imageNameMap
[
modelInfo
.
ImageName
]
{
{
//if !imageNameMap[modelInfo.ImageName] {
// todo: 判断机器资源是否够用
isPull
:=
m
.
isResourceEnough
(
modelInfo
)
//
isPull := m.isResourceEnough(modelInfo)
// todo: 如果够用
if
isPull
&&
modelInfo
.
PublishStatus
==
models
.
ModelPublishStatusYes
{
log
.
WithField
(
"model image name"
,
modelInfo
.
ImageName
)
.
Info
(
"pulling image"
)
go
m
.
dockerOp
.
PullImage
(
modelInfo
)
}
}
else
{
log
.
WithField
(
"name"
,
modelInfo
.
ImageName
)
.
Info
(
"The image name is already"
)
m
.
dockerOp
.
BootUpModelId
[
modelInfo
.
ImageName
]
=
modelInfo
.
TaskId
reportTaskIds
=
append
(
reportTaskIds
,
modelInfo
.
TaskId
)
//if isPull && modelInfo.PublishStatus == models.ModelPublishStatusYes {
// log.WithField("model image name", modelInfo.ImageName).Info("pulling image")
// go m.dockerOp.PullImage(modelInfo)
//}
//} else {
//
//}
}
log
.
WithField
(
"name"
,
modelInfo
.
ImageName
)
.
Info
(
"The image name is already"
)
m
.
dockerOp
.
BootUpModelId
[
modelInfo
.
ImageName
]
=
modelInfo
.
TaskId
reportTaskIds
=
append
(
reportTaskIds
,
modelInfo
.
TaskId
)
m
.
dockerOp
.
SignApi
[
modelInfo
.
ImageName
]
=
modelInfo
.
SignUrl
}
m
.
dockerOp
.
ModelsInfo
=
modelInfosResp
...
...
@@ -127,13 +131,12 @@ func (m *ModelHandler) ReadModels() ([]*models.ModelInfo, error) {
}
return
resp
.
Data
,
nil
}
// GetRpcModelsResp is currently a stub: it always returns a nil
// *nodemanagerV2.ModelsInfo and a nil error, so callers cannot rely on the
// error value to tell whether model information is available.
// NOTE(review): presumably this should build the model list reported to the
// node manager — confirm intended behavior before relying on the result.
func (m *ModelHandler) GetRpcModelsResp() (*nodemanagerV2.ModelsInfo, error) {
	return nil, nil
}
// isResourceEnough currently always returns true; modelInfo is not
// inspected. The disk-usage check below is disabled (commented out).
func (m *ModelHandler) isResourceEnough(modelInfo *models.ModelInfo) bool {
	//isDownload := m.checkDiskUsage(modelInfo)
	//isDownload = true
	//if !isDownload {
	//	return isDownload
	//}
	return true
}
...
...
models/node_manager.go
View file @
59f14f68
...
...
@@ -65,13 +65,13 @@ type GpuInfo struct {
// Gpu is the JSON shape of a single GPU entry. Every field is declared
// exactly once and each tag is a single well-formed `json:"..."` key.
// NOTE(review): units of the memory, power and temperature fields are not
// visible here — confirm against the producer of this JSON.
type Gpu struct {
	Seq         int64  `json:"seq"`
	Uuid        string `json:"uuid"`
	Model       string `json:"model"`
	Performance int64  `json:"performance"`
	PowerRating int64  `json:"power_rating"`
	MemTotal    int64  `json:"mem_total"`
	MemFree     int64  `json:"mem_free"`
	Usage       int64  `json:"usage"`
	Temp        int64  `json:"temp"`
	PowerRt     int64  `json:"power_rt"`
}
...
...
@@ -92,8 +92,9 @@ type CoreCpuInfo struct {
}
// Mem is the JSON shape of the memory section of the hardware report.
// Total and Free are declared once each; MemUtil carries the "mem_util"
// value.
// NOTE(review): units of Total/Free and the scale of MemUtil are not
// visible here — confirm against the producer of this JSON.
type Mem struct {
	Total   int64 `json:"total"`
	Free    int64 `json:"free"`
	MemUtil int32 `json:"mem_util"`
}
type
Disk
struct
{
...
...
@@ -141,6 +142,10 @@ type ModelInfo struct {
EstimatExeTime
int64
`json:"estimat_exe_time"`
StartUpTime
int64
`json:"start_up_time"`
RunningMem
int64
`json:"running_mem"`
SetupTime
int64
`json:"setup_time"`
LastRunTime
int64
`json:"last_run_time"`
IsInstalled
bool
`json:"is_installed"`
IsRunning
bool
`json:"is_running"`
}
type
HealthyCheck
struct
{
...
...
models/req_resp.go
View file @
59f14f68
...
...
@@ -21,6 +21,7 @@ type RunningState struct {
RunningTime
int64
`json:"running_time"`
CompletedTaskCount
int
`json:"completed_task_count"`
NmIpAddr
string
`json:"nm_ip_addr"`
NmLocation
string
`json:"nm_location"`
NmDelayTime
int64
`json:"nm_delay_time"`
}
...
...
@@ -29,29 +30,19 @@ type WorkerAccount struct {
ChainID
int64
`json:"chain_id"`
}
// GpuInfoResp is a JSON response record for one GPU, serialized with the
// keys seq, name, total_mem, util_mem and free_mem.
// NOTE(review): memory units are not visible here — confirm with the caller.
type GpuInfoResp struct {
	Seq      int    `json:"seq"`
	Name     string `json:"name"`
	TotalMem int64  `json:"total_mem"`
	UtilMem  int64  `json:"util_mem"`
	FreeMem  int64  `json:"free_mem"`
}
// GpuUsageReq is a JSON request body carrying a single "seq" value.
// NOTE(review): presumably seq selects which GPU to query — confirm with
// the handler that decodes this request.
type GpuUsageReq struct {
	Seq int64 `json:"seq"`
}
// GpuUsageInfoResp is a JSON response record for one GPU's usage,
// serialized with the keys seq, occupy, usage and temp.
// NOTE(review): the scale of occupy/usage and the unit of temp are not
// visible here — confirm with the producer.
type GpuUsageInfoResp struct {
	Seq    int   `json:"seq"`
	Occupy int   `json:"occupy"`
	Usage  int64 `json:"usage"`
	Temp   int   `json:"temp"`
}
// OtherHardwareInfoResp is the JSON response describing the node's
// non-GPU hardware: node identifier, CPU name/usage/frequency, RAM size and
// usage, and disk size and usage. Each JSON key is declared exactly once
// (ram_usage and disk_usage are int32).
// NOTE(review): size units and the scale of the usage fields are not
// visible here — confirm against the controller that fills this struct.
type OtherHardwareInfoResp struct {
	NodeID       string `json:"node_id"`
	CpuName      string `json:"cpu_name"`
	CpuUsage     int32  `json:"cpu_usage"`
	CpuFrequency string `json:"cpu_frequency"`
	RamSize      int64  `json:"ram_size"`
	RamUsage     int32  `json:"ram_usage"`
	DiskSize     int64  `json:"disk_size"`
	DiskUsage    int32  `json:"disk_usage"`
}
type
Resp
struct
{
...
...
nm/api.go
View file @
59f14f68
...
...
@@ -26,6 +26,7 @@ func init() {
CompletedTaskCount
:
0
,
NmIpAddr
:
""
,
NmDelayTime
:
0
,
NmLocation
:
""
,
}
}
...
...
nm/monitor.go
View file @
59f14f68
...
...
@@ -3,10 +3,11 @@ package nm
import
(
"context"
"example.com/m/conf"
"example.com/m/largeModel"
"example.com/m/log"
"example.com/m/models"
"example.com/m/operate"
"example.com/m/
validator
"
"example.com/m/
utils
"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
"google.golang.org/grpc"
"time"
...
...
@@ -17,13 +18,15 @@ type MonitorNm struct {
NodeManagerMsgChan
chan
*
nodemanagerV2
.
ManagerMessage
DockerOp
*
operate
.
DockerOp
IsInit
bool
ModelHandler
*
largeModel
.
ModelHandler
}
func
NewMonitorNm
(
DockerOp
*
operate
.
DockerOp
)
*
MonitorNm
{
func
NewMonitorNm
(
dockerOp
*
operate
.
DockerOp
,
modelHandler
*
largeModel
.
ModelHandler
)
*
MonitorNm
{
return
&
MonitorNm
{
NodeManagerClientChan
:
make
(
chan
*
models
.
NodeManagerClient
,
10
),
NodeManagerMsgChan
:
make
(
chan
*
nodemanagerV2
.
ManagerMessage
,
1000
),
DockerOp
:
DockerOp
,
DockerOp
:
dockerOp
,
ModelHandler
:
modelHandler
,
IsInit
:
false
,
}
}
...
...
@@ -50,7 +53,8 @@ func (m *MonitorNm) monitorNmClient() {
taskMsgWorker
.
DistributionTaskWorker
(
4
)
log
.
Info
(
"Distribution task worker started......................."
)
msgRespWorker
.
RegisterMsgResp
(
nodeManager
,
worker
,
RegisterInfoResp
,
nil
)
registerRespParam
:=
utils
.
BuildParams
(
m
.
ModelHandler
)
msgRespWorker
.
RegisterMsgResp
(
nodeManager
,
worker
,
RegisterInfoResp
,
registerRespParam
)
log
.
Info
(
"------------------------Send register message ended------------------------"
)
msgRespWorker
.
RegisterMsgResp
(
nodeManager
,
worker
,
DeviceInfoResp
,
nil
)
...
...
@@ -65,29 +69,22 @@ func (m *MonitorNm) monitorNmClient() {
nodeManagerHandler
:=
NewNodeManagerHandler
(
nodeManager
,
worker
,
msgRespWorker
,
taskMsgWorker
)
log
.
Info
(
"Report model info started"
)
go
nodeManagerHandler
.
ReportResourceMap
(
m
.
DockerOp
)
log
.
Info
(
"Monitor resource map worker started"
)
go
nodeManagerHandler
.
MonitorStandardTaskWorker
()
log
.
Info
(
"Monitor standard task worker started"
)
proofWorker
:=
validator
.
NewProofWorker
()
// 证明存储
//go proofWorker.ProofStorage()
//log.Info("Proof storage worker started")
// 证明提交
//go proofWorker.CommitWitness()
//log.Info("Proof commit worker started")
// 处理消息
for
i
:=
0
;
i
<
2
;
i
++
{
go
nodeManagerHandler
.
DistributionMsgWorker
(
m
.
NodeManagerMsgChan
,
proofWork
er
)
go
nodeManagerHandler
.
DistributionMsgWorker
(
m
.
NodeManagerMsgChan
,
m
.
ModelHandl
er
)
}
log
.
Info
(
"------------------------Start rev msg worker thread------------------------"
)
for
{
if
!
IsRecvTask
{
log
.
Warn
(
"User set recv task status is false"
)
msgRespWorker
.
RegisterMsgResp
(
nodeManager
,
worker
,
RegisterInfoResp
,
nil
)
nodeManager
.
UpdateStatus
(
false
)
return
}
sub
:=
time
.
Now
()
.
Sub
(
nodeManager
.
GetLastHeartTime
())
.
Seconds
()
log
.
WithField
(
"time(uint seconds)"
,
sub
)
.
Info
(
"Handler nm msg thread monitor heartbeat time"
)
rev
,
err
:=
worker
.
Recv
()
...
...
nm/msg_handler.go
View file @
59f14f68
...
...
@@ -2,11 +2,10 @@ package nm
import
(
"example.com/m/conf"
"example.com/m/largeModel"
"example.com/m/log"
"example.com/m/models"
"example.com/m/operate"
"example.com/m/utils"
"example.com/m/validator"
"fmt"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
"time"
...
...
@@ -28,7 +27,7 @@ func NewNodeManagerHandler(nodeManager *models.NodeManagerClient, worker nodeman
}
}
func
(
n
*
NodeManagerHandler
)
DistributionMsgWorker
(
nodeManagerMsgChan
chan
*
nodemanagerV2
.
ManagerMessage
,
proofWorker
*
validator
.
ProofWork
er
)
{
func
(
n
*
NodeManagerHandler
)
DistributionMsgWorker
(
nodeManagerMsgChan
chan
*
nodemanagerV2
.
ManagerMessage
,
modelsHandler
*
largeModel
.
ModelHandl
er
)
{
for
{
select
{
case
rev
:=
<-
nodeManagerMsgChan
:
...
...
@@ -112,7 +111,8 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
nodeInfoMsg
:=
rev
.
GetNodeInfoRequest
()
if
nodeInfoMsg
!=
nil
{
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
NodeInfoResp
,
nil
)
nodeInfoParam
:=
utils
.
BuildParams
(
modelsHandler
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
NodeInfoResp
,
nodeInfoParam
)
log
.
Info
(
nodeInfoMsg
)
continue
}
...
...
@@ -126,16 +126,25 @@ func (n *NodeManagerHandler) DistributionMsgWorker(nodeManagerMsgChan chan *node
modelListMsg
:=
rev
.
GetModelListRequest
()
if
modelListMsg
!=
nil
{
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
ModelListResp
,
nil
)
modelListParam
:=
utils
.
BuildParams
(
modelsHandler
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
ModelListResp
,
modelListParam
)
log
.
Info
(
modelListMsg
)
continue
}
modelOpMsg
:=
rev
.
GetModelOperateRequest
()
if
modelOpMsg
!=
nil
{
//for _, modelOperate := range modelOpMsg.ModelOperates {
//}
modelOpMsg
.
GetModelOperates
()
continue
}
deviceInfoMsg
:=
rev
.
GetDeviceInfoRequest
()
if
deviceInfoMsg
!=
nil
{
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
DeviceInfoResp
,
nil
)
log
.
Info
(
modelListMsg
)
continue
}
goodByeMsg
:=
rev
.
GetGoodbyeMessage
()
if
goodByeMsg
!=
nil
{
reason
:=
goodByeMsg
.
GetReason
()
...
...
@@ -167,29 +176,3 @@ func (n *NodeManagerHandler) MonitorStandardTaskWorker() {
}
}
}
func
(
n
*
NodeManagerHandler
)
ReportResourceMap
(
dockerOp
*
operate
.
DockerOp
)
{
ticker
:=
time
.
NewTicker
(
time
.
Second
*
1
)
for
{
select
{
case
<-
ticker
.
C
:
if
len
(
dockerOp
.
ReportModelIds
)
>
0
{
bootUpModelIds
:=
make
([]
uint64
,
0
)
containers
:=
dockerOp
.
ListContainer
()
if
containers
!=
nil
&&
len
(
containers
)
>
0
{
for
_
,
container
:=
range
containers
{
if
container
.
State
==
"running"
{
taskId
:=
dockerOp
.
BootUpModelId
[
container
.
Image
]
if
taskId
!=
0
{
bootUpModelIds
=
append
(
bootUpModelIds
,
taskId
)
}
}
}
}
params
:=
utils
.
BuildParams
(
dockerOp
.
ReportModelIds
,
bootUpModelIds
)
n
.
msgRespWorker
.
RegisterMsgResp
(
n
.
nodeManager
,
n
.
worker
,
SubmitResourceMapRes
,
params
)
ticker
=
time
.
NewTicker
(
time
.
Minute
*
10
)
}
}
}
}
nm/msg_resp.go
View file @
59f14f68
...
...
@@ -3,10 +3,10 @@ package nm
import
(
"bytes"
"example.com/m/conf"
"example.com/m/largeModel"
"example.com/m/log"
"example.com/m/models"
"example.com/m/utils"
"github.com/docker/docker/libnetwork/bitmap"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/crypto"
nodemanagerV2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
...
...
@@ -78,45 +78,6 @@ func HeartbeatResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
return
heartRes
}
// SubmitResourceMapRes builds the worker message for a resource-map report.
//
// params[0] and params[1] are type-asserted to []uint64 without a check, so
// calling it with fewer than two params, or with other types, panics.
// Each slice is loaded into a bitmap created with bitmap.New(1000000000) and
// then marshalled to binary. It returns nil if setting a bit or marshalling
// fails. On success it returns an empty WorkerMessage: the marshalled
// "exist" bitmap is only logged and the "boot up" bitmap bytes are discarded.
func SubmitResourceMapRes(params ...interface{}) *nodemanagerV2.WorkerMessage {
	log.Info("Submit resource map response received params: ", params)
	// First parameter: indexes to mark in the "exist" bitmap.
	existModelIdIndexes := params[0].([]uint64)
	existMap := bitmap.New(1000000000)
	for i := 0; i < len(existModelIdIndexes); i++ {
		modelIdIndex := existModelIdIndexes[i]
		err := existMap.Set(modelIdIndex)
		if err != nil {
			log.WithField("model id index", modelIdIndex).WithField("error", err).Error("Error setting task id index")
			return nil
		}
	}
	existImage, err := existMap.MarshalBinary()
	if err != nil {
		log.Error("bitmap marshal binary failed with error: ", err)
		return nil
	}
	// Second parameter: indexes to mark in the "boot up" bitmap.
	bootUpModelIdIndexes := params[1].([]uint64)
	bootUpMap := bitmap.New(1000000000)
	for i := 0; i < len(bootUpModelIdIndexes); i++ {
		modelIdIndex := bootUpModelIdIndexes[i]
		err := bootUpMap.Set(modelIdIndex)
		if err != nil {
			log.WithField("modelId index", modelIdIndex).WithField("error", err).Error("Error setting task id index")
			return nil
		}
	}
	// Only the error is checked; the marshalled bytes are not used.
	_, err = bootUpMap.MarshalBinary()
	if err != nil {
		log.Error("bitmap marshal binary failed with error: ", err)
		return nil
	}
	log.WithField("", existImage).Info("Bit map binary byte")
	// The returned message carries no payload.
	heartRes := &nodemanagerV2.WorkerMessage{}
	log.Info("---------------------------------------Send resource map msg ------------------------------------")
	return heartRes
}
func
RegisterInfoResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Register info response received params:"
,
params
)
nowTimeStamp
:=
time
.
Now
()
.
Unix
()
...
...
@@ -128,7 +89,11 @@ func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
log
.
WithField
(
"hash"
,
signHash
.
String
())
.
Info
(
"register message sign result"
)
sign
,
_
:=
crypto
.
Sign
(
signHash
.
Bytes
(),
conf
.
GetConfig
()
.
SignPrivateKey
)
log
.
Info
(
"register message sign:"
,
common
.
Bytes2Hex
(
sign
))
modelsInfo
:=
getModelsInfo
()
modelsInfo
:=
params
[
0
]
.
(
*
largeModel
.
ModelHandler
)
readModels
,
err
:=
modelsInfo
.
GetRpcModelsResp
()
if
err
!=
nil
{
return
nil
}
hardwareInfo
:=
getHardwareInfo
()
nodeInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_RegisteMessage
{
...
...
@@ -138,7 +103,7 @@ func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
BenefitAddress
:
conf
.
GetConfig
()
.
BenefitAddress
,
},
Hardware
:
hardwareInfo
,
Models
:
modelsInfo
,
Models
:
readModels
,
Timestamp
:
nowTimeStamp
,
DeviceSignature
:
sign
,
},
...
...
@@ -151,7 +116,11 @@ func RegisterInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
func
NodeInfoResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"Node info response received params:"
,
params
)
hardwareInfo
:=
getHardwareInfo
()
modelsInfo
:=
getModelsInfo
()
modelsInfo
:=
params
[
0
]
.
(
*
largeModel
.
ModelHandler
)
readModels
,
err
:=
modelsInfo
.
GetRpcModelsResp
()
if
err
!=
nil
{
return
nil
}
nodeInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_NodeInfo
{
NodeInfo
:
&
nodemanagerV2
.
NodeInfoResponse
{
...
...
@@ -160,7 +129,7 @@ func NodeInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
BenefitAddress
:
conf
.
GetConfig
()
.
BenefitAddress
,
},
Hardware
:
hardwareInfo
,
Models
:
modelsInfo
,
Models
:
readModels
,
},
},
}
...
...
@@ -185,17 +154,17 @@ func DeviceInfoResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
func
DeviceUsageResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"DeviceUsageResp params :"
,
params
)
i
nfo
:=
getHardwareInfo
()
ramUsage
:=
int32
((
1
-
info
.
RAM
.
Total
/
i
nfo
.
RAM
.
Free
)
*
100
)
diskUsage
:=
int32
((
1
-
info
.
DISK
.
Total
/
i
nfo
.
DISK
.
Free
)
*
100
)
hardwareI
nfo
:=
getHardwareInfo
()
ramUsage
:=
int32
((
1
-
hardwareInfo
.
RAM
.
Total
/
hardwareI
nfo
.
RAM
.
Free
)
*
100
)
diskUsage
:=
int32
((
1
-
hardwareInfo
.
DISK
.
Total
/
hardwareI
nfo
.
DISK
.
Free
)
*
100
)
deviceInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_DeviceUsage
{
DeviceUsage
:
&
nodemanagerV2
.
DeviceUsageResponse
{
Usage
:
&
nodemanagerV2
.
HardwareUsage
{
CpuUsage
:
i
nfo
.
CPU
.
Usage
,
CpuUsage
:
hardwareI
nfo
.
CPU
.
Usage
,
RamUsage
:
ramUsage
,
DiskUsage
:
diskUsage
,
NetBandwidth
:
i
nfo
.
NET
.
Bandwidth
,
NetBandwidth
:
hardwareI
nfo
.
NET
.
Bandwidth
,
},
},
},
...
...
@@ -206,9 +175,9 @@ func DeviceUsageResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
func
GpuUsageResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"DeviceUsageResp params :"
,
params
)
i
nfo
:=
getHardwareInfo
()
hardwareI
nfo
:=
getHardwareInfo
()
gpusUsage
:=
make
([]
*
nodemanagerV2
.
GPUUsage
,
0
)
for
_
,
gpuInfo
:=
range
i
nfo
.
GPU
{
for
_
,
gpuInfo
:=
range
hardwareI
nfo
.
GPU
{
usage
:=
&
nodemanagerV2
.
GPUUsage
{
Seq
:
gpuInfo
.
Seq
,
MemFree
:
gpuInfo
.
MemFree
,
...
...
@@ -231,10 +200,14 @@ func GpuUsageResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
func
ModelListResp
(
params
...
interface
{})
*
nodemanagerV2
.
WorkerMessage
{
log
.
Info
(
"DeviceUsageResp params :"
,
params
)
info
:=
getModelsInfo
()
modelsInfo
:=
params
[
0
]
.
(
*
largeModel
.
ModelHandler
)
readModels
,
err
:=
modelsInfo
.
GetRpcModelsResp
()
if
err
!=
nil
{
return
nil
}
modelListInfoRes
:=
&
nodemanagerV2
.
WorkerMessage
{
Message
:
&
nodemanagerV2
.
WorkerMessage_ModelsInfo
{
ModelsInfo
:
info
,
ModelsInfo
:
readModels
,
},
}
log
.
Info
(
"---------------------------------------Send model list msg ------------------------------------"
)
...
...
@@ -319,16 +292,6 @@ func GoodbyeResp(params ...interface{}) *nodemanagerV2.WorkerMessage {
return
goodbyeMsgRes
}
func
getModelsInfo
()
*
nodemanagerV2
.
ModelsInfo
{
installModels
:=
make
([]
*
nodemanagerV2
.
InstalledModel
,
0
)
runningModels
:=
make
([]
*
nodemanagerV2
.
RunningModel
,
0
)
res
:=
&
nodemanagerV2
.
ModelsInfo
{
InstalledModels
:
installModels
,
RunningModels
:
runningModels
,
}
return
res
}
func
getHardwareInfo
()
*
nodemanagerV2
.
HardwareInfo
{
hardwareInfo
:=
utils
.
GetHardwareInfo
()
gpusInfo
:=
make
([]
*
nodemanagerV2
.
GPUInfo
,
0
)
...
...
@@ -341,6 +304,8 @@ func getHardwareInfo() *nodemanagerV2.HardwareInfo {
}
}
}
diskTotal
=
diskTotal
*
conf
.
GetConfig
()
.
DiskUsage
diskFree
=
diskFree
*
conf
.
GetConfig
()
.
DiskUsage
var
macAddr
string
var
bandWidth
int32
for
_
,
net
:=
range
hardwareInfo
.
Data
.
Networks
{
...
...
nm/start.go
View file @
59f14f68
...
...
@@ -30,7 +30,7 @@ func StartMonitor() {
modelHandler
:=
largeModel
.
NewModelHandler
(
dockerOp
)
monitorNm
:=
NewMonitorNm
(
dockerOp
)
monitorNm
:=
NewMonitorNm
(
dockerOp
,
modelHandler
)
go
monitorNm
.
monitorNodeManagerSeed
()
...
...
@@ -75,14 +75,17 @@ func StartMonitor() {
for
{
select
{
case
<-
ticker
.
C
:
if
!
IsRecvTask
{
log
.
Warn
(
"Stop receive task........."
)
continue
}
log
.
Info
(
"Monitoring node manager client thread start......"
)
for
_
,
client
:=
range
usedNodeManagerClient
{
if
!
IsRecvTask
&&
!
client
.
Status
{
client
.
Status
=
false
}
log
.
WithField
(
"Endpoint"
,
client
.
Endpoint
)
.
WithField
(
"LastHeartTime"
,
client
.
LastHeartTime
)
.
WithField
(
"Is Del"
,
client
.
IsDel
)
.
WithField
(
"Status"
,
client
.
Status
)
.
Info
(
"Monitoring node manager client thread"
)
}
if
!
IsRecvTask
{
log
.
Warn
(
"Stop receive task........."
)
continue
}
for
i
,
managerClient
:=
range
usedNodeManagerClient
{
if
managerClient
.
GetStatus
()
&&
!
managerClient
.
IsDel
{
sub
:=
time
.
Now
()
.
Sub
(
managerClient
.
GetLastHeartTime
())
.
Seconds
()
...
...
nm/task_handler.go
View file @
59f14f68
...
...
@@ -231,6 +231,10 @@ func (t *TaskWorker) GetAckResp(taskMsg *nodemanagerV2.PushTaskMessage) (isCanEx
if
t
.
foundTaskImage
(
taskCmd
)
==
""
{
log
.
WithField
(
"imageName"
,
taskCmd
.
ImageName
)
.
Error
(
"The image is not found"
)
return
}
running
,
_
,
_
:=
t
.
foundImageIsRunning
(
taskCmd
.
ImageName
)
if
!
running
{
}
log
.
Info
(
"found task image finished"
)
isCanExecute
=
true
...
...
@@ -249,13 +253,14 @@ func (t *TaskWorker) foundTaskImage(taskCmd *models.TaskCmd) (imageId string) {
imageId
=
""
return
}
foundImageName
:=
fmt
.
Sprintf
(
"%s-%s"
,
taskCmd
.
ImageName
,
conf
.
GetConfig
()
.
OpSys
)
isFound
:=
false
for
_
,
image
:=
range
images
{
if
isFound
{
break
}
for
_
,
tag
:=
range
image
.
RepoTags
{
if
tag
==
taskCmd
.
ImageName
{
if
tag
==
found
ImageName
{
imageId
=
image
.
ID
isFound
=
true
log
.
Info
(
"The image found success:"
,
image
.
ID
)
...
...
operate/docker.go
View file @
59f14f68
...
...
@@ -53,7 +53,7 @@ func NewDockerOp() *DockerOp {
Reason
:
""
,
dockerClient
:
dockerClient
,
SignApi
:
make
(
map
[
string
]
string
,
0
),
ModelsInfo
:
make
([]
*
models
.
ModelInfo
,
1000
),
ModelsInfo
:
make
([]
*
models
.
ModelInfo
,
1000
00
),
UsedExternalPort
:
make
(
map
[
int64
]
bool
,
0
),
ReportModelIds
:
make
([]
uint64
,
0
),
BootUpModelId
:
make
(
map
[
string
]
uint64
,
0
),
...
...
routers/router.go
View file @
59f14f68
...
...
@@ -13,6 +13,7 @@ func init() {
beego
.
Router
(
"/api/v1/power/update/recv/status"
,
&
controllers
.
NodeController
{},
"post:UpdateRecvStatus"
)
beego
.
Router
(
"/api/v1/power/get/recv/status"
,
&
controllers
.
NodeController
{},
"get:GetRecvStatus"
)
beego
.
Router
(
"/api/v1/power/get/conf"
,
&
controllers
.
NodeController
{},
"get:GetConfigInfo"
)
beego
.
Router
(
"/api/v1/power/get/current/benefit"
,
&
controllers
.
NodeController
{},
"get:GetBenefit"
)
beego
.
Router
(
"/api/v1/power/get/running/state"
,
&
controllers
.
StateController
{},
"get:GetRunningState"
)
beego
.
Router
(
"/api/v1/power/get/worker/info"
,
&
controllers
.
StateController
{},
"get:GetWorkerInfo"
)
beego
.
Router
(
"/api/v1/power/list/gpu/info"
,
&
controllers
.
StateController
{},
"get:GetListGpuInfo"
)
...
...
test/task_msg_test.go
View file @
59f14f68
...
...
@@ -7,7 +7,7 @@ import (
"example.com/m/operate"
"fmt"
"github.com/golang/groupcache/lru"
nodeManagerV
1
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v1
"
nodeManagerV
2
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2
"
"net/http"
"sync"
"testing"
...
...
@@ -49,7 +49,7 @@ func TestTaskHandler_computeTaskHandler(t1 *testing.T) {
lruCache
*
lru
.
Cache
DockerOp
*
operate
.
DockerOp
CmdOp
*
operate
.
Command
TaskMsg
chan
*
nodeManagerV
1
.
PushTaskMessage
TaskMsg
chan
*
nodeManagerV
2
.
PushTaskMessage
TaskRespHeader
map
[
string
][]
byte
TaskRespBody
map
[
string
][]
byte
TaskIsSuccess
map
[
string
]
bool
...
...
@@ -57,7 +57,7 @@ func TestTaskHandler_computeTaskHandler(t1 *testing.T) {
}
type
args
struct
{
taskMsg
*
nodeManagerV
1
.
PushTaskMessage
taskMsg
*
nodeManagerV
2
.
PushTaskMessage
}
m
:=
&
models
.
TaskCmd
{
ImageName
:
"llm-server:latest"
,
...
...
@@ -82,7 +82,7 @@ func TestTaskHandler_computeTaskHandler(t1 *testing.T) {
return
}
n
:=
args
{
taskMsg
:
&
nodeManagerV
1
.
PushTaskMessage
{
taskMsg
:
&
nodeManagerV
2
.
PushTaskMessage
{
Workload
:
111
,
TaskCmd
:
string
(
marshal
),
TaskParam
:
taskParamBytes
,
...
...
@@ -100,7 +100,7 @@ func TestTaskHandler_computeTaskHandler(t1 *testing.T) {
wg
:
&
sync
.
WaitGroup
{},
lruCache
:
lru
.
New
(
100
),
DockerOp
:
operate
.
NewDockerOp
(),
TaskMsg
:
make
(
chan
*
nodeManagerV
1
.
PushTaskMessage
,
0
),
TaskMsg
:
make
(
chan
*
nodeManagerV
2
.
PushTaskMessage
,
0
),
TaskRespHeader
:
make
(
map
[
string
][]
byte
,
0
),
TaskRespBody
:
make
(
map
[
string
][]
byte
,
0
),
TaskIsSuccess
:
make
(
map
[
string
]
bool
,
0
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment