Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
N
nodemanager
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Odysseus
nodemanager
Commits
61d3804f
Commit
61d3804f
authored
Jul 11, 2024
by
vicotor
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update distribute model to install
parent
f477de55
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
169 additions
and
55 deletions
+169
-55
distribute.go
distribute/distribute.go
+2
-2
interface.go
distribute/interface.go
+3
-0
worker.go
distribute/worker.go
+77
-38
worker.go
server/worker.go
+78
-14
workerManager.go
server/workerManager.go
+9
-1
No files found.
distribute/distribute.go
View file @
61d3804f
...
...
@@ -27,11 +27,11 @@ func StartDistributor(manager WorkerManager) *Distributor {
return
dis
}
func
(
d
*
Distributor
)
AddWorker
(
id
int64
,
info
*
omanager
.
NodeInfoResponse
)
{
func
(
d
*
Distributor
)
AddWorker
(
id
int64
,
addr
string
,
info
*
omanager
.
NodeInfoResponse
)
{
if
_
,
exist
:=
d
.
workers
.
Load
(
id
);
exist
{
return
}
worker
:=
NewImageWorker
(
d
.
modelLib
,
d
.
manager
)
worker
:=
NewImageWorker
(
addr
,
d
.
modelLib
,
d
.
manager
)
d
.
workers
.
Store
(
info
.
Info
.
MinerPubkey
,
worker
)
go
worker
.
DistributeImages
()
}
...
...
distribute/interface.go
View file @
61d3804f
package
distribute
import
omanager
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
// WorkerManager is the set of manager operations the distribute package
// relies on.
type WorkerManager interface {
	// WorkerCount returns a worker count.
	// NOTE(review): presumably the number of workers currently known to the
	// manager — confirm against the implementation.
	WorkerCount() int
	// ModelOperate takes the address of a worker and a list of model
	// operations, and reports failure through the returned error.
	// NOTE(review): presumably the operations are delivered to the worker at
	// addr — confirm against the implementation.
	ModelOperate(addr string, operate []*omanager.ModelOperate) error
}
type
ModelLibrary
interface
{
...
...
distribute/worker.go
View file @
61d3804f
package
distribute
import
(
"encoding/json"
omanager
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
"golang.org/x/crypto/sha3"
"sort"
...
...
@@ -10,6 +11,7 @@ import (
)
type
imageWorker
struct
{
addr
string
modelLibrary
ModelLibrary
manager
WorkerManager
mux
sync
.
Mutex
...
...
@@ -17,8 +19,9 @@ type imageWorker struct {
quit
chan
struct
{}
}
func
NewImageWorker
(
modellib
ModelLibrary
,
manager
WorkerManager
)
*
imageWorker
{
func
NewImageWorker
(
addr
string
,
modellib
ModelLibrary
,
manager
WorkerManager
)
*
imageWorker
{
return
&
imageWorker
{
addr
:
addr
,
modelLibrary
:
modellib
,
manager
:
manager
,
quit
:
make
(
chan
struct
{}),
...
...
@@ -175,49 +178,85 @@ const (
HashingMode
// 从热度由高到低,选择相匹配的模型进行安装
)
func
(
w
*
imageWorker
)
distribute
()
{
models
:=
w
.
modelLibrary
.
AllModel
()
sort
.
Sort
(
models
)
func
(
w
*
imageWorker
)
getOp
(
model
ModelDetailInfo
,
opType
omanager
.
ModelOperateType
)
*
omanager
.
ModelOperate
{
cmdstr
,
_
:=
json
.
MarshalIndent
(
model
.
Cmd
,
""
,
" "
)
op
:=
new
(
omanager
.
ModelOperate
)
op
.
Operate
=
opType
op
.
ModelId
=
strconv
.
FormatInt
(
int64
(
model
.
TaskID
),
10
)
op
.
ImageName
=
model
.
ImageName
op
.
Username
=
""
op
.
Password
=
""
op
.
Cmd
=
string
(
cmdstr
)
op
.
GpuSeq
=
0
info
:=
w
.
getInfo
()
return
op
}
func
(
w
*
imageWorker
)
distributeMatch
(
mode
DistributeMode
,
model
ModelDetailInfo
,
hash
[]
byte
)
bool
{
if
mode
==
GreedyMode
{
return
true
}
if
mode
==
HashingMode
{
level
:=
w
.
modelLibrary
.
GetModelUsedLevel
(
model
.
TaskID
)
weights
:=
0
switch
level
{
case
ModelUsedLevelSuperLow
:
weights
+=
2
case
ModelUsedLevelVeryLow
:
weights
+=
5
case
ModelUsedLevelLow
:
weights
+=
10
case
ModelUsedLevelMiddle
:
weights
+=
30
case
ModelUsedLevelHigh
:
weights
+=
50
case
ModelUsedLevelSuperHigh
:
weights
+=
80
}
return
int
(
hash
[
0
])
<
(
255
*
weights
/
100
)
}
return
false
}
// distributeToUnstall currently has an empty body and performs no work; its
// parameters are unused.
// NOTE(review): presumably a placeholder for planning model uninstall
// operations ("Unstall" looks like a misspelling of "Uninstall") — confirm.
func (w *imageWorker) distributeToUnstall(models SortedModelDetailInfos, info omanager.NodeInfoResponse, ops []*omanager.ModelOperate) {
}
lib
:=
w
.
modelLibrary
func
(
w
*
imageWorker
)
distributeToInstall
(
models
SortedModelDetailInfos
,
info
omanager
.
NodeInfoResponse
,
ops
[]
*
omanager
.
ModelOperate
)
{
totalWorker
:=
w
.
manager
.
WorkerCount
()
mode
:=
GreedyMode
// 贪婪模式
hash
:=
sha3
.
Sum256
([]
byte
(
info
.
Info
.
MinerPubkey
))
mode
:=
GreedyMode
// 贪婪模式
if
totalWorker
>
10
{
mode
=
HashingMode
mode
=
HashingMode
// 散列模式
}
// 散列模式
for
_
,
model
:=
range
models
{
if
mode
==
GreedyMode
{
if
w
.
CanInstall
(
model
)
{
// todo: quest worker to install the model.
}
}
if
mode
==
HashingMode
{
hash
:=
sha3
.
Sum256
([]
byte
(
info
.
Info
.
MinerPubkey
))
level
:=
lib
.
GetModelUsedLevel
(
model
.
TaskID
)
weights
:=
0
switch
level
{
case
ModelUsedLevelSuperLow
:
weights
+=
2
case
ModelUsedLevelVeryLow
:
weights
+=
5
case
ModelUsedLevelLow
:
weights
+=
10
case
ModelUsedLevelMiddle
:
weights
+=
30
case
ModelUsedLevelHigh
:
weights
+=
50
case
ModelUsedLevelSuperHigh
:
weights
+=
80
}
if
int
(
hash
[
0
])
<
(
255
*
weights
/
100
)
{
if
w
.
CanInstall
(
model
)
{
// todo: quest worker to install the model.
}
}
if
w
.
CanInstall
(
model
)
&&
w
.
distributeMatch
(
mode
,
model
,
hash
[
:
])
{
ops
=
append
(
ops
,
w
.
getOp
(
model
,
omanager
.
ModelOperateType_INSTALL
))
}
}
}
// distributeToRun currently has an empty body and performs no work; its
// parameters are unused.
// NOTE(review): presumably a placeholder for planning which models to start
// running — confirm.
func (w *imageWorker) distributeToRun(models SortedModelDetailInfos, info omanager.NodeInfoResponse, ops []*omanager.ModelOperate) {
}
// distributeToStopRun currently has an empty body and performs no work; its
// parameters are unused.
// NOTE(review): presumably a placeholder for planning which running models to
// stop — confirm.
func (w *imageWorker) distributeToStopRun(models SortedModelDetailInfos, info omanager.NodeInfoResponse, ops []*omanager.ModelOperate) {
}
func
(
w
*
imageWorker
)
distribute
()
{
models
:=
w
.
modelLibrary
.
AllModel
()
sort
.
Sort
(
models
)
info
:=
w
.
getInfo
()
operates
:=
make
([]
*
omanager
.
ModelOperate
,
0
)
w
.
distributeToUnstall
(
models
,
info
,
operates
)
w
.
distributeToInstall
(
models
,
info
,
operates
)
w
.
distributeToRun
(
models
,
info
,
operates
)
w
.
distributeToStopRun
(
models
,
info
,
operates
)
}
server/worker.go
View file @
61d3804f
...
...
@@ -254,10 +254,17 @@ func (w *Worker) doDeviceUsage(msg *omanager.WorkerMessage_DeviceUsage) error {
"worker-addr"
:
w
.
workerAddr
,
})
.
Debugf
(
"receive worker device usage:%v"
,
msg
.
DeviceUsage
.
Usage
)
// 1. update local cache
w
.
info
.
Hardware
.
DISK
.
Free
=
int64
(
msg
.
DeviceUsage
.
Usage
.
DiskUsage
)
w
.
info
.
Hardware
.
RAM
.
Free
=
int64
(
msg
.
DeviceUsage
.
Usage
.
RamUsage
)
w
.
info
.
Hardware
.
CPU
.
Usage
=
int32
(
msg
.
DeviceUsage
.
Usage
.
CpuUsage
)
w
.
info
.
Hardware
.
NET
.
Bandwidth
=
int32
(
msg
.
DeviceUsage
.
Usage
.
NetBandwidth
)
if
!
w
.
registed
{
return
nil
}
// 1. update usage to hardware mogo.
// 2. update usage to hardware mogo.
return
w
.
infoOp
.
UpdateHardwareUsage
(
context
.
TODO
(),
w
.
workerAddr
,
types
.
PbToDeviceUsage
(
msg
.
DeviceUsage
.
Usage
))
}
...
...
@@ -331,6 +338,30 @@ func (w *Worker) doAddInstalledModel(msg *omanager.WorkerMessage_AddModelInstall
if
!
w
.
registed
{
return
}
// 0. update local cache.
for
_
,
model
:=
range
msg
.
AddModelInstalled
.
Models
{
if
w
.
info
.
Models
.
InstalledModels
==
nil
{
w
.
info
.
Models
.
InstalledModels
=
make
([]
*
omanager
.
InstalledModel
,
0
)
}
// if model already installed, skip.
exist
:=
false
for
_
,
installed
:=
range
w
.
info
.
Models
.
InstalledModels
{
if
installed
.
ModelId
==
model
.
ModelId
{
exist
=
true
break
}
}
if
!
exist
{
w
.
info
.
Models
.
InstalledModels
=
append
(
w
.
info
.
Models
.
InstalledModels
,
model
)
}
// remove from wait to install.
for
idx
,
waitted
:=
range
w
.
info
.
Models
.
WaitToInstallModels
{
if
waitted
.
ModelId
==
model
.
ModelId
{
w
.
info
.
Models
.
WaitToInstallModels
=
append
(
w
.
info
.
Models
.
WaitToInstallModels
[
:
idx
],
w
.
info
.
Models
.
WaitToInstallModels
[
idx
+
1
:
]
...
)
break
}
}
}
models
:=
make
([]
*
types
.
InstalledModel
,
0
)
log
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
w
.
workerAddr
,
...
...
@@ -377,6 +408,17 @@ func (w *Worker) doRemoveInstalledModel(msg *omanager.WorkerMessage_DelModelInst
//if !w.registed {
// return
//}
// 0. update local cache.
for
_
,
model
:=
range
msg
.
DelModelInstalled
.
ModelIds
{
// remove from installed.
for
idx
,
installed
:=
range
w
.
info
.
Models
.
InstalledModels
{
if
installed
.
ModelId
==
model
{
w
.
info
.
Models
.
InstalledModels
=
append
(
w
.
info
.
Models
.
InstalledModels
[
:
idx
],
w
.
info
.
Models
.
InstalledModels
[
idx
+
1
:
]
...
)
break
}
}
}
models
:=
make
([]
int
,
0
)
log
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
w
.
workerAddr
,
...
...
@@ -405,6 +447,16 @@ func (w *Worker) doInstalledModelStatus(msg *omanager.WorkerMessage_InstalledMod
//if !w.registed {
// return
//}
// 0. update local cache.
for
_
,
installed
:=
range
w
.
info
.
Models
.
InstalledModels
{
if
installed
.
ModelId
==
msg
.
InstalledModelStatus
.
ModelId
{
installed
.
LastRunTime
=
msg
.
InstalledModelStatus
.
LastRunTime
break
}
}
log
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
w
.
workerAddr
,
})
.
Debugf
(
"receive worker installed model status:%v"
,
msg
.
InstalledModelStatus
)
...
...
@@ -420,6 +472,16 @@ func (w *Worker) doRunningModelStatus(msg *omanager.WorkerMessage_RunningModelSt
//if !w.registed {
// return
//}
// 0. update local cache.
for
_
,
running
:=
range
w
.
info
.
Models
.
RunningModels
{
if
running
.
ModelId
==
msg
.
RunningModelStatus
.
ModelId
{
running
.
LastWorkTime
=
msg
.
RunningModelStatus
.
LastWorkTime
running
.
TotalRunCount
=
int32
(
msg
.
RunningModelStatus
.
TotalRunCount
)
running
.
ExecTime
=
int32
(
msg
.
RunningModelStatus
.
ExecTime
)
break
}
}
log
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
w
.
workerAddr
,
})
.
Debugf
(
"receive worker running model status:%v"
,
msg
.
RunningModelStatus
)
...
...
@@ -492,20 +554,22 @@ func (w *Worker) getMaxGPUFree() *omanager.GPUInfo {
return
maxGpuFree
}
func
(
w
*
Worker
)
ModelOperate
(
info
interface
{},
operate
omanager
.
ModelOperateType
)
*
omanager
.
ManagerMessage_ModelOperateRequest
{
request
:=
&
omanager
.
ManagerMessage_ModelOperateRequest
{
ModelOperateRequest
:
&
omanager
.
ModelOperateRequest
{
ModelOperates
:
[]
*
omanager
.
ModelOperate
{
{
ModelId
:
""
,
ImageName
:
""
,
Username
:
""
,
Password
:
""
,
Cmd
:
""
,
Operate
:
operate
,
},
func
(
w
*
Worker
)
ModelOperate
(
operators
[]
*
omanager
.
ModelOperate
)
error
{
msg
:=
&
omanager
.
ManagerMessage
{
Message
:
&
omanager
.
ManagerMessage_ModelOperateRequest
{
ModelOperateRequest
:
&
omanager
.
ModelOperateRequest
{
ModelOperates
:
operators
,
},
},
}
return
request
var
err
error
send
:=
make
(
chan
bool
)
callback
:=
func
(
e
error
)
bool
{
err
=
e
close
(
send
)
return
true
}
w
.
SendToWorker
(
msg
,
callback
)
<-
send
return
err
}
server/workerManager.go
View file @
61d3804f
...
...
@@ -605,7 +605,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
if
e
:=
wm
.
setWorkerLastTaskTime
(
worker
,
time
.
Now
()
.
Unix
());
e
!=
nil
{
log
.
WithField
(
"worker"
,
worker
.
uuid
)
.
WithError
(
e
)
.
Error
(
"set worker last task time failed"
)
}
wm
.
distributor
.
AddWorker
(
worker
.
uuid
,
worker
.
info
)
wm
.
distributor
.
AddWorker
(
worker
.
uuid
,
worker
.
workerAddr
,
worker
.
info
)
default
:
l
.
WithField
(
"worker-addr"
,
worker
.
workerAddr
)
.
Error
(
fmt
.
Sprintf
(
"unsupport msg type %T"
,
msg
))
...
...
@@ -678,3 +678,11 @@ func (wm *WorkerManager) makeTaskProof(worker *Worker, task *odysseus.TaskConten
}
return
proof
}
func
(
wm
*
WorkerManager
)
ModelOperate
(
addr
string
,
operate
[]
*
omanager
.
ModelOperate
)
error
{
worker
:=
wm
.
GetWorkerByAddr
(
addr
)
if
worker
==
nil
{
return
errors
.
New
(
"worker not found"
)
}
return
worker
.
ModelOperate
(
operate
)
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment