Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
N
nodemanager
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Odysseus
nodemanager
Commits
f3f5c792
Commit
f3f5c792
authored
May 29, 2024
by
vicotor
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update protocol
parent
14bc5d34
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
153 additions
and
174 deletions
+153
-174
dispatchTask.go
server/dispatchTask.go
+1
-1
service.go
server/service.go
+0
-8
workerManager.go
server/workerManager.go
+44
-34
worker_registry.go
server/worker_registry.go
+1
-1
workerstatu.go
server/workerstatu.go
+107
-130
No files found.
server/dispatchTask.go
View file @
f3f5c792
...
...
@@ -115,7 +115,7 @@ func (d *dispatchTask) finalize(wm *WorkerManager) {
task
:=
d
.
task
if
task
.
TaskKind
!=
odysseus
.
TaskKind_StandardTask
&&
d
.
worker
.
online
==
true
{
_
=
wm
.
AddWorkerSingle
(
d
.
worker
)
}
_
,
err
:=
wm
.
taskResult
(
d
.
worker
,
task
,
result
)
...
...
server/service.go
View file @
f3f5c792
...
...
@@ -6,7 +6,6 @@ import (
"github.com/odysseus/nodemanager/utils"
omanager
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
log
"github.com/sirupsen/logrus"
"strconv"
"strings"
)
...
...
@@ -64,13 +63,6 @@ func (n *NodeManagerService) DispatchTask(ctx context.Context, request *omanager
if
worker
.
online
==
false
{
return
nil
,
errors
.
New
(
"worker offline"
)
}
{
nonceds
:=
strings
.
Split
(
mids
[
1
],
":"
)
nonce
,
_
:=
strconv
.
ParseInt
(
nonceds
[
0
],
10
,
64
)
if
nonce
<
int64
(
worker
.
nonce
)
{
return
nil
,
errors
.
New
(
"expired worker nonce"
)
}
}
dtask
:=
newDispatchTask
(
worker
,
request
.
TaskData
)
...
...
server/workerManager.go
View file @
f3f5c792
...
...
@@ -3,7 +3,6 @@ package server
import
(
"bytes"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"github.com/golang/protobuf/proto"
...
...
@@ -285,15 +284,17 @@ func (wm *WorkerManager) manageWorker(worker *Worker) error {
case
<-
workerCheckTicker
.
C
:
if
worker
.
info
.
nodeInfo
!=
nil
{
//nodeinfoTicker.Reset(time.Hour * 24
)
nodeinfoTicker
.
Reset
(
time
.
Minute
*
30
)
}
if
worker
.
usage
.
hwUsage
!=
nil
{
deviceUsageTicker
.
Reset
(
time
.
Second
*
time
.
Duration
(
tickerConf
.
DeviceUsageTicker
))
}
if
worker
.
registed
&&
worker
.
addFirstSucceed
==
false
&&
len
(
worker
.
deviceInfoHash
)
>
0
{
wm
.
AddWorkerToQueue
(
worker
)
if
worker
.
registed
&&
worker
.
addFirstSucceed
==
false
{
if
err
:=
wm
.
AddWorker
(
worker
);
err
==
nil
{
worker
.
addFirstSucceed
=
true
}
}
wm
.
UpdateWorkerActive
(
worker
)
...
...
@@ -477,6 +478,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"heartBeat"
:
time
.
Now
()
.
Unix
()
-
int64
(
msg
.
HeartbeatResponse
.
Timestamp
),
})
.
Debug
(
"receive worker heartbeat"
)
case
*
omanager
.
WorkerMessage_NodeInfo
:
// todo: remove this message.
nodeinfo
:=
msg
.
NodeInfo
log
.
WithField
(
"worker-addr"
,
worker
.
workerAddr
)
.
Debugf
(
"receive worker node info:%v"
,
nodeinfo
)
if
nodeinfo
.
Hardware
!=
nil
&&
nodeinfo
.
Hardware
.
NET
!=
nil
{
...
...
@@ -566,29 +568,15 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
// ignore the info.
continue
}
// todo: verify signature
{
var
infoHash
[
32
]
byte
infoData
,
err
:=
json
.
Marshal
(
msg
.
DeviceInfo
)
if
err
!=
nil
{
l
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
worker
.
workerAddr
,
"error"
:
err
,
})
.
Error
(
"marshal device info failed"
)
}
if
len
(
infoData
)
==
0
{
continue
}
infoHash
=
sha3
.
Sum256
(
infoData
)
infoHash
=
sha3
.
Sum256
([]
byte
(
msg
.
DeviceInfo
.
String
()))
// update local cache.
worker
.
info
.
nodeInfo
.
Hardware
=
msg
.
DeviceInfo
.
Hardware
if
worker
.
registed
&&
worker
.
addFirstSucceed
==
false
{
wm
.
AddWorkerToQueue
(
worker
)
}
// check device info changed, and update to cache.
if
bytes
.
Compare
(
infoHash
[
:
],
worker
.
deviceInfoHash
)
!=
0
{
wm
.
UpdateWorkerDeviceInfo
(
worker
,
string
(
infoData
)
)
wm
.
UpdateWorkerDeviceInfo
(
worker
,
msg
.
DeviceInfo
)
}
worker
.
deviceInfoHash
=
infoHash
[
:
]
}
...
...
@@ -597,8 +585,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
if
!
worker
.
registed
{
continue
}
usageData
,
_
:=
json
.
Marshal
(
msg
.
DeviceUsage
)
wm
.
UpdateWorkerUsageInfo
(
worker
,
string
(
usageData
))
wm
.
UpdateWorkerUsageInfo
(
worker
,
msg
.
DeviceUsage
)
worker
.
usage
.
hwUsage
=
msg
.
DeviceUsage
.
Usage
l
.
WithFields
(
log
.
Fields
{
...
...
@@ -613,6 +600,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"worker-addr"
:
worker
.
workerAddr
,
"model count"
:
len
(
msg
.
AddModelRunning
.
Models
),
})
.
Debugf
(
"receive worker add model running:%v"
,
msg
.
AddModelRunning
.
Models
)
// todo: add worker running model.
case
*
omanager
.
WorkerMessage_DelModeRunning
:
if
!
worker
.
registed
{
...
...
@@ -622,6 +610,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"worker-addr"
:
worker
.
workerAddr
,
"model count"
:
len
(
msg
.
DelModeRunning
.
ModelIds
),
})
.
Debugf
(
"receive worker del model running:%v"
,
msg
.
DelModeRunning
.
ModelIds
)
// todo: del worker running model with model_id.
case
*
omanager
.
WorkerMessage_AddModelInstalled
:
if
!
worker
.
registed
{
...
...
@@ -631,6 +620,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"worker-addr"
:
worker
.
workerAddr
,
"model count"
:
len
(
msg
.
AddModelInstalled
.
Models
),
})
.
Debugf
(
"receive worker add model installed:%v"
,
msg
.
AddModelInstalled
.
Models
)
// todo: add worker installed model with model_id.
case
*
omanager
.
WorkerMessage_DelModelInstalled
:
if
!
worker
.
registed
{
...
...
@@ -640,6 +630,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"worker-addr"
:
worker
.
workerAddr
,
"model count"
:
len
(
msg
.
DelModelInstalled
.
ModelIds
),
})
.
Debugf
(
"receive worker del model installed:%v"
,
msg
.
DelModelInstalled
.
ModelIds
)
// todo: del worker installed model with model_id.
case
*
omanager
.
WorkerMessage_InstalledModelStatus
:
if
!
worker
.
registed
{
...
...
@@ -650,6 +641,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"model"
:
len
(
msg
.
InstalledModelStatus
.
ModelId
),
"type"
:
"status"
,
})
.
Debugf
(
"receive worker installed model status:%v"
,
msg
.
InstalledModelStatus
)
// todo: update worker installed model status.
case
*
omanager
.
WorkerMessage_RunningModelStatus
:
if
!
worker
.
registed
{
...
...
@@ -660,6 +652,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"model"
:
len
(
msg
.
RunningModelStatus
.
ModelId
),
"type"
:
"status"
,
})
.
Debugf
(
"receive worker running model status:%v"
,
msg
.
RunningModelStatus
)
// todo: update worker running model status.
case
*
omanager
.
WorkerMessage_GpuUsage
:
if
!
worker
.
registed
{
...
...
@@ -669,15 +662,17 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
"worker-addr"
:
worker
.
workerAddr
,
"usage count"
:
len
(
msg
.
GpuUsage
.
Usages
),
})
.
Debugf
(
"receive worker gpu usage:%v"
,
msg
.
GpuUsage
.
Usages
)
// todo: update worker gpu usage info.
case
*
omanager
.
WorkerMessage_RegisteMessage
:
// 1. do some verify.
if
worker
.
registed
{
continue
}
l
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
worker
.
workerAddr
,
})
.
Debug
(
"receive registed message"
)
//
todo: verify signature
//
2. check signature.
info
:=
msg
.
RegisteMessage
.
Info
{
...
...
@@ -695,6 +690,9 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
worker
.
quit
<-
ErrInvalidMessageValue
return
}
}
// 3. check timestamp not expired.
if
time
.
Now
()
.
Unix
()
-
int64
(
msg
.
RegisteMessage
.
Timestamp
)
>
config
.
GetConfig
()
.
GetWorkerSignatureExpiredTime
()
{
l
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
worker
.
workerAddr
,
...
...
@@ -702,13 +700,17 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
worker
.
quit
<-
ErrExpiredMsgSignature
return
}
}
// 4. replace old connection.
if
pubkey
,
err
:=
utils
.
HexToPubkey
(
info
.
MinerPubkey
);
err
!=
nil
{
l
.
WithFields
(
log
.
Fields
{
"worker-addr"
:
worker
.
workerAddr
,
"error"
:
err
,
})
.
Error
(
"parse pubkey failed"
)
worker
.
quit
<-
ErrInvalidMsgSignature
return
}
else
{
addr
:=
utils
.
PubkeyToAddress
(
pubkey
)
if
old
:=
wm
.
GetWorkerByAddr
(
addr
);
old
!=
nil
{
...
...
@@ -721,6 +723,7 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
worker
.
workerAddr
=
addr
}
worker
.
registed
=
true
// 5. check ip address.
matched
,
err
:=
regexp
.
MatchString
(
"((2(5[0-5]|[0-4]
\\
d))|[0-1]?
\\
d{1,2})(
\\
.((2(5[0-5]|[0-4]
\\
d))|[0-1]?
\\
d{1,2})){3}"
,
msg
.
RegisteMessage
.
Hardware
.
NET
.
Ip
)
if
err
!=
nil
{
...
...
@@ -735,14 +738,21 @@ func (wm *WorkerManager) handleWorkerMsg(worker *Worker) {
Models
:
msg
.
RegisteMessage
.
Models
,
}
wm
.
SetWorkerAddr
(
worker
,
worker
.
workerAddr
)
// check white list.
if
err
:=
wm
.
checkWhiteList
(
worker
,
info
.
BenefitAddress
);
err
!=
nil
{
worker
.
quit
<-
err
return
}
else
{
wm
.
addWorkerToSets
(
worker
,
info
.
BenefitAddress
)
wm
.
addWorkerToWhiteListSet
(
worker
,
info
.
BenefitAddress
)
}
// add worker to mogo.
if
err
:=
wm
.
AddWorker
(
worker
);
err
==
nil
{
worker
.
addFirstSucceed
=
true
wm
.
UpdateWorkerActive
(
worker
)
}
// start manage worker.
wreg
:=
workerRegistry
{
worker
:
worker
,
wm
:
wm
,
...
...
server/worker_registry.go
View file @
f3f5c792
...
...
@@ -87,7 +87,7 @@ func (w workerRegistry) DetailInfo() (json.RawMessage, error) {
}
info
.
HearBeat
=
w
.
wm
.
GetHeartBeat
(
w
.
worker
.
uuid
)
*
1000
// to ms
info
.
MinerAddress
=
w
.
worker
.
workerAddr
info
.
Nonce
=
int64
(
w
.
worker
.
nonce
)
info
.
Nonce
=
0
if
w
.
worker
.
info
.
nodeInfo
!=
nil
{
info
.
CpuModel
=
w
.
worker
.
info
.
nodeInfo
.
Hardware
.
CPU
.
Model
info
.
CpuCore
=
int
(
w
.
worker
.
info
.
nodeInfo
.
Hardware
.
CPU
.
Cores
)
...
...
server/workerstatu.go
View file @
f3f5c792
...
...
@@ -2,7 +2,6 @@ package server
import
(
"context"
"encoding/hex"
"errors"
"fmt"
"github.com/gomodule/redigo/redis"
...
...
@@ -10,80 +9,49 @@ import (
"github.com/odysseus/mogo/operator"
"github.com/odysseus/mogo/types"
"github.com/odysseus/nodemanager/config"
omanager
"github.com/odysseus/odysseus-protocol/gen/proto/go/nodemanager/v2"
log
"github.com/sirupsen/logrus"
"go.mongodb.org/mongo-driver/mongo"
"strconv"
"strings"
"time"
)
func
(
wm
*
WorkerManager
)
UpdateWorkerDeviceStatusInfo
(
worker
*
Worker
,
status
[]
byte
)
{
wm
.
rdb
.
Set
(
context
.
Background
(),
workerDeviceStatusInfoKey
(
worker
),
status
,
0
)
func
(
wm
*
WorkerManager
)
UpdateWorkerUsageInfo
(
worker
*
Worker
,
usageInfo
*
omanager
.
DeviceUsageResponse
)
{
// todo: update usage info to mogo.
wm
.
rdb
.
Set
(
context
.
Background
(),
workerUsageInfoKey
(
worker
),
usageInfo
.
String
(),
0
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerUsageInfo
(
worker
*
Worker
,
usageInfo
string
)
{
wm
.
rdb
.
Set
(
context
.
Background
(),
workerUsageInfoKey
(
worker
),
usageInfo
,
0
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerDeviceInfo
(
worker
*
Worker
,
deviceInfos
string
)
{
wm
.
rdb
.
Set
(
context
.
Background
(),
workerDeviceInfoKey
(
worker
),
deviceInfos
,
0
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerResourceInfo
(
worker
*
Worker
,
resourceInfo
[]
byte
)
{
rstr
:=
hex
.
EncodeToString
(
resourceInfo
)
log
.
WithField
(
"resourceinfo"
,
rstr
)
.
Infof
(
"update resourceinfo"
)
wm
.
rdb
.
Set
(
context
.
Background
(),
workerResourceInfoKey
(
worker
),
rstr
,
0
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerBootedResourceInfo
(
worker
*
Worker
,
bootedResourceInfo
[]
byte
)
{
rstr
:=
hex
.
EncodeToString
(
bootedResourceInfo
)
log
.
WithField
(
"resourceinfo"
,
rstr
)
.
Infof
(
"update resourceinfo"
)
wm
.
rdb
.
Set
(
context
.
Background
(),
workerBootedResourceInfoKey
(
worker
),
rstr
,
0
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerNonce
(
worker
*
Worker
,
nonce
int
)
error
{
return
wm
.
rdb
.
Set
(
context
.
Background
(),
workerNonceKey
(
worker
),
nonce
,
0
)
.
Err
()
func
(
wm
*
WorkerManager
)
UpdateWorkerDeviceInfo
(
worker
*
Worker
,
deviceInfos
*
omanager
.
DeviceInfoMessage
)
{
// todo: update device info to mogo.
wm
.
rdb
.
Set
(
context
.
Background
(),
workerDeviceInfoKey
(
worker
),
deviceInfos
.
String
(),
0
)
}
func
(
wm
*
WorkerManager
)
GetWorkerNonce
(
worker
*
Worker
)
(
int
,
error
)
{
if
worker
.
workerAddr
!=
""
{
nonceK
:=
workerNonceKey
(
worker
)
nonce
,
err
:=
wm
.
rdb
.
Get
(
context
.
Background
(),
nonceK
)
.
Int
()
if
err
==
redis
.
ErrNil
{
nonce
=
1
if
err
=
wm
.
rdb
.
Set
(
context
.
Background
(),
nonceK
,
nonce
,
0
)
.
Err
();
err
!=
nil
{
return
0
,
err
}
}
return
nonce
,
nil
}
return
0
,
errors
.
New
(
"unkown worker node info"
)
return
0
,
nil
}
func
(
wm
*
WorkerManager
)
IncrWorkerNonce
(
worker
*
Worker
)
(
int
,
error
)
{
nonce
,
err
:=
wm
.
rdb
.
Incr
(
context
.
Background
(),
workerNonceKey
(
worker
))
.
Uint64
()
return
int
(
nonce
),
err
}
func
(
wm
*
WorkerManager
)
AddWorkerFirst
(
worker
*
Worker
)
error
{
log
.
WithField
(
"worker"
,
worker
.
workerAddr
)
.
Info
(
"add worker first time."
)
wm
.
UpdateWorkerActive
(
worker
)
for
_
,
gpu
:=
range
worker
.
info
.
nodeInfo
.
Hardware
.
GPU
{
// add device to redis
priority
:=
0
_
=
gpu
// todo: set priority with device info.
for
m
:=
0
;
m
<
config
.
GetConfig
()
.
GetWorkerMultiple
();
m
++
{
// add worker to redis queue
if
err
:=
wm
.
rdb
.
RPush
(
context
.
Background
(),
config
.
WORKER_QUEUE_PREFIX
+
strconv
.
Itoa
(
priority
),
workerUid
(
worker
))
.
Err
();
err
!=
nil
{
continue
func
(
wm
*
WorkerManager
)
updateWorkerInfo
(
worker
*
Worker
,
winfo
*
operator
.
WorkerInfo
)
error
{
// 2. update worker running info.
wm
.
workerRunningOperator
.
DeleteByWorkerId
(
context
.
Background
(),
worker
.
WorkerAccount
()
.
String
())
for
_
,
running
:=
range
worker
.
info
.
nodeInfo
.
Models
.
RunningModels
{
id
,
_
:=
strconv
.
Atoi
(
running
.
ModelId
)
iInfo
:=
&
operator
.
WorkerRunningInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
ModelId
:
id
,
ExecTime
:
int
(
running
.
ExecTime
),
}
_
,
err
:=
wm
.
workerRunningOperator
.
Insert
(
context
.
Background
(),
iInfo
)
if
err
!=
nil
{
log
.
WithFields
(
log
.
Fields
{
"worker"
:
worker
.
WorkerAccount
()
.
String
(),
"model"
:
id
,
})
.
WithError
(
err
)
.
Error
(
"insert worker running model info failed"
)
continue
}
}
_
,
err
:=
wm
.
workerInfoOperator
.
InsertWorker
(
context
.
Background
(),
&
operator
.
WorkerInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
NodeInfo
:
types
.
PbToNodeInfo
(
worker
.
info
.
nodeInfo
.
Info
),
Models
:
types
.
PbToModelInfo
(
worker
.
info
.
nodeInfo
.
Models
),
Hardware
:
types
.
PbToHardwareInfo
(
worker
.
info
.
nodeInfo
.
Hardware
),
})
// 3. update worker installed info.
wm
.
workerInstalledOperator
.
DeleteByWorkerId
(
context
.
Background
(),
worker
.
WorkerAccount
()
.
String
())
for
_
,
installed
:=
range
worker
.
info
.
nodeInfo
.
Models
.
InstalledModels
{
id
,
_
:=
strconv
.
Atoi
(
installed
.
ModelId
)
iInfo
:=
&
operator
.
WorkerInstalledInfo
{
...
...
@@ -93,88 +61,104 @@ func (wm *WorkerManager) AddWorkerFirst(worker *Worker) error {
if
len
(
worker
.
info
.
nodeInfo
.
Hardware
.
GPU
)
>
0
{
iInfo
.
GpuFree
=
worker
.
info
.
nodeInfo
.
Hardware
.
GPU
[
0
]
.
MemFree
}
wm
.
workerInstalledOperator
.
Insert
(
context
.
Background
(),
iInfo
)
}
for
_
,
running
:=
range
worker
.
info
.
nodeInfo
.
Models
.
RunningModels
{
id
,
_
:=
strconv
.
Atoi
(
running
.
ModelId
)
iInfo
:=
&
operator
.
WorkerRunningInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
ModelId
:
id
,
ExecTime
:
int
(
running
.
ExecTime
),
}
wm
.
workerRunningOperator
.
Insert
(
context
.
Background
(),
iInfo
)
_
,
err
:=
wm
.
workerInstalledOperator
.
Insert
(
context
.
Background
(),
iInfo
)
if
err
!=
nil
{
log
.
WithFields
(
log
.
Fields
{
"worker"
:
worker
.
WorkerAccount
()
.
String
(),
"model"
:
id
,
})
.
WithError
(
err
)
.
Error
(
"insert worker installed model info failed"
)
continue
}
}
// 1. update worker info.
winfo
.
Hardware
=
types
.
PbToHardwareInfo
(
worker
.
info
.
nodeInfo
.
Hardware
)
winfo
.
Models
=
types
.
PbToModelInfo
(
worker
.
info
.
nodeInfo
.
Models
)
winfo
.
NodeInfo
=
types
.
PbToNodeInfo
(
worker
.
info
.
nodeInfo
.
Info
)
err
:=
wm
.
workerInfoOperator
.
UpdateWorker
(
context
.
Background
(),
winfo
)
if
err
!=
nil
{
log
.
WithError
(
err
)
.
Error
(
"insert worker info failed"
)
return
err
}
return
nil
}
func
(
wm
*
WorkerManager
)
AddWorkerToQueue
(
worker
*
Worker
)
{
nonce
,
err
:=
wm
.
GetWorkerNonce
(
worker
)
if
err
!=
nil
{
log
.
WithField
(
"worker-addr"
,
worker
.
workerAddr
)
.
Error
(
"get worker nonce failed when get device info"
)
}
else
{
// if statekeys not exist, nonce don't change.
nmlist
,
err
:=
wm
.
WorkerNmList
(
worker
)
func
(
wm
*
WorkerManager
)
addWorkerInfo
(
worker
*
Worker
)
error
{
// 1. add worker info.
_
,
err
:=
wm
.
workerInfoOperator
.
InsertWorker
(
context
.
Background
(),
&
operator
.
WorkerInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
NodeInfo
:
types
.
PbToNodeInfo
(
worker
.
info
.
nodeInfo
.
Info
),
Models
:
types
.
PbToModelInfo
(
worker
.
info
.
nodeInfo
.
Models
),
Hardware
:
types
.
PbToHardwareInfo
(
worker
.
info
.
nodeInfo
.
Hardware
),
})
if
err
!=
nil
{
if
err
==
redis
.
ErrNil
{
wm
.
UpdateWorkerActive
(
worker
)
return
err
}
}
else
{
if
len
(
nmlist
)
==
0
{
// if nmlist is empty, nonce incr.
nonce
,
err
=
wm
.
IncrWorkerNonce
(
worker
)
// 2. add worker running info.
for
_
,
running
:=
range
worker
.
info
.
nodeInfo
.
Models
.
RunningModels
{
id
,
_
:=
strconv
.
Atoi
(
running
.
ModelId
)
iInfo
:=
&
operator
.
WorkerRunningInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
ModelId
:
id
,
ExecTime
:
int
(
running
.
ExecTime
),
}
_
,
err
:=
wm
.
workerRunningOperator
.
Insert
(
context
.
Background
(),
iInfo
)
if
err
!=
nil
{
log
.
WithField
(
"worker-addr"
,
worker
.
workerAddr
)
.
Error
(
"incr worker nonce failed when get device info"
)
log
.
WithFields
(
log
.
Fields
{
"worker"
:
worker
.
WorkerAccount
()
.
String
(),
"model"
:
id
,
})
.
WithError
(
err
)
.
Error
(
"insert worker running model info failed"
)
continue
}
}
else
{
// else if nmlist is not empty, clear and add self to it.
worker
.
nonce
=
nonce
wm
.
rdb
.
Del
(
context
.
Background
(),
workerStatusKey
(
worker
))
wm
.
UpdateWorkerActive
(
worker
)
}
// 3. add worker installed info.
for
_
,
installed
:=
range
worker
.
info
.
nodeInfo
.
Models
.
InstalledModels
{
id
,
_
:=
strconv
.
Atoi
(
installed
.
ModelId
)
iInfo
:=
&
operator
.
WorkerInstalledInfo
{
WorkerId
:
worker
.
WorkerAccount
()
.
String
(),
ModelId
:
id
,
}
if
len
(
worker
.
info
.
nodeInfo
.
Hardware
.
GPU
)
>
0
{
iInfo
.
GpuFree
=
worker
.
info
.
nodeInfo
.
Hardware
.
GPU
[
0
]
.
MemFree
}
if
err
==
nil
{
worker
.
nonce
=
nonce
wm
.
AddWorkerFirst
(
worker
)
worker
.
addFirstSucceed
=
true
_
,
err
:=
wm
.
workerInstalledOperator
.
Insert
(
context
.
Background
(),
iInfo
)
if
err
!=
nil
{
log
.
WithFields
(
log
.
Fields
{
"worker"
:
worker
.
WorkerAccount
()
.
String
(),
"model"
:
id
,
})
.
WithError
(
err
)
.
Error
(
"insert worker installed model info failed"
)
continue
}
}
return
nil
}
func
(
wm
*
WorkerManager
)
AddWorkerSingle
(
worker
*
Worker
)
error
{
log
.
WithField
(
"worker"
,
worker
.
workerAddr
)
.
Info
(
"add worker on back."
)
wm
.
UpdateWorkerActive
(
worker
)
{
// add worker to redis queue
priority
:=
0
if
err
:=
wm
.
rdb
.
RPush
(
context
.
Background
(),
config
.
WORKER_QUEUE_PREFIX
+
strconv
.
Itoa
(
priority
),
workerUid
(
worker
))
.
Err
();
err
!=
nil
{
log
.
WithError
(
err
)
.
Error
(
"add worker back to queue failed."
)
func
(
wm
*
WorkerManager
)
AddWorker
(
worker
*
Worker
)
error
{
// 1. if worker is exist in mogo, update worker info.
winfo
,
err
:=
wm
.
workerInfoOperator
.
FindWorkerByWorkerId
(
context
.
Background
(),
worker
.
WorkerAccount
()
.
String
())
if
err
!=
nil
{
if
err
==
mongo
.
ErrNoDocuments
{
// create a new
return
wm
.
addWorkerInfo
(
worker
)
}
else
{
log
.
With
Field
(
"worker"
,
worker
.
workerAddr
)
.
Info
(
"add worker back to queue success.
"
)
log
.
With
Error
(
err
)
.
Error
(
"find worker info failed
"
)
}
}
else
{
if
winfo
!=
nil
{
// update worker info.
return
wm
.
updateWorkerInfo
(
worker
,
winfo
)
}
// add worker to redis queue
return
nil
}
return
errors
.
New
(
"can't replace worker info"
)
}
func
(
wm
*
WorkerManager
)
UpdateWorkerActive
(
worker
*
Worker
)
{
if
!
worker
.
online
{
return
}
nonce
,
err
:=
wm
.
GetWorkerNonce
(
worker
)
if
err
!=
nil
{
return
}
if
nonce
!=
worker
.
nonce
{
wm
.
InActiveWorker
(
worker
)
worker
.
nonce
=
nonce
}
old
:=
worker
.
latestNmValue
if
newNm
,
err
:=
wm
.
activeWorker
(
worker
);
err
!=
nil
{
return
...
...
@@ -212,12 +196,9 @@ func (wm *WorkerManager) InActiveWorker(worker *Worker) {
if
list
,
err
:=
wm
.
rdb
.
SMembers
(
context
.
Background
(),
workerStatusKey
(
worker
))
.
Result
();
err
==
nil
&&
len
(
list
)
==
0
{
wm
.
rdb
.
Del
(
context
.
Background
(),
workerStatusKey
(
worker
))
wm
.
rdb
.
Del
(
context
.
Background
(),
workerUsageInfoKey
(
worker
))
wm
.
rdb
.
Del
(
context
.
Background
(),
workerDeviceInfoKey
(
worker
))
wm
.
rdb
.
Del
(
context
.
Background
(),
workerResourceInfoKey
(
worker
))
wm
.
rdb
.
Del
(
context
.
Background
(),
workerBootedResourceInfoKey
(
worker
))
if
worker
.
info
.
nodeInfo
!=
nil
{
wm
.
rmWorkerFromSets
(
worker
,
worker
.
info
.
nodeInfo
.
Info
.
BenefitAddress
)
wm
.
delWorkerFromWhiteListSet
(
worker
,
worker
.
info
.
nodeInfo
.
Info
.
BenefitAddress
)
// delete worker info from mogo.
n
,
err
:=
wm
.
workerInfoOperator
.
DeleteByWorkerId
(
context
.
Background
(),
worker
.
WorkerAccount
()
.
String
())
if
err
!=
nil
{
log
.
WithError
(
err
)
.
Error
(
"delete worker info failed"
)
...
...
@@ -227,15 +208,15 @@ func (wm *WorkerManager) InActiveWorker(worker *Worker) {
}
}
func
(
wm
*
WorkerManager
)
addWorkerTo
Sets
(
worker
*
Worker
,
benefit
string
)
{
func
(
wm
*
WorkerManager
)
addWorkerTo
WhiteListSet
(
worker
*
Worker
,
benefit
string
)
{
wm
.
rdb
.
SAdd
(
context
.
Background
(),
workerSetsKey
(
benefit
),
worker
.
workerAddr
)
}
func
(
wm
*
WorkerManager
)
rmWorkerFromSets
(
worker
*
Worker
,
benefit
string
)
{
func
(
wm
*
WorkerManager
)
delWorkerFromWhiteListSet
(
worker
*
Worker
,
benefit
string
)
{
wm
.
rdb
.
SRem
(
context
.
Background
(),
workerSetsKey
(
benefit
),
worker
.
workerAddr
)
}
func
(
wm
*
WorkerManager
)
getWorker
Sets
(
benefit
string
)
([]
string
,
error
)
{
func
(
wm
*
WorkerManager
)
getWorker
WhiteListSetByBenefit
(
benefit
string
)
([]
string
,
error
)
{
list
,
err
:=
wm
.
rdb
.
SMembers
(
context
.
Background
(),
workerSetsKey
(
benefit
))
.
Result
()
if
err
==
redis
.
ErrNil
{
return
[]
string
{},
nil
...
...
@@ -261,7 +242,7 @@ func (wm *WorkerManager) checkWhiteList(worker *Worker, benefit string) error {
return
errors
.
New
(
"not in white list"
)
}
maxNodeCount
:=
wh
.
NodeNum
nodeList
,
err
:=
wm
.
getWorker
Sets
(
benefit
)
nodeList
,
err
:=
wm
.
getWorker
WhiteListSetByBenefit
(
benefit
)
if
err
!=
nil
{
return
errors
.
New
(
"check worker white list failed"
)
}
...
...
@@ -299,10 +280,6 @@ func workerDeviceStatusInfoKey(w *Worker) string {
return
config
.
WORKER_DEVICE_STATUS_PREFIX
+
w
.
workerAddr
}
func
workerNonceKey
(
w
*
Worker
)
string
{
return
config
.
WORKER_NONCE_KEY_PREFIX
+
w
.
workerAddr
}
func
workerSetsKey
(
benefit
string
)
string
{
return
config
.
WORKER_SETS_PREFIX
+
benefit
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment