Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions pkg/debugcmd/checks/check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package checks

import (
"context"

"github.com/threefoldtech/zosbase/pkg/gridtypes"
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
)

// Checker is a named group of health checks that can be run against a
// workload described by CheckData.
type Checker interface {
// Name returns the identifier of the checker.
Name() string
// Run executes the checker against data and returns one HealthCheck per
// individual check it performed.
Run(ctx context.Context, data *CheckData) []HealthCheck
}

// HealthCheck is the outcome of one individual check. The struct carries JSON
// tags; Message and Evidence are omitted from the JSON form when empty.
type HealthCheck struct {
// Name identifies the check, e.g. "network.config" or "vm.disks".
Name string `json:"name"`
// OK is true when the check passed.
OK bool `json:"ok"`
// Message is a short human-readable description of the outcome.
Message string `json:"message,omitempty"`
// Evidence holds free-form key/value details that back the outcome.
Evidence map[string]interface{} `json:"evidence,omitempty"`
}

// CheckData is the input handed to Checker.Run.
type CheckData struct {
// Twin is the twin ID; combined with the workload name to derive the
// network ID and, with Contract, the VM workload ID.
Twin uint32
// Contract is the contract ID used when deriving the VM workload ID.
Contract uint64
// Workload is the workload under inspection; the checkers read its Name.
Workload gridtypes.Workload
// VM reports whether a VM with the given id exists. VMChecker calls it for
// the "vm.vmd" check.
VM func(ctx context.Context, id string) bool
// Network maps a network ID to a string that NetworkChecker uses as the
// network namespace name.
Network func(ctx context.Context, id zos.NetID) string
}

// success builds a passing HealthCheck with the given name, message and
// evidence. A nil evidence map is replaced by an empty, non-nil map; a non-nil
// map is stored as-is (not copied).
func success(name, message string, evidence map[string]interface{}) HealthCheck {
	result := HealthCheck{Name: name, OK: true, Message: message, Evidence: evidence}
	if result.Evidence == nil {
		result.Evidence = map[string]interface{}{}
	}
	return result
}

// failure builds a failing HealthCheck with the given name, message and
// evidence. A nil evidence map is replaced by an empty, non-nil map; a non-nil
// map is stored as-is (not copied).
func failure(name, message string, evidence map[string]interface{}) HealthCheck {
	result := HealthCheck{Name: name, OK: false, Message: message, Evidence: evidence}
	if result.Evidence == nil {
		result.Evidence = map[string]interface{}{}
	}
	return result
}

// IsHealthy reports whether every check in checks passed. It returns true for
// a nil or empty slice, since there is no failing entry.
func IsHealthy(checks []HealthCheck) bool {
	for i := range checks {
		if checks[i].OK {
			continue
		}
		return false
	}
	return true
}

// Run dispatches to the checker that matches workloadType and returns its
// results: network and network-light workloads go to NetworkCheckerInstance,
// zmachine and zmachine-light workloads go to VMCheckerInstance.
//
// For any other workload type it returns nil, which IsHealthy treats as
// healthy because there is no failing check.
func Run(ctx context.Context, workloadType gridtypes.WorkloadType, data *CheckData) []HealthCheck {
	var run func(context.Context, *CheckData) []HealthCheck

	switch workloadType {
	case zos.NetworkType, zos.NetworkLightType:
		run = NetworkCheckerInstance.Run
	case zos.ZMachineType, zos.ZMachineLightType:
		run = VMCheckerInstance.Run
	}

	if run == nil {
		// No checker is registered for this workload type.
		return nil
	}
	return run(ctx, data)
}
140 changes: 140 additions & 0 deletions pkg/debugcmd/checks/network.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package checks

import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"

cnins "github.com/containernetworking/plugins/pkg/ns"
"github.com/threefoldtech/zosbase/pkg"
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
"github.com/threefoldtech/zosbase/pkg/network/namespace"
"github.com/threefoldtech/zosbase/pkg/network/nr"
"github.com/threefoldtech/zosbase/pkg/versioned"
"github.com/threefoldtech/zosbase/pkg/zinit"
"github.com/vishvananda/netlink"
)

const (
// networkdVolatileDir is the base directory under which the network checks
// look for state files.
networkdVolatileDir = "/var/run/cache/networkd"
// networksDir is the subdirectory of networkdVolatileDir that holds one
// config file per network ID.
networksDir = "networks"
// myceliumKeyDir is the subdirectory of networkdVolatileDir whose path is
// passed as the second argument to nr.New.
myceliumKeyDir = "mycelium-key"
)

// NetworkChecker runs the health checks for a network workload. Its fields
// are filled in by Run and read by the individual check methods.
type NetworkChecker struct {
// netID is the network ID derived from the twin and the workload name.
netID zos.NetID
// nsName is the value returned by CheckData.Network; the checks use it as
// the network namespace name.
nsName string
// netCfgPath is the path of the network config file read by checkConfig.
netCfgPath string
// nrr is the network resource used to look up interface, bridge and
// mycelium service names.
nrr *nr.NetResource
}

// Name returns the identifier of this checker, "network".
func (nc *NetworkChecker) Name() string {
	const name = "network"
	return name
}

// Run derives the network ID from data.Twin and data.Workload.Name, records
// the per-run state on the receiver, and then runs the config, namespace,
// interfaces, bridge and mycelium checks in that order, returning one
// HealthCheck per check.
//
// The per-run state is written to the receiver's fields without any
// synchronization.
func (nc *NetworkChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
	id := zos.NetworkID(data.Twin, data.Workload.Name)
	keyDir := filepath.Join(networkdVolatileDir, myceliumKeyDir)

	nc.netID = id
	nc.nsName = data.Network(ctx, id)
	nc.netCfgPath = filepath.Join(networkdVolatileDir, networksDir, id.String())
	nc.nrr = nr.New(pkg.Network{NetID: id}, keyDir)

	steps := []func() HealthCheck{
		nc.checkConfig,
		nc.checkNamespace,
		nc.checkInterfaces,
		nc.checkBridge,
		nc.checkMycelium,
	}

	results := make([]HealthCheck, 0, len(steps))
	for _, step := range steps {
		results = append(results, step())
	}
	return results
}

// checkConfig reads the versioned network config file at nc.netCfgPath,
// decodes its payload as JSON into a pkg.Network and verifies that the stored
// NetID equals the expected one. It fails when the file cannot be read, cannot
// be decoded, or carries a different NetID.
func (nc *NetworkChecker) checkConfig() HealthCheck {
	const check = "network.config"

	wantID := nc.netID.String()
	location := func() map[string]interface{} {
		return map[string]interface{}{"path": nc.netCfgPath, "netid": wantID}
	}

	_, payload, err := versioned.ReadFile(nc.netCfgPath)
	if err != nil {
		return failure(check, fmt.Sprintf("config file not found: %v", err), location())
	}

	var stored pkg.Network
	if err := json.Unmarshal(payload, &stored); err != nil {
		return failure(check, fmt.Sprintf("config file invalid: %v", err), location())
	}

	if stored.NetID != nc.netID {
		gotID := stored.NetID.String()
		return failure(
			check,
			fmt.Sprintf("netid mismatch: expected %s, got %s", wantID, gotID),
			map[string]interface{}{"expected": wantID, "got": gotID},
		)
	}

	return success(check, "config valid", location())
}

// checkNamespace reports whether the namespace named nc.nsName exists
// according to namespace.Exists.
func (nc *NetworkChecker) checkNamespace() HealthCheck {
	const check = "network.namespace"

	details := map[string]interface{}{"namespace": nc.nsName}
	if namespace.Exists(nc.nsName) {
		return success(check, "namespace exists", details)
	}
	return failure(check, "namespace not found", details)
}

// checkInterfaces verifies that the namespace nc.nsName contains the three
// required links: the name returned by nrr.WGName, the name returned by
// nrr.NRIface, and "public".
//
// Each preparatory step reports its own failure instead of being folded into
// a "missing interfaces" result: deriving the interface names, opening the
// namespace, and listing its links. Only when the link list was actually
// obtained does the check report which required interfaces are absent.
func (nc *NetworkChecker) checkInterfaces() HealthCheck {
	const check = "network.interfaces"

	wgIface, err := nc.nrr.WGName()
	if err != nil {
		return failure(check, fmt.Sprintf("cannot derive wireguard interface name: %v", err), map[string]interface{}{"namespace": nc.nsName})
	}
	nrIface, err := nc.nrr.NRIface()
	if err != nil {
		return failure(check, fmt.Sprintf("cannot derive network resource interface name: %v", err), map[string]interface{}{"namespace": nc.nsName})
	}
	required := []string{wgIface, nrIface, "public"}

	netNS, err := namespace.GetByName(nc.nsName)
	if err != nil {
		return failure(check, fmt.Sprintf("cannot open namespace: %v", err), map[string]interface{}{"namespace": nc.nsName})
	}
	// Release the namespace handle on every return path.
	defer netNS.Close()

	// Collect the names of all links visible from inside the namespace.
	present := map[string]struct{}{}
	err = netNS.Do(func(_ cnins.NetNS) error {
		links, err := netlink.LinkList()
		if err != nil {
			return err
		}
		for _, l := range links {
			present[l.Attrs().Name] = struct{}{}
		}
		return nil
	})
	if err != nil {
		return failure(check, fmt.Sprintf("cannot list interfaces: %v", err), map[string]interface{}{"namespace": nc.nsName})
	}

	var missing []string
	for _, iface := range required {
		if _, ok := present[iface]; !ok {
			missing = append(missing, iface)
		}
	}

	if len(missing) > 0 {
		return failure(check, fmt.Sprintf("missing interfaces: %v", missing), map[string]interface{}{"namespace": nc.nsName, "missing": missing})
	}

	return success(check, "all required interfaces present", map[string]interface{}{"namespace": nc.nsName})
}

// checkBridge verifies that the bridge named by nrr.BridgeName exists under
// /sys/class/net and that its "brif" directory lists at least one member.
//
// A failure to derive the bridge name, a failure to read the member
// directory, and an empty member list are reported as three distinct
// outcomes, so the message never contains a formatted nil error.
func (nc *NetworkChecker) checkBridge() HealthCheck {
	const check = "network.bridge"

	brName, err := nc.nrr.BridgeName()
	if err != nil {
		// Without a name the sysfs path below would point at /sys/class/net
		// itself, which exists and would make the check misleading.
		return failure(check, fmt.Sprintf("cannot derive bridge name: %v", err), nil)
	}
	evidence := map[string]interface{}{"bridge": brName}

	brPath := filepath.Join("/sys/class/net", brName)
	if _, err := os.Stat(brPath); err != nil {
		return failure(check, fmt.Sprintf("bridge not found: %v", err), evidence)
	}

	members, err := os.ReadDir(filepath.Join(brPath, "brif"))
	if err != nil {
		return failure(check, fmt.Sprintf("cannot list bridge members: %v", err), evidence)
	}
	if len(members) == 0 {
		return failure(check, "bridge has no members", evidence)
	}

	return success(check, "bridge has members", evidence)
}

// checkMycelium asks zinit for the status of the service named by
// nrr.MyceliumServiceName and passes only when that service is in the running
// state. On success the evidence includes the service pid.
func (nc *NetworkChecker) checkMycelium() HealthCheck {
	const check = "network.mycelium"

	service := nc.nrr.MyceliumServiceName()
	status, err := zinit.Default().Status(service)

	switch {
	case err != nil:
		return failure(check, fmt.Sprintf("cannot get service status: %v", err), map[string]interface{}{"service": service})
	case !status.State.Is(zinit.ServiceStateRunning):
		return failure(
			check,
			fmt.Sprintf("service not running: %s", status.State.String()),
			map[string]interface{}{"service": service, "state": status.State.String()},
		)
	}

	return success(check, "service running", map[string]interface{}{"service": service, "pid": status.Pid})
}

// NetworkCheckerInstance is the package-level NetworkChecker that Run uses
// for network workloads. NetworkChecker.Run stores per-run state on it.
var NetworkCheckerInstance = new(NetworkChecker)
46 changes: 46 additions & 0 deletions pkg/debugcmd/checks/system.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package checks

import (
"context"
"fmt"
"os/exec"
"strings"
"time"
)

// systemProbeTimeout bounds how long a single system probe command may run.
const systemProbeTimeout = time.Minute

// SystemChecker runs an external probe command and reports its outcome.
type SystemChecker struct {
// command is the probe command line. Run splits it on whitespace; an empty
// string makes Run return no checks.
command string
}

// Name returns the identifier of this checker, "system".
func (sc *SystemChecker) Name() string {
	const name = "system"
	return name
}

// Run executes the configured probe command and reports the outcome as a
// single "system.probe" HealthCheck.
//
// The command string is split on whitespace and executed directly (no shell),
// so quoting and shell operators are not interpreted. The probe is bounded by
// systemProbeTimeout on top of ctx. data is not used.
//
// Returns nil when no command is configured. On failure the evidence keeps
// the combined stdout/stderr captured so far, the exit code when the process
// was started, and the context error when the probe was cut short by the
// timeout or by cancellation of ctx.
func (sc *SystemChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
	if sc.command == "" {
		return nil
	}

	parts := strings.Fields(sc.command)
	if len(parts) == 0 {
		// The command consisted of whitespace only.
		return []HealthCheck{failure("system.probe", "empty probe command", nil)}
	}

	probeCtx, cancel := context.WithTimeout(ctx, systemProbeTimeout)
	defer cancel()

	cmd := exec.CommandContext(probeCtx, parts[0], parts[1:]...)
	output, err := cmd.CombinedOutput()
	if err != nil {
		evidence := map[string]interface{}{
			"error":  err.Error(),
			"output": string(output),
		}
		// ProcessState is nil when the command could not be started at all.
		if cmd.ProcessState != nil {
			evidence["exit_code"] = cmd.ProcessState.ExitCode()
		}
		// Distinguish a probe that was killed by the deadline/cancellation
		// from one that exited non-zero on its own.
		if ctxErr := probeCtx.Err(); ctxErr != nil {
			evidence["context_error"] = ctxErr.Error()
		}
		return []HealthCheck{failure("system.probe", fmt.Sprintf("probe failed: %v", err), evidence)}
	}

	return []HealthCheck{success("system.probe", "probe executed successfully", map[string]interface{}{
		"output":    string(output),
		"exit_code": cmd.ProcessState.ExitCode(),
	})}
}

// NewSystemChecker returns a SystemChecker that will run command as its probe.
func NewSystemChecker(command string) *SystemChecker {
	checker := new(SystemChecker)
	checker.command = command
	return checker
}
120 changes: 120 additions & 0 deletions pkg/debugcmd/checks/vm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package checks

import (
"context"
"fmt"
"os"
"path/filepath"

"github.com/threefoldtech/zosbase/pkg/gridtypes"
"github.com/threefoldtech/zosbase/pkg/vm"
)

// vmdVolatileDir is the directory in which the VM checks expect one machine
// config file per workload ID.
const vmdVolatileDir = "/var/run/cache/vmd"

// VMChecker runs the health checks for a VM workload. Its fields are filled
// in by Run and read by the individual check methods.
type VMChecker struct {
// workloadID is the ID built from the twin, contract and workload name.
workloadID gridtypes.WorkloadID
// vmID is the string form of workloadID.
vmID string
// cfgPath is the path of the machine config file for this VM.
cfgPath string
// machine caches the config parsed by loadMachine.
machine *vm.Machine
// vmExists is copied from CheckData.VM and reports whether a VM with the
// given id exists.
vmExists func(ctx context.Context, id string) bool
}

// Name returns the identifier of this checker, "vm".
func (vc *VMChecker) Name() string {
	const name = "vm"
	return name
}

// Run builds the workload ID from data.Twin, data.Contract and
// data.Workload.Name, records the per-run state on the receiver, and then
// runs the config, vmd, process, disks and virtiofs checks in that order.
//
// If the workload ID cannot be built it returns a single failing "vm.init"
// check instead.
func (vc *VMChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
	workloadID, err := gridtypes.NewWorkloadID(data.Twin, data.Contract, data.Workload.Name)
	if err != nil {
		return []HealthCheck{failure("vm.init", fmt.Sprintf("invalid workload ID: %v", err), nil)}
	}

	vc.workloadID = workloadID
	vc.vmID = workloadID.String()
	vc.cfgPath = filepath.Join(vmdVolatileDir, workloadID.String())
	vc.vmExists = data.VM
	// The checker is reused across calls (VMCheckerInstance) and loadMachine
	// returns the cached machine whenever one is set. Drop the cache so the
	// disk and virtiofs checks parse the config of THIS workload rather than
	// reusing the machine loaded by an earlier Run.
	vc.machine = nil

	return []HealthCheck{
		vc.checkConfig(),
		vc.checkVMD(ctx),
		vc.checkProcess(),
		vc.checkDisks(),
		vc.checkVirtioFS(),
	}
}

// loadMachine returns the machine config for vc.cfgPath. The first successful
// parse is stored in vc.machine and returned by later calls without reading
// the file again; this method never clears that cache. A parse error is
// returned as-is and nothing is cached.
func (vc *VMChecker) loadMachine() (*vm.Machine, error) {
	if vc.machine == nil {
		parsed, err := vm.MachineFromFile(vc.cfgPath)
		if err != nil {
			return nil, err
		}
		vc.machine = parsed
	}
	return vc.machine, nil
}

// checkConfig verifies that the machine config file at vc.cfgPath can be
// stat'ed and that vm.MachineFromFile can parse it. The parsed machine is
// discarded; only the absence of errors matters here.
func (vc *VMChecker) checkConfig() HealthCheck {
	const check = "vm.config"

	_, statErr := os.Stat(vc.cfgPath)
	if statErr != nil {
		return failure(check, fmt.Sprintf("config file not found: %v", statErr), map[string]interface{}{"path": vc.cfgPath})
	}

	_, parseErr := vm.MachineFromFile(vc.cfgPath)
	if parseErr != nil {
		return failure(check, fmt.Sprintf("config file invalid: %v", parseErr), map[string]interface{}{"path": vc.cfgPath})
	}

	return success(check, "config valid", map[string]interface{}{"path": vc.cfgPath, "vm_id": vc.vmID})
}

// checkVMD calls the vmExists callback with the VM id and passes when the
// callback returns true.
func (vc *VMChecker) checkVMD(ctx context.Context) HealthCheck {
	const check = "vm.vmd"

	details := map[string]interface{}{"vm_id": vc.vmID}
	if vc.vmExists(ctx, vc.vmID) {
		return success(check, "vmd reports VM exists", details)
	}
	return failure(check, "vmd reports VM does not exist", details)
}

// checkProcess looks up the VM's process with vm.Find and passes when the
// lookup returns no error. On success the evidence includes the process pid.
func (vc *VMChecker) checkProcess() HealthCheck {
	const check = "vm.process"

	proc, err := vm.Find(vc.vmID)
	if err == nil {
		return success(check, "process running", map[string]interface{}{"vm_id": vc.vmID, "pid": proc.Pid})
	}
	return failure(check, fmt.Sprintf("process not found: %v", err), map[string]interface{}{"vm_id": vc.vmID})
}

// checkDisks verifies that every disk listed in the machine config with a
// non-empty Path can be stat'ed. It stops at the first disk that fails.
//
// When the machine config cannot be loaded, the underlying error is included
// in the message, matching what checkVirtioFS reports for the same condition.
// For a failing disk the stat error is added to the evidence so that a
// missing file can be told apart from, for example, a permission problem.
func (vc *VMChecker) checkDisks() HealthCheck {
	machine, err := vc.loadMachine()
	if err != nil {
		return failure("vm.disks", fmt.Sprintf("config unavailable: %v", err), map[string]interface{}{"vm_id": vc.vmID})
	}

	for _, disk := range machine.Disks {
		if disk.Path == "" {
			// Nothing to stat for a disk without a path.
			continue
		}
		if _, err := os.Stat(disk.Path); err != nil {
			return failure("vm.disks", fmt.Sprintf("disk missing: %s", disk.Path), map[string]interface{}{
				"path":  disk.Path,
				"vm_id": vc.vmID,
				"error": err.Error(),
			})
		}
	}

	// TODO: check for files on disks?

	return success("vm.disks", "all disks valid", map[string]interface{}{"vm_id": vc.vmID})
}

// checkVirtioFS verifies that, for each entry in the machine config's FS
// list, the file /var/run/virtio-<vmID>-<index>.socket can be stat'ed. It
// stops at the first socket that fails, and fails outright when the machine
// config cannot be loaded.
func (vc *VMChecker) checkVirtioFS() HealthCheck {
	const check = "vm.virtiofs"

	machine, err := vc.loadMachine()
	if err != nil {
		return failure(check, fmt.Sprintf("config unavailable: %v", err), map[string]interface{}{"vm_id": vc.vmID})
	}

	for idx := 0; idx < len(machine.FS); idx++ {
		socketName := fmt.Sprintf("virtio-%s-%d.socket", vc.vmID, idx)
		sock := filepath.Join("/var/run", socketName)

		_, statErr := os.Stat(sock)
		if statErr == nil {
			continue
		}
		return failure(check, fmt.Sprintf("socket missing: %s", sock), map[string]interface{}{"socket": sock, "vm_id": vc.vmID})
	}

	return success(check, "all virtiofs sockets present", map[string]interface{}{"vm_id": vc.vmID})
}

// TODO: add cloud-console check

// VMCheckerInstance is the package-level VMChecker that Run uses for zmachine
// workloads. VMChecker.Run stores per-run state on it.
var VMCheckerInstance = new(VMChecker)
Loading
Loading