From 5d228bb4397acc24dba542484fe47fbff2ea929b Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri <44365948+mahendrapaipuri@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:44:18 +0200 Subject: [PATCH] Add openstack resource manager support to API server (#196) * Add openstack resource manager support to API server. Openstack resource manager fetches compute units, users and projects data from Openstack's compute and identity API services. * Mock compute and identity API servers have been added for e2e tests. * A mix of SLURM and openstack tests are used in e2e tests to cover most of the scenarios. e2e test outputs have been updated. * docs: Update config docs --------- Signed-off-by: Mahendra Paipuri --- .promu-go-test.yml | 4 +- Makefile | 2 +- README.md | 2 +- cmd/ceems_api_server/main.go | 4 +- cmd/{mock_tsdb => mock_servers}/main.go | 169 ++- pkg/api/resource/openstack/compute.go | 316 +++++ pkg/api/resource/openstack/identity.go | 214 +++ pkg/api/resource/openstack/manager.go | 216 +++ pkg/api/resource/openstack/manager_test.go | 372 ++++++ pkg/api/resource/openstack/request.go | 40 + pkg/api/resource/openstack/types.go | 370 ++++++ pkg/api/resource/slurm/manager.go | 2 +- pkg/api/testdata/config.yml | 33 + .../testdata/openstack/compute/deleted.json | 316 +++++ .../testdata/openstack/compute/flavors.json | 316 +++++ .../testdata/openstack/compute/servers.json | 1099 ++++++++++++++++ .../03b060551ecc488b8756c9f27258d71e.json | 51 + .../4223638a14e44980bf8557cd3ba14e76.json | 23 + .../5fd1986befa042a4b866944f5adbefeb.json | 37 + .../adbc53ea724f4e2bb954e27725b6cf5b.json | 37 + .../dc87e591c0d247d5ac04e873bd8a1646.json | 23 + .../testdata/openstack/identity/users.json | 76 ++ ...server-admin-query-all-selected-fields.txt | 2 +- ...2e-test-api-server-cluster-admin-query.txt | 2 +- ...t-api-server-current-stats-admin-query.txt | 2 +- ...current-usage-admin-experimental-query.txt | 2 +- ...erver-current-usage-experimental-query.txt | 2 +- ...st-api-server-global-stats-admin-query.txt | 2 +- ...2e-test-api-server-project-admin-query.txt | 2 +- .../e2e-test-api-server-running-query.txt | 2 +- pkg/collector/gpu_test.go | 76 ++ scripts/e2e-test.sh | 36 +- website/cspell.json | 4 +- website/docs/00-introduction.md | 12 + .../docs/configuration/ceems-api-server.md | 1155 +++++++---------- website/docs/configuration/ceems-lb.md | 337 +++-- .../docs/configuration/config-reference.md | 817 ++++++++++++ website/docs/configuration/grafana.md | 115 ++ website/docs/configuration/prometheus.md | 35 + website/docusaurus.config.ts | 2 +- .../src/components/HomepageFeatures/index.tsx | 13 +- website/static/img/ebpf.svg | 67 + 42 files changed, 5613 insertions(+), 794 deletions(-) rename cmd/{mock_tsdb => mock_servers}/main.go (70%) create mode 100644 pkg/api/resource/openstack/compute.go create mode 100644 pkg/api/resource/openstack/identity.go create mode 100644 pkg/api/resource/openstack/manager.go create mode 100644 pkg/api/resource/openstack/manager_test.go create mode 100644 pkg/api/resource/openstack/request.go create mode 100644 pkg/api/resource/openstack/types.go create mode 100644 pkg/api/testdata/openstack/compute/deleted.json create mode 100644 pkg/api/testdata/openstack/compute/flavors.json create mode 100644 pkg/api/testdata/openstack/compute/servers.json create mode 100644 pkg/api/testdata/openstack/identity/03b060551ecc488b8756c9f27258d71e.json create mode 100644 pkg/api/testdata/openstack/identity/4223638a14e44980bf8557cd3ba14e76.json create mode 100644 
pkg/api/testdata/openstack/identity/5fd1986befa042a4b866944f5adbefeb.json create mode 100644 pkg/api/testdata/openstack/identity/adbc53ea724f4e2bb954e27725b6cf5b.json create mode 100644 pkg/api/testdata/openstack/identity/dc87e591c0d247d5ac04e873bd8a1646.json create mode 100644 pkg/api/testdata/openstack/identity/users.json create mode 100644 website/docs/configuration/grafana.md create mode 100644 website/docs/configuration/prometheus.md create mode 100644 website/static/img/ebpf.svg diff --git a/.promu-go-test.yml b/.promu-go-test.yml index e084a01f..dd08402f 100644 --- a/.promu-go-test.yml +++ b/.promu-go-test.yml @@ -7,8 +7,8 @@ repository: path: github.com/mahendrapaipuri/ceems build: binaries: - - name: mock_tsdb - path: ./cmd/mock_tsdb + - name: mock_servers + path: ./cmd/mock_servers tags: all: [osusergo, netgo, static_build] flags: -a diff --git a/Makefile b/Makefile index 00f54393..485cb941 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ ifeq ($(CGO_BUILD), 1) PROMU_CONF ?= .promu-cgo.yml pkgs := ./pkg/sqlite3 ./pkg/api/cli \ ./pkg/api/db ./pkg/api/helper \ - ./pkg/api/resource ./pkg/api/resource/slurm \ + ./pkg/api/resource ./pkg/api/resource/slurm ./pkg/api/resource/openstack \ ./pkg/api/updater \ ./pkg/api/http ./cmd/ceems_api_server \ ./pkg/lb/backend ./pkg/lb/cli \ diff --git a/README.md b/README.md index 6ffe255b..c7d52d7e 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Installation instructions of CEEMS components can be found in ## Visualizing metrics with Grafana CEEMS is meant to be used with Grafana for visualization and below are some of the -screenshots few possible metrics. +screenshots of dashboards. ### Time series compute unit CPU metrics diff --git a/cmd/ceems_api_server/main.go b/cmd/ceems_api_server/main.go index 0017bddd..f7ca59ab 100644 --- a/cmd/ceems_api_server/main.go +++ b/cmd/ceems_api_server/main.go @@ -1,13 +1,13 @@ package main +// We need to import each resource manager and updater package here. import ( "log" "os" "github.com/mahendrapaipuri/ceems/pkg/api/cli" - // We need to import each resource manager package here to call init function. + _ "github.com/mahendrapaipuri/ceems/pkg/api/resource/openstack" _ "github.com/mahendrapaipuri/ceems/pkg/api/resource/slurm" - // We need to import each updater package here to call init function. _ "github.com/mahendrapaipuri/ceems/pkg/api/updater/tsdb" ) diff --git a/cmd/mock_tsdb/main.go b/cmd/mock_servers/main.go similarity index 70% rename from cmd/mock_tsdb/main.go rename to cmd/mock_servers/main.go index 0e7bbbc5..79a06dbc 100644 --- a/cmd/mock_tsdb/main.go +++ b/cmd/mock_servers/main.go @@ -1,22 +1,31 @@ package main import ( + "context" "encoding/json" + "fmt" "hash/fnv" "log" "math" "net/http" + "os" + "os/signal" "regexp" "slices" "strconv" "strings" + "syscall" "time" "github.com/mahendrapaipuri/ceems/pkg/tsdb" ) -// Default port Prometheus listens on. -const portNum string = ":9090" +// Default ports. +const ( + promPortNum = ":9090" + osNovaPortNum = ":8080" + osKSPortNum = ":7070" +) // Regex to capture query. var ( @@ -211,21 +220,101 @@ func ConfigHandler(w http.ResponseWriter, r *http.Request) { } } -func main() { +// ServersHandler handles OS compute servers. 
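+// It serves the canned fixtures under pkg/api/testdata/openstack/compute,
+// returning the deleted-servers fixture when the request carries a "deleted"
+// query parameter, mirroring Nova's /servers/detail API.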
+func ServersHandler(w http.ResponseWriter, r *http.Request) {
+    var fileName string
+    if _, ok := r.URL.Query()["deleted"]; ok {
+        fileName = "deleted"
+    } else {
+        fileName = "servers"
+    }
+
+    if data, err := os.ReadFile(fmt.Sprintf("pkg/api/testdata/openstack/compute/%s.json", fileName)); err == nil {
+        w.Write(data)
+
+        return
+    }
+
+    w.WriteHeader(http.StatusInternalServerError)
+    w.Write([]byte("KO"))
+}
+
+// UsersHandler handles OS users.
+func UsersHandler(w http.ResponseWriter, r *http.Request) {
+    if data, err := os.ReadFile("pkg/api/testdata/openstack/identity/users.json"); err == nil {
+        w.Write(data)
+
+        return
+    }
+
+    w.WriteHeader(http.StatusInternalServerError)
+    w.Write([]byte("KO"))
+}
+
+// ProjectsHandler handles OS projects.
+func ProjectsHandler(w http.ResponseWriter, r *http.Request) {
+    userID := r.PathValue("id")
+    if data, err := os.ReadFile(fmt.Sprintf("pkg/api/testdata/openstack/identity/%s.json", userID)); err == nil {
+        w.Write(data)
+
+        return
+    }
+
+    w.WriteHeader(http.StatusInternalServerError)
+    w.Write([]byte("KO"))
+}
+
+func promServer(ctx context.Context) {
 	log.Println("Starting fake prometheus server")
 
 	// Registering our handler functions, and creating paths.
-	http.HandleFunc("/api/v1/query", QueryHandler)
-	http.HandleFunc("/api/v1/status/config", ConfigHandler)
+	promMux := http.NewServeMux()
+	promMux.HandleFunc("/api/v1/query", QueryHandler)
+	promMux.HandleFunc("/api/v1/status/config", ConfigHandler)
+
+    log.Println("Started Prometheus on port", promPortNum)
+    log.Println("To close connection CTRL+C :-)")
+
+    // Start server
+    server := &http.Server{
+        Addr:              promPortNum,
+        ReadHeaderTimeout: 3 * time.Second,
+        Handler:           promMux,
+    }
+    defer func() {
+        if err := server.Shutdown(ctx); err != nil {
+            log.Println("Failed to shutdown fake Prometheus server", err)
+        }
+    }()
+
+    // Spinning up the server.
+    err := server.ListenAndServe()
+    if err != nil {
+        log.Fatal(err)
+    }
+}
+
+func osNovaServer(ctx context.Context) {
+    log.Println("Starting fake Openstack compute API server")
+
+    // Registering our handler functions, and creating paths.
+    osNovaMux := http.NewServeMux()
+    osNovaMux.HandleFunc("/v2.1/servers/detail", ServersHandler)
 
-	log.Println("Started on port", portNum)
+	log.Println("Started Openstack compute API server on port", osNovaPortNum)
 	log.Println("To close connection CTRL+C :-)")
 
 	// Start server
 	server := &http.Server{
-		Addr:              portNum,
+		Addr:              osNovaPortNum,
 		ReadHeaderTimeout: 3 * time.Second,
+		Handler:           osNovaMux,
 	}
+	defer func() {
+		if err := server.Shutdown(ctx); err != nil {
+			log.Println("Failed to shutdown fake Openstack compute API server", err)
+		}
+	}()
 
 	// Spinning up the server.
 	err := server.ListenAndServe()
@@ -233,3 +322,69 @@
 		log.Fatal(err)
 	}
 }
+
+func osKSServer(ctx context.Context) {
+    log.Println("Starting fake Openstack identity API server")
+
+    // Registering our handler functions, and creating paths.
+    osKSMux := http.NewServeMux()
+    osKSMux.HandleFunc("/v3/users", UsersHandler)
+    osKSMux.HandleFunc("/v3/users/{id}/projects", ProjectsHandler)
+
+    log.Println("Started Openstack identity API server on port", osKSPortNum)
+    log.Println("To close connection CTRL+C :-)")
+
+    // Start server
+    server := &http.Server{
+        Addr:              osKSPortNum,
+        ReadHeaderTimeout: 3 * time.Second,
+        Handler:           osKSMux,
+    }
+    defer func() {
+        if err := server.Shutdown(ctx); err != nil {
+            log.Println("Failed to shutdown fake Openstack identity API server", err)
+        }
+    }()
+
+    // Spinning up the server.
+    err := server.ListenAndServe()
+    if err != nil {
+        log.Fatal(err)
+    }
+}
+
+func main() {
+    log.Println("Starting fake test servers")
+
+    args := os.Args[1:]
+
+    // Set up a cancellable context and signal handling to stop the servers.
+    ctx, cancel := context.WithCancel(context.Background())
+    sigs := make(chan os.Signal, 1)
+    signal.Notify(sigs, syscall.SIGINT)
+
+    if slices.Contains(args, "prom") {
+        go func() {
+            promServer(ctx)
+        }()
+    }
+
+    if slices.Contains(args, "os-compute") {
+        go func() {
+            osNovaServer(ctx)
+        }()
+    }
+
+    if slices.Contains(args, "os-identity") {
+        go func() {
+            osKSServer(ctx)
+        }()
+    }
+
+    sig := <-sigs
+    log.Println(sig)
+
+    cancel()
+
+    log.Println("Fake test servers have been stopped")
+}
diff --git a/pkg/api/resource/openstack/compute.go b/pkg/api/resource/openstack/compute.go
new file mode 100644
index 00000000..6503c680
--- /dev/null
+++ b/pkg/api/resource/openstack/compute.go
@@ -0,0 +1,316 @@
+package openstack
+
+import (
+    "context"
+    "errors"
+    "fmt"
+    "net/http"
+    "slices"
+    "strconv"
+    "strings"
+    "sync"
+    "time"
+
+    "github.com/go-kit/log/level"
+    "github.com/mahendrapaipuri/ceems/pkg/api/models"
+)
+
+var (
+    inactiveStatus = []string{
+        "SHUTOFF",
+        "SUSPENDED",
+        "SHELVED",
+        "SHELVED_OFFLOADED",
+        "ERROR",
+    }
+    deletedStatus = []string{
+        "DELETED",
+        "SOFT_DELETED",
+    }
+
+    serversLock = sync.RWMutex{}
+    projectLock = sync.RWMutex{}
+    errsLock    = sync.RWMutex{}
+)
+
+// Timespan is a custom type to format time.Duration.
+type Timespan time.Duration
+
+// Format formats the time.Duration.
+func (t Timespan) Format(format string) string {
+    z := time.Unix(0, 0).UTC()
+    duration := time.Duration(t)
+    day := 24 * time.Hour
+
+    if duration > day {
+        days := duration / day
+
+        return fmt.Sprintf("%d-%s", days, z.Add(duration).Format(format))
+    }
+
+    return z.Add(duration).Format(format)
+}
+
+func (o *openstackManager) activeInstances(ctx context.Context, start time.Time, end time.Time) ([]models.Unit, error) {
+    // Check if service is online
+    if err := o.ping("compute"); err != nil {
+        return nil, err
+    }
+
+    // Start a wait group
+    wg := sync.WaitGroup{}
+
+    // Increment by 2: one for active instances, one for deleted instances
+    wg.Add(2)
+
+    var allServers []Server
+
+    var allErrs error
+
+    // Active instances
+    go func() {
+        defer wg.Done()
+
+        // Fetch active servers
+        servers, err := o.fetchInstances(ctx, start, end, false)
+        if err != nil {
+            errsLock.Lock()
+            allErrs = errors.Join(allErrs, fmt.Errorf("failed to fetch active instances: %w", err))
+            errsLock.Unlock()
+
+            return
+        }
+
+        serversLock.Lock()
+        allServers = append(allServers, servers...)
+        serversLock.Unlock()
+    }()
+
+    // Deleted instances
+    go func() {
+        defer wg.Done()
+
+        // Fetch deleted servers
+        servers, err := o.fetchInstances(ctx, start, end, true)
+        if err != nil {
+            errsLock.Lock()
+            allErrs = errors.Join(allErrs, fmt.Errorf("failed to fetch deleted instances: %w", err))
+            errsLock.Unlock()
+
+            return
+        }
+
+        serversLock.Lock()
+        allServers = append(allServers, servers...)
+        serversLock.Unlock()
+    }()
+
+    // Wait for all goroutines
+    wg.Wait()
+
+    // If no servers found, return error(s)
+    if len(allServers) == 0 {
+        return nil, allErrs
+    }
+
+    // // Check if there are any new flavors in list of instances
+    // for _, server := range allServers {
+    // 	if _, ok := o.activeFlavors[server.Flavor.ID]; !ok {
+    // 		if err := o.updateFlavors(ctx); err != nil {
+    // 			level.Info(o.logger).Log("msg", "Failed to update instance flavors for Openstack cluster", "id", o.cluster.ID, "err", err)
+    // 		}
+
+    // 		break
+    // 	}
+    // }
+
+    // Transform servers into units
+    units := make([]models.Unit, len(allServers))
+
+    // Update interval period
+    updateIntPeriod := end.Sub(start).Seconds()
+
+    var iServer int
+
+    for _, server := range allServers {
+        // Get elapsed time of instance including shutdowns, suspended states
+        elapsedTime := Timespan(end.Sub(server.LaunchedAt)).Format("15:04:05")
+
+        // Initialise endedAt, endedAtTS
+        endedAt := "N/A"
+
+        var endedAtTS int64 = 0
+
+        // Get actual running time of the instance within this
+        // update period
+        var activeTimeSeconds float64
+
+        if slices.Contains(deletedStatus, server.Status) {
+            // Override elapsed time for deleted instances
+            elapsedTime = Timespan(server.TerminatedAt.Sub(server.LaunchedAt)).Format("15:04:05")
+
+            // Get instance termination time
+            endedAt = server.TerminatedAt.Format(osTimeFormat)
+            endedAtTS = server.TerminatedAt.UnixMilli()
+
+            // If the instance has been terminated in this update interval,
+            // update activeTime from start till termination
+            if server.TerminatedAt.After(start) {
+                activeTimeSeconds = server.TerminatedAt.Sub(start).Seconds()
+            }
+        } else if slices.Contains(inactiveStatus, server.Status) {
+            // If the server status has changed in this update interval,
+            // update activeTime from start till update time
+            if server.UpdatedAt.After(start) {
+                activeTimeSeconds = server.UpdatedAt.Sub(start).Seconds()
+            }
+        } else {
+            // If the update time is after start of this interval, it means instance
+            // has changed from inactive to active
+            if server.UpdatedAt.After(start) {
+                activeTimeSeconds = end.Sub(server.UpdatedAt).Seconds()
+            } else {
+                activeTimeSeconds = updateIntPeriod
+            }
+        }
+
+        // vCPUs and memory come straight from the flavor; no parsing needed
+        var vcpu, cpuMem, vgpu float64
+        vcpu = float64(server.Flavor.VCPUs)
+
+        // RAM is always reported in MiB
+        cpuMem = float64(server.Flavor.RAM)
+
+        // Check if instance has vGPUs
+        if v, ok := server.Flavor.ExtraSpecs["resources:VGPU"]; ok {
+            // Ignore any errors during parsing.
Should not happen + vgpu, _ = strconv.ParseFloat(v, 64) + } + + // Total time + totalTime := models.MetricMap{ + "walltime": models.JSONFloat(activeTimeSeconds), + "alloc_cputime": models.JSONFloat(vcpu * activeTimeSeconds), + "alloc_cpumemtime": models.JSONFloat(cpuMem * activeTimeSeconds), + "alloc_gputime": models.JSONFloat(vgpu * activeTimeSeconds), + "alloc_gpumemtime": models.JSONFloat(vgpu * activeTimeSeconds), + } + + // Allocation + allocation := models.Allocation{ + "vcpus": server.Flavor.VCPUs, + "mem": server.Flavor.RAM, + "disk": server.Flavor.Disk, + "swap": server.Flavor.Swap, + "name": server.Flavor.Name, + "extra_specs": server.Flavor.ExtraSpecs, + } + + // Tags + tags := models.Tag{ + "metadata": server.Metadata, + "tags": server.Tags, + "server_groups": strings.Join(server.ServerGroups, ","), + "hypervisor": server.HypervisorHostname, + "reservation_id": server.ReservationID, + "power_state": server.PowerState.String(), + "az": server.AvailabilityZone, + } + + units[iServer] = models.Unit{ + ResourceManager: openstackVMManager, + UUID: server.ID, + Name: server.Name, + Project: o.userProjectsCache.projectIDNameMap[server.TenantID], + User: o.userProjectsCache.userIDNameMap[server.UserID], + CreatedAt: server.CreatedAt.Format(osTimeFormat), + StartedAt: server.LaunchedAt.Format(osTimeFormat), + EndedAt: endedAt, + CreatedAtTS: server.CreatedAt.UnixMilli(), + StartedAtTS: server.LaunchedAt.UnixMilli(), + EndedAtTS: endedAtTS, + Elapsed: elapsedTime, + State: server.Status, + TotalTime: totalTime, + Allocation: allocation, + Tags: tags, + } + + iServer++ + } + + level.Info(o.logger). + Log("msg", "Openstack VM instances fetched", "cluster_id", o.cluster.ID, "start", start, "end", end, "num_instances", len(units)) + + return units, nil +} + +// fetchInstances fetches a list of active/deleted compute instances from Openstack cluster. +func (o *openstackManager) fetchInstances(ctx context.Context, start time.Time, end time.Time, deleted bool) ([]Server, error) { + // Create a new GET request + req, err := http.NewRequestWithContext( + ctx, + http.MethodGet, + o.servers().String(), + nil, + ) + if err != nil { + return nil, fmt.Errorf("failed to create request to fetch Openstack instances: %w", err) + } + + // Add query parameters + q := req.URL.Query() + q.Add("all_tenants", "true") + + if deleted { + q.Add("deleted", "true") + q.Add("changes-since", start.Format(osTimeFormat)) + q.Add("changes-until", end.Format(osTimeFormat)) + } + + req.URL.RawQuery = q.Encode() + + // Get response + resp, err := apiRequest[ServersResponse](req, o.client) + if err != nil { + return nil, fmt.Errorf("failed to complete request to fetch Openstack instances: %w", err) + } + + return resp.Servers, nil +} + +// // fetchFlavors fetches a list of active instance flavors from Openstack cluster. 
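+// // NOTE: flavor fetching is disabled for now across this package (see the
+// // commented-out call in activeInstances); the code is kept for reference
+// // since server details already embed the full flavor.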
+// func (o *openstackManager) fetchFlavors(ctx context.Context) ([]Flavor, error) {
+// 	// Create a new GET request
+// 	req, err := http.NewRequestWithContext(
+// 		ctx,
+// 		http.MethodGet,
+// 		o.flavors().String(),
+// 		nil,
+// 	)
+// 	if err != nil {
+// 		return nil, err
+// 	}

+// 	// Get response
+// 	resp, err := apiRequest[FlavorsResponse](req, o.client)
+// 	if err != nil {
+// 		return nil, err
+// 	}

+// 	return resp.Flavors, nil
+// }

+// func (o *openstackManager) updateFlavors(ctx context.Context) error {
+// 	// Fetch current flavors and update map
+// 	if flavors, err := o.fetchFlavors(ctx); err != nil {
+// 		return err
+// 	} else {
+// 		for _, flavor := range flavors {
+// 			o.activeFlavors[flavor.ID] = flavor
+// 		}
+// 	}

+// 	return nil
+// }
diff --git a/pkg/api/resource/openstack/identity.go b/pkg/api/resource/openstack/identity.go
new file mode 100644
index 00000000..1c4ebc64
--- /dev/null
+++ b/pkg/api/resource/openstack/identity.go
@@ -0,0 +1,214 @@
+package openstack
+
+import (
+    "context"
+    "errors"
+    "fmt"
+    "net/http"
+    "slices"
+    "sync"
+    "time"
+
+    "github.com/go-kit/log/level"
+    "github.com/mahendrapaipuri/ceems/pkg/api/helper"
+    "github.com/mahendrapaipuri/ceems/pkg/api/models"
+)
+
+const (
+    chunkSize = 256
+)
+
+// updateUsersProjects updates users and projects of a given Openstack cluster.
+func (o *openstackManager) updateUsersProjects(ctx context.Context, current time.Time) error {
+    // Fetch current users and projects
+    if userProjectsCache, err := o.usersProjectsAssoc(ctx, current); err != nil {
+        return err
+    } else {
+        o.userProjectsCache = userProjectsCache
+        o.userProjectsLastUpdateTime = current
+    }
+
+    return nil
+}
+
+// fetchUsers fetches the list of users from the Openstack cluster.
+func (o *openstackManager) fetchUsers(ctx context.Context) ([]User, error) {
+    // Create a new GET request
+    req, err := http.NewRequestWithContext(
+        ctx,
+        http.MethodGet,
+        o.users().String(),
+        nil,
+    )
+    if err != nil {
+        return nil, fmt.Errorf("failed to create request to fetch users for openstack cluster: %w", err)
+    }
+
+    // Get response
+    resp, err := apiRequest[UsersResponse](req, o.client)
+    if err != nil {
+        return nil, fmt.Errorf("failed to complete request to fetch users for openstack cluster: %w", err)
+    }
+
+    return resp.Users, nil
+}
+
+// fetchUserProjects fetches a list of projects of a specific user from Openstack cluster.
+func (o *openstackManager) fetchUserProjects(ctx context.Context, userID string) ([]Project, error) {
+    // Create a new GET request
+    req, err := http.NewRequestWithContext(
+        ctx,
+        http.MethodGet,
+        o.userProjects(userID).String(),
+        nil,
+    )
+    if err != nil {
+        return nil, fmt.Errorf("failed to create request to fetch user projects for openstack cluster: %w", err)
+    }
+
+    // Get response
+    resp, err := apiRequest[ProjectsResponse](req, o.client)
+    if err != nil {
+        return nil, fmt.Errorf("failed to complete request to fetch user projects for openstack cluster: %w", err)
+    }
+
+    return resp.Projects, nil
+}
+
+// usersProjectsAssoc fetches all users and their projects and builds the
+// user-project associations for the Openstack cluster.
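+// The flow is: list all users via /v3/users, fetch each user's projects via
+// /v3/users/{id}/projects in batches of at most chunkSize concurrent requests,
+// and finally invert the result into per-project user lists.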
+func (o *openstackManager) usersProjectsAssoc(ctx context.Context, current time.Time) (userProjectsCache, error) {
+    // Check if service is online
+    if err := o.ping("identity"); err != nil {
+        return userProjectsCache{}, err
+    }
+
+    // Current time string
+    currentTime := current.Format(osTimeFormat)
+
+    // First get all users
+    users, err := o.fetchUsers(ctx)
+    if err != nil {
+        return userProjectsCache{}, fmt.Errorf("failed to fetch openstack users: %w", err)
+    }
+
+    // Get all user IDs
+    userIDs := make([]string, len(users))
+    usersMap := make(map[string]User, len(users))
+
+    for iuser, user := range users {
+        userIDs[iuser] = user.ID
+        usersMap[user.ID] = user
+    }
+
+    // Split userIDs into chunks of chunkSize so that we make at most
+    // chunkSize concurrent requests at a time to fetch the projects
+    // of each user
+    userIDChunks := helper.ChunkBy[string](userIDs, chunkSize)
+
+    // Get user projects
+    userProjects := make(map[string][]Project, len(userIDs))
+
+    var allErrs error
+
+    for _, userIDs := range userIDChunks {
+        wg := sync.WaitGroup{}
+        wg.Add(len(userIDs))
+
+        for _, userID := range userIDs {
+            go func(id string) {
+                defer wg.Done()
+
+                projects, err := o.fetchUserProjects(ctx, id)
+
+                projectLock.Lock()
+                userProjects[id] = projects
+                allErrs = errors.Join(allErrs, err)
+                projectLock.Unlock()
+            }(userID)
+        }
+
+        // Wait for all goroutines before moving to next chunk
+        wg.Wait()
+    }
+
+    if len(userProjects) == 0 {
+        return userProjectsCache{}, allErrs
+    }
+
+    if len(userProjects) < len(userIDs) {
+        level.Warn(o.logger).Log("msg", "Failed to get projects of some users", "id", o.cluster.ID, "total_users", len(userIDs), "failed_user_project_requests", len(userIDs)-len(userProjects))
+    }
+
+    projectUsersList := make(map[string][]string)
+    userProjectsList := make(map[string][]string)
+    userIDNameMap := make(map[string]string)
+    projectIDNameMap := make(map[string]string)
+
+    var projectIDs []string
+
+    for userID, projects := range userProjects {
+        for _, project := range projects {
+            userProjectsList[userID] = append(userProjectsList[userID], project.Name)
+            projectUsersList[project.ID] = append(projectUsersList[project.ID], usersMap[userID].Name)
+            projectIDs = append(projectIDs, project.ID)
+            userIDNameMap[userID] = usersMap[userID].Name
+            projectIDNameMap[project.ID] = project.Name
+        }
+    }
+
+    // Sort and compact projects
+    slices.Sort(projectIDs)
+    projectIDs = slices.Compact(projectIDs)
+
+    // Transform map into slice of projects
+    projectModels := make([]models.Project, len(projectIDs))
+
+    for iproject, projectID := range projectIDs {
+        projectUsers := projectUsersList[projectID]
+
+        // Sort users
+        slices.Sort(projectUsers)
+
+        var usersList models.List
+        for _, u := range slices.Compact(projectUsers) {
+            usersList = append(usersList, u)
+        }
+
+        // Make Association
+        projectModels[iproject] = models.Project{
+            UID:           projectID,
+            Name:          projectIDNameMap[projectID],
+            Users:         usersList,
+            LastUpdatedAt: currentTime,
+        }
+    }
+
+    // Transform map into slice of users
+    userModels := make([]models.User, len(userIDs))
+
+    for iuser, userID := range userIDs {
+        userProjects := userProjectsList[userID]
+
+        // Sort projects
+        slices.Sort(userProjects)
+
+        var projectsList models.List
+        for _, p := range slices.Compact(userProjects) {
+            projectsList = append(projectsList, p)
+        }
+
+        // Make Association
+        userModels[iuser] = models.User{
+            UID:           userID,
+            Name:          userIDNameMap[userID],
+            Projects:      projectsList,
+            LastUpdatedAt: currentTime,
+        }
+    }
+
+    level.Info(o.logger).
+ Log("msg", "Openstack user data fetched", + "cluster_id", o.cluster.ID, "num_users", len(userModels), "num_projects", len(projectModels)) + + return userProjectsCache{userModels, projectModels, userIDNameMap, projectIDNameMap}, nil +} diff --git a/pkg/api/resource/openstack/manager.go b/pkg/api/resource/openstack/manager.go new file mode 100644 index 00000000..06a62e16 --- /dev/null +++ b/pkg/api/resource/openstack/manager.go @@ -0,0 +1,216 @@ +// Package openstack implements the fetcher interface to fetch instances from Openstack +// resource manager +package openstack + +import ( + "context" + "errors" + "fmt" + "net" + "net/http" + "net/url" + "slices" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/ceems/pkg/api/base" + "github.com/mahendrapaipuri/ceems/pkg/api/models" + "github.com/mahendrapaipuri/ceems/pkg/api/resource" + config_util "github.com/prometheus/common/config" +) + +var novaMicroVersionHeaders = []string{ + "X-OpenStack-Nova-API-Version", + "OpenStack-API-Version", +} + +var osTimeFormat = base.DatetimeLayout + "-0700" + +type userProjectsCache struct { + userModels []models.User + projectModels []models.Project + userIDNameMap map[string]string + projectIDNameMap map[string]string +} + +// openstackManager is the struct containing the configuration of a given openstack cluster. +type openstackManager struct { + logger log.Logger + cluster models.Cluster + apiURLs map[string]*url.URL + client *http.Client + userProjectsCache userProjectsCache + userProjectsCacheTTL time.Duration + userProjectsLastUpdateTime time.Time +} + +type apiConfig struct { + ComputeAPIURL string `yaml:"compute_api_url"` + IdentityAPIURL string `yaml:"identity_api_url"` +} + +const openstackVMManager = "openstack" + +func init() { + // Register openstack VM manager + resource.Register(openstackVMManager, New) +} + +// New returns a new openstackManager that returns compute instances. 
+func New(cluster models.Cluster, logger log.Logger) (resource.Fetcher, error) { + // Make openstackManager configs from clusters + openstackManager := openstackManager{ + logger: logger, + cluster: cluster, + apiURLs: make(map[string]*url.URL, 2), + userProjectsCacheTTL: 12 * time.Hour, + } + + var err error + // Check if HTTPClientConfig has Nova Micro version header + headerFound := false + + if cluster.Web.HTTPClientConfig.HTTPHeaders != nil { + for header := range cluster.Web.HTTPClientConfig.HTTPHeaders.Headers { + if slices.Contains(novaMicroVersionHeaders, header) { + headerFound = true + + break + } + } + } else { + cluster.Web.HTTPClientConfig.HTTPHeaders = &config_util.Headers{ + Headers: make(map[string]config_util.Header), + } + } + + // If no Nova Micro Version header found, inject one + if !headerFound { + cluster.Web.HTTPClientConfig.HTTPHeaders.Headers[novaMicroVersionHeaders[0]] = config_util.Header{ + Values: []string{"latest"}, + } + } + + // Make a HTTP client for Openstack from client config + if openstackManager.client, err = config_util.NewClientFromConfig(cluster.Web.HTTPClientConfig, "openstack"); err != nil { + level.Error(logger).Log("msg", "Failed to create HTTP client for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, err + } + + // Fetch compute and identity API URLs from extra_config + apiConfig := &apiConfig{} + if err := cluster.Extra.Decode(apiConfig); err != nil { + level.Error(logger).Log("msg", "Failed to decode extra_config for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, err + } + + // Ensure we have valid compute and identity API URLs + // Unwrap original error to avoid leaking sensitive passwords in output + openstackManager.apiURLs["compute"], err = url.Parse(apiConfig.ComputeAPIURL) + if err != nil { + level.Error(logger).Log("msg", "Failed to parse compute service API URL for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, errors.Unwrap(err) + } + + openstackManager.apiURLs["identity"], err = url.Parse(apiConfig.IdentityAPIURL) + if err != nil { + level.Error(logger).Log("msg", "Failed to parse identity service API URL for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, errors.Unwrap(err) + } + + // // Get initial list of flavors + // if err = openstackManager.updateFlavors(context.Background()); err != nil { + // return nil, err + // } + + // Get initial users and projects + if err = openstackManager.updateUsersProjects(context.Background(), time.Now()); err != nil { + level.Error(logger).Log("msg", "Failed to update users and projects for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, err + } + + level.Info(logger).Log("msg", "Fetching VM instances from Openstack cluster", "id", cluster.ID) + + return &openstackManager, nil +} + +// FetchUnits fetches instances from openstack. +func (o *openstackManager) FetchUnits( + ctx context.Context, + start time.Time, + end time.Time, +) ([]models.ClusterUnits, error) { + // Fetch all instances + instances, err := o.activeInstances(ctx, start, end) + if err != nil { + return nil, err + } + + return []models.ClusterUnits{{Cluster: o.cluster, Units: instances}}, nil +} + +// FetchUsersProjects fetches current Openstack users and projects. +func (o *openstackManager) FetchUsersProjects( + ctx context.Context, + current time.Time, +) ([]models.ClusterUsers, []models.ClusterProjects, error) { + // Update user and project data only when cache has expired. 
+ // We need to make an API request for each user to fetch projects of that user + // Doing this at each update interval is very resource consuming, so we cache + // the data for TTL period and update them whenever cache has expired. + if time.Since(o.userProjectsLastUpdateTime) > o.userProjectsCacheTTL { + level.Debug(o.logger).Log("msg", "Updating users and projects for Openstack cluster", "id", o.cluster.ID) + + if err := o.updateUsersProjects(ctx, current); err != nil { + level.Error(o.logger).Log("msg", "Failed to update users and projects data for Openstack cluster", "id", o.cluster.ID, "err", err) + } + } + + return []models.ClusterUsers{ + {Cluster: o.cluster, Users: o.userProjectsCache.userModels}, + }, []models.ClusterProjects{ + {Cluster: o.cluster, Projects: o.userProjectsCache.projectModels}, + }, nil +} + +// servers endpoint. +func (o *openstackManager) servers() *url.URL { + return o.apiURLs["compute"].JoinPath("/servers/detail") +} + +// // flavors endpoint. +// func (o *openstackManager) flavors() *url.URL { +// return o.apiURLs["compute"].JoinPath("/flavors/detail") +// } + +// users endpoint. +func (o *openstackManager) users() *url.URL { + return o.apiURLs["identity"].JoinPath("/v3/users") +} + +// user details endpoint. +func (o *openstackManager) userProjects(id string) *url.URL { + return o.apiURLs["identity"].JoinPath(fmt.Sprintf("/v3/users/%s/projects", id)) +} + +// ping attempts to ping Openstack compute and identity API servers. +func (o *openstackManager) ping(service string) error { + if url, ok := o.apiURLs[service]; ok { + var d net.Dialer + + conn, err := d.Dial("tcp", url.Host) + if err != nil { + return fmt.Errorf("openstack service %s is unreachable: %w", service, err) + } + + defer conn.Close() + } + + return nil +} diff --git a/pkg/api/resource/openstack/manager_test.go b/pkg/api/resource/openstack/manager_test.go new file mode 100644 index 00000000..f93e80b4 --- /dev/null +++ b/pkg/api/resource/openstack/manager_test.go @@ -0,0 +1,372 @@ +package openstack + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/mahendrapaipuri/ceems/pkg/api/models" + config_util "github.com/prometheus/common/config" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v3" +) + +var ( + start, _ = time.Parse(osTimeFormat, "2024-10-15T15:00:00+0200") + end, _ = time.Parse(osTimeFormat, "2024-10-15T15:15:00+0200") + current, _ = time.Parse(osTimeFormat, "2024-10-15T15:15:00+0200") + + expectedUnits = map[string]models.Unit{ + "d0d60434-4bf1-4eb1-9469-d7b38083a88f": { + ResourceManager: "openstack", + UUID: "d0d60434-4bf1-4eb1-9469-d7b38083a88f", + Name: "new-vgpu-3", + Project: "admin", + User: "admin", + CreatedAt: "2024-10-15T13:32:25+0200", + StartedAt: "2024-10-15T13:32:43+0200", + EndedAt: "2024-10-15T14:32:09+0200", + CreatedAtTS: 1728991945000, + StartedAtTS: 1728991963000, + EndedAtTS: 1728995529000, + Elapsed: "00:59:26", + State: "DELETED", + Allocation: models.Generic{ + "disk": 1, + "extra_specs": map[string]string{"hw_rng:allowed": "True", "resources:VGPU": "1"}, + "mem": 8192, + "name": "m10.vgpu", + "swap": 0, + "vcpus": 8, + }, + TotalTime: models.MetricMap{ + "alloc_cpumemtime": 0, + "alloc_cputime": 0, + "alloc_gpumemtime": 0, + "alloc_gputime": 0, + "walltime": 0, + }, + Tags: models.Generic{ + "az": "nova", + "hypervisor": "gpu-node-4", + "power_state": "NOSTATE", + "reservation_id": "r-rcywwpf9", + "metadata": 
map[string]string{}, + "tags": []string{}, + "server_groups": "", + }, + }, + "0687859c-b7b8-47ea-aa4c-74162f52fbfc": { + ResourceManager: "openstack", + UUID: "0687859c-b7b8-47ea-aa4c-74162f52fbfc", + Name: "newer-2", + Project: "admin", + User: "admin", + CreatedAt: "2024-10-15T14:29:18+0200", + StartedAt: "2024-10-15T14:29:34+0200", + EndedAt: "N/A", + CreatedAtTS: 1728995358000, + StartedAtTS: 1728995374000, + EndedAtTS: 0, + Elapsed: "00:45:26", + State: "ACTIVE", + Allocation: models.Generic{ + "disk": 1, + "extra_specs": map[string]string{"hw_rng:allowed": "True"}, + "mem": 256, + "name": "cirros256", + "swap": 0, + "vcpus": 1, + }, + TotalTime: models.MetricMap{ + "alloc_cpumemtime": 230400, + "alloc_cputime": 900, + "alloc_gpumemtime": 0, + "alloc_gputime": 0, + "walltime": 900, + }, + Tags: models.Generic{ + "az": "nova", + "hypervisor": "cpu-node-4", + "power_state": "RUNNING", + "reservation_id": "r-fius3pcg", + "metadata": map[string]string{}, + "tags": []string{}, + "server_groups": "", + }, + }, + "66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3": { + ResourceManager: "openstack", + UUID: "66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3", + Name: "tp-21", + Project: "test-project-2", + User: "test-user-2", + CreatedAt: "2024-10-15T13:16:44+0200", + StartedAt: "2024-10-15T13:16:55+0200", + EndedAt: "N/A", + CreatedAtTS: 1728991004000, + StartedAtTS: 1728991015000, + EndedAtTS: 0, + Elapsed: "01:58:05", + State: "ACTIVE", + Allocation: models.Generic{ + "disk": 1, + "extra_specs": map[string]string{"hw_rng:allowed": "True"}, + "mem": 192000, + "name": "m1.xl", + "swap": 0, + "vcpus": 128, + }, + TotalTime: models.MetricMap{ + "alloc_cpumemtime": 4.6848e+07, + "alloc_cputime": 31232, + "alloc_gpumemtime": 0, + "alloc_gputime": 0, + "walltime": 244, + }, + Tags: models.Generic{ + "az": "nova", + "hypervisor": "cpu-big-node-4", + "power_state": "RUNNING", + "reservation_id": "r-9ak0uvk9", + "metadata": map[string]string{}, + "tags": []string{}, + "server_groups": "", + }, + }, + } + expectedUsers = []models.User{ + {UID: "adbc53ea724f4e2bb954e27725b6cf5b", Name: "admin", Projects: models.List{"admin", "demo"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "03b060551ecc488b8756c9f27258d71e", Name: "test-user-1", Projects: models.List{"test-project-1", "test-project-2", "test-project-3"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "5fd1986befa042a4b866944f5adbefeb", Name: "test-user-2", Projects: models.List{"test-project-2", "test-project-3"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "4223638a14e44980bf8557cd3ba14e76", Name: "test-user-3", Projects: models.List{"test-project-3"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "dc87e591c0d247d5ac04e873bd8a1646", Name: "test-user-4", Projects: models.List{"test-project-4"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + } + expectedProjects = []models.Project{ + {UID: "066a633fd999424faa3409ab60221fbf", Name: "admin", Users: models.List{"admin"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "706f9e5f3e174feebcce4e7f08a7b7e3", Name: "test-project-2", Users: models.List{"test-user-1", "test-user-2"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "9d87d46f8af54da2adc3e7b94c9d3c30", Name: "demo", Users: models.List{"admin"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "b964a9e51c0046a4a84d3f83a135a97c", Name: "test-project-4", Users: models.List{"test-user-4"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "bdb137e6ee6d427a899ac22de5d76b8c", Name: "test-project-3", Users: models.List{"test-user-1", 
"test-user-2", "test-user-3"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + {UID: "cca105ea0cff426e96f096887b7f4b82", Name: "test-project-1", Users: models.List{"test-user-1"}, LastUpdatedAt: "2024-10-15T15:15:00+0200"}, + } +) + +func mockErrorServer() *httptest.Server { + // Start test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + w.Write([]byte("KO")) + })) + + return server +} + +func mockOSComputeAPIServer() *httptest.Server { + // Start test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.Contains(r.URL.Path, "servers") { + var fileName string + if _, ok := r.URL.Query()["deleted"]; ok { + fileName = "deleted" + } else { + fileName = "servers" + } + + if data, err := os.ReadFile(fmt.Sprintf("../../testdata/openstack/compute/%s.json", fileName)); err == nil { + w.Write(data) + + return + } + } else if strings.Contains(r.URL.Path, "flavors") { + if data, err := os.ReadFile("../../testdata/openstack/compute/flavors.json"); err == nil { + w.Write(data) + + return + } + } else { + w.Write([]byte("KO")) + } + })) + + return server +} + +func mockOSIdentityAPIServer() *httptest.Server { + // Start test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "users") { + if data, err := os.ReadFile("../../testdata/openstack/identity/users.json"); err == nil { + w.Write(data) + + return + } + } else if strings.Contains(r.URL.Path, "users") { + pathParts := strings.Split(r.URL.Path, "/") + + userID := pathParts[len(pathParts)-2] + if data, err := os.ReadFile(fmt.Sprintf("../../testdata/openstack/identity/%s.json", userID)); err == nil { + w.Write(data) + + return + } + } else { + w.Write([]byte("KO")) + } + })) + + return server +} + +func mockConfig(computeAPIURL, identityAPIURL string) (yaml.Node, error) { + config := ` +--- +compute_api_url: %s +identity_api_url: %s` + + cfg := fmt.Sprintf(config, computeAPIURL, identityAPIURL) + + var extraConfig yaml.Node + + if err := yaml.Unmarshal([]byte(cfg), &extraConfig); err == nil { + return extraConfig, nil + } else { + return yaml.Node{}, err + } +} + +func TestOpenstackFetcher(t *testing.T) { + // Setup mock API servers + computeAPIServer := mockOSComputeAPIServer() + defer computeAPIServer.Close() + + identityAPIServer := mockOSIdentityAPIServer() + defer identityAPIServer.Close() + + extraConfig, err := mockConfig(computeAPIServer.URL, identityAPIServer.URL) + require.NoError(t, err) + + // mock config + clusters := []models.Cluster{ + { + ID: "os-0", + Manager: "openstack", + Extra: extraConfig, + }, + { + ID: "os-1", + Manager: "openstack", + Extra: extraConfig, + }, + } + + ctx := context.Background() + + for _, cluster := range clusters { + os, err := New(cluster, log.NewNopLogger()) + require.NoError(t, err) + + units, err := os.FetchUnits(ctx, start, end) + require.NoError(t, err) + assert.Len(t, units[0].Units, 18) + + for uuid, expectedUnit := range expectedUnits { + for _, gotUnit := range units[0].Units { + if uuid == gotUnit.UUID { + assert.Equal(t, expectedUnit, gotUnit, "Unit %s", uuid) + + break + } + } + } + + users, projects, err := os.FetchUsersProjects(ctx, current) + require.NoError(t, err) + + // Use expected LastUpdatedAt + for i := range len(users[0].Users) { + users[0].Users[i].LastUpdatedAt = expectedUsers[0].LastUpdatedAt + } + + for i := range 
len(projects[0].Projects) { + projects[0].Projects[i].LastUpdatedAt = expectedProjects[0].LastUpdatedAt + } + + assert.EqualValues(t, expectedUsers, users[0].Users) + assert.EqualValues(t, expectedProjects, projects[0].Projects) + } +} + +func TestOpenstackFetcherFail(t *testing.T) { + // Setup mock API servers + computeAPIServer := mockOSComputeAPIServer() + + identityAPIServer := mockOSIdentityAPIServer() + + extraConfig, err := mockConfig(computeAPIServer.URL, identityAPIServer.URL) + require.NoError(t, err) + + // mock config + cluster := models.Cluster{ + ID: "os-0", + Manager: "openstack", + Extra: extraConfig, + } + + ctx := context.Background() + os, err := New(cluster, log.NewNopLogger()) + require.NoError(t, err) + + // Stop test servers to simulate when OS services are offline + computeAPIServer.Close() + identityAPIServer.Close() + + _, err = os.FetchUnits(ctx, start, end) + require.Error(t, err) + + // Here we should not get an error as it will return cached data + // that we created during struct instantiation + _, _, err = os.FetchUsersProjects(ctx, current) + require.NoError(t, err) +} + +func TestOpenstackServiceError(t *testing.T) { + errorServer := mockErrorServer() + defer errorServer.Close() + + identityAPIServer := mockOSIdentityAPIServer() + defer identityAPIServer.Close() + + extraConfig, err := mockConfig(errorServer.URL, identityAPIServer.URL) + require.NoError(t, err) + + // mock config + cluster := models.Cluster{ + ID: "os-0", + Manager: "openstack", + Extra: extraConfig, + } + + // Add header + cluster.Web.HTTPClientConfig.HTTPHeaders = &config_util.Headers{ + Headers: make(map[string]config_util.Header), + } + cluster.Web.HTTPClientConfig.HTTPHeaders.Headers[novaMicroVersionHeaders[0]] = config_util.Header{ + Values: []string{"latest"}, + } + + ctx := context.Background() + os, err := New(cluster, log.NewNopLogger()) + require.NoError(t, err) + + // Attempt to fetch instances and we should get an error + _, err = os.FetchUnits(ctx, time.Now(), time.Now()) + require.Error(t, err) +} diff --git a/pkg/api/resource/openstack/request.go b/pkg/api/resource/openstack/request.go new file mode 100644 index 00000000..5db45b4e --- /dev/null +++ b/pkg/api/resource/openstack/request.go @@ -0,0 +1,40 @@ +package openstack + +import ( + "encoding/json" + "fmt" + "io" + "net/http" +) + +// apiRequest makes the request using client and returns response. 
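+// The type parameter selects the response envelope to decode into, e.g.
+// resp, err := apiRequest[ServersResponse](req, o.client), as used in
+// fetchInstances in compute.go.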
+func apiRequest[T any](req *http.Request, client *http.Client) (T, error) {
+    // Add necessary headers
+    req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
+
+    // Make request
+    resp, err := client.Do(req)
+    if err != nil {
+        return *new(T), err
+    }
+    defer resp.Body.Close()
+
+    // Check status code
+    if resp.StatusCode != http.StatusOK {
+        return *new(T), fmt.Errorf("request failed with status: %d", resp.StatusCode)
+    }
+
+    // Read response body
+    body, err := io.ReadAll(resp.Body)
+    if err != nil {
+        return *new(T), err
+    }
+
+    // Unpack into data
+    var data T
+    if err = json.Unmarshal(body, &data); err != nil {
+        return *new(T), err
+    }
+
+    return data, nil
+}
diff --git a/pkg/api/resource/openstack/types.go b/pkg/api/resource/openstack/types.go
new file mode 100644
index 00000000..1c2c1a26
--- /dev/null
+++ b/pkg/api/resource/openstack/types.go
@@ -0,0 +1,370 @@
+package openstack
+
+import (
+    "encoding/json"
+    "os"
+    "strconv"
+    "time"
+)
+
+func init() {
+    // If we are in CI env, use fixed time location
+    // for e2e tests
+    if os.Getenv("CI") != "" {
+        currentLocation, _ = time.LoadLocation("CET")
+    } else {
+        currentLocation = time.Now().Location()
+    }
+}
+
+const RFC3339MilliNoZ = "2006-01-02T15:04:05.999999"
+
+var currentLocation *time.Location
+
+type JSONRFC3339MilliNoZ time.Time
+
+func (jt *JSONRFC3339MilliNoZ) UnmarshalJSON(data []byte) error {
+    var s string
+    if err := json.Unmarshal(data, &s); err != nil {
+        return err
+    }
+
+    if s == "" {
+        return nil
+    }
+
+    t, err := time.Parse(RFC3339MilliNoZ, s)
+    if err != nil {
+        return err
+    }
+
+    // Convert the UTC time to local
+    *jt = JSONRFC3339MilliNoZ(
+        time.Date(
+            t.Year(),
+            t.Month(),
+            t.Day(),
+            t.Hour(),
+            t.Minute(),
+            t.Second(),
+            t.Nanosecond(),
+            currentLocation,
+        ),
+    )
+
+    return nil
+}
+
+// Server represents a server/instance in the OpenStack cloud.
+type Server struct {
+    // ID uniquely identifies this server amongst all other servers,
+    // including those not accessible to the current tenant.
+    ID string `json:"id"`
+
+    // TenantID identifies the tenant owning this server resource.
+    TenantID string `json:"tenant_id"`
+
+    // UserID uniquely identifies the user account owning the tenant.
+    UserID string `json:"user_id"`
+
+    // Name contains the human-readable name for the server.
+    Name string `json:"name"`
+
+    // Updated and Created contain ISO-8601 timestamps of when the state of the
+    // server last changed, and when it was created.
+    UpdatedAt time.Time `json:"updated"`
+    CreatedAt time.Time `json:"created"`
+
+    // HostID is the host where the server is located in the cloud.
+    HostID string `json:"hostid"`
+
+    // Status contains the current operational status of the server,
+    // such as IN_PROGRESS or ACTIVE.
+    Status string `json:"status"`
+
+    // Flavor refers to a JSON object, which itself indicates the hardware
+    // configuration of the deployed server.
+    Flavor Flavor `json:"flavor"`
+
+    // Metadata includes a list of all user-specified key-value pairs attached
+    // to the server.
+    Metadata map[string]string `json:"metadata"`
+
+    // AttachedVolumes includes the volume attachments of this instance
+    AttachedVolumes []AttachedVolume `json:"os-extended-volumes:volumes_attached"`
+
+    // Fault contains failure information about a server.
+    Fault Fault `json:"fault"`
+
+    // Tags is a slice/list of string tags in a server.
+    // This requires microversion 2.26 or later.
+ Tags []string `json:"tags"` + + // ServerGroups is a slice of strings containing the UUIDs of the + // server groups to which the server belongs. Currently this can + // contain at most one entry. + // New in microversion 2.71 + ServerGroups []string `json:"server_groups"` + + // Host is the host/hypervisor that the instance is hosted on. + Host string `json:"OS-EXT-SRV-ATTR:host"` + + // InstanceName is the name of the instance. + InstanceName string `json:"OS-EXT-SRV-ATTR:instance_name"` + + // HypervisorHostname is the hostname of the host/hypervisor that the + // instance is hosted on. + HypervisorHostname string `json:"OS-EXT-SRV-ATTR:hypervisor_hostname"` + + // ReservationID is the reservation ID of the instance. + // This requires microversion 2.3 or later. + ReservationID string `json:"OS-EXT-SRV-ATTR:reservation_id"` + + // LaunchIndex is the launch index of the instance. + // This requires microversion 2.3 or later. + LaunchIndex int `json:"OS-EXT-SRV-ATTR:launch_index"` + + TaskState string `json:"OS-EXT-STS:task_state"` + VMState string `json:"OS-EXT-STS:vm_state"` + PowerState PowerState `json:"OS-EXT-STS:power_state"` + + LaunchedAt time.Time `json:"-"` + TerminatedAt time.Time `json:"-"` + + // AvailabilityZone is the availability zone the server is in. + AvailabilityZone string `json:"OS-EXT-AZ:availability_zone"` +} + +func (r *Server) UnmarshalJSON(b []byte) error { + type tmp Server + + var s struct { + tmp + LaunchedAt JSONRFC3339MilliNoZ `json:"OS-SRV-USG:launched_at"` + TerminatedAt JSONRFC3339MilliNoZ `json:"OS-SRV-USG:terminated_at"` + } + + err := json.Unmarshal(b, &s) + if err != nil { + return err + } + + *r = Server(s.tmp) + + r.LaunchedAt = time.Time(s.LaunchedAt) + r.TerminatedAt = time.Time(s.TerminatedAt) + + // Convert CreatedAt and UpdatedAt to local times + // Seems like returned values are always in UTC + r.CreatedAt = time.Date( + r.CreatedAt.Year(), + r.CreatedAt.Month(), + r.CreatedAt.Day(), + r.CreatedAt.Hour(), + r.CreatedAt.Minute(), + r.CreatedAt.Second(), + r.CreatedAt.Nanosecond(), + currentLocation, + ) + r.UpdatedAt = time.Date( + r.UpdatedAt.Year(), + r.UpdatedAt.Month(), + r.UpdatedAt.Day(), + r.UpdatedAt.Hour(), + r.UpdatedAt.Minute(), + r.UpdatedAt.Second(), + r.UpdatedAt.Nanosecond(), + currentLocation, + ) + + return err +} + +type AttachedVolume struct { + ID string `json:"id"` +} + +type Fault struct { + Code int `json:"code"` + Created time.Time `json:"created"` + Details string `json:"details"` + Message string `json:"message"` +} + +type PowerState int + +const ( + NOSTATE = iota + RUNNING + _UNUSED1 //nolint:stylecheck + PAUSED + SHUTDOWN + _UNUSED2 //nolint:stylecheck + CRASHED + SUSPENDED +) + +func (r PowerState) String() string { + switch r { + case NOSTATE: + return "NOSTATE" + case RUNNING: + return "RUNNING" + case PAUSED: + return "PAUSED" + case SHUTDOWN: + return "SHUTDOWN" + case CRASHED: + return "CRASHED" + case SUSPENDED: + return "SUSPENDED" + case _UNUSED1, _UNUSED2: + return "_UNUSED" + default: + return "N/A" + } +} + +type ServersResponse struct { + Servers []Server `json:"servers"` +} + +// Flavor represent (virtual) hardware configurations for server resources +// in a region. +type Flavor struct { + // ID is the flavor's unique ID. + ID string `json:"id"` + + // Disk is the amount of root disk, measured in GB. + Disk int `json:"disk"` + + // RAM is the amount of memory, measured in MB. + RAM int `json:"ram"` + + // Name is the name of the flavor. 
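+    // Nova embeds the full flavor (rather than a link) under "original_name"
+    // in server details from microversion 2.47 onwards, which is why the
+    // manager requests the "latest" microversion.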
+    Name string `json:"original_name"`
+
+    // RxTxFactor describes bandwidth alterations of the flavor.
+    RxTxFactor float64 `json:"rxtx_factor"`
+
+    // Swap is the amount of swap space, measured in MB.
+    Swap int `json:"-"`
+
+    // VCPUs indicates how many (virtual) CPUs are available for this flavor.
+    VCPUs int `json:"vcpus"`
+
+    // IsPublic indicates whether the flavor is public.
+    IsPublic bool `json:"os-flavor-access:is_public"`
+
+    // Ephemeral is the amount of ephemeral disk space, measured in GB.
+    Ephemeral int `json:"OS-FLV-EXT-DATA:ephemeral"`
+
+    // Description is a free form description of the flavor. Limited to
+    // 65535 characters in length. Only printable characters are allowed.
+    // New in version 2.55
+    Description string `json:"description"`
+
+    // ExtraSpecs is a dictionary of the flavor's extra-specs key-and-value
+    // pairs. This will only be included if the user is allowed by policy to
+    // index flavor extra_specs
+    // New in version 2.61
+    ExtraSpecs map[string]string `json:"extra_specs"`
+}
+
+func (r *Flavor) UnmarshalJSON(b []byte) error {
+    type tmp Flavor
+
+    var s struct {
+        tmp
+        Swap any `json:"swap"`
+    }
+
+    err := json.Unmarshal(b, &s)
+    if err != nil {
+        return err
+    }
+
+    *r = Flavor(s.tmp)
+
+    switch t := s.Swap.(type) {
+    case float64:
+        r.Swap = int(t)
+    case string:
+        switch t {
+        case "":
+            r.Swap = 0
+        default:
+            swap, err := strconv.ParseFloat(t, 64)
+            if err != nil {
+                return err
+            }
+
+            r.Swap = int(swap)
+        }
+    }
+
+    return nil
+}
+
+type FlavorsResponse struct {
+    Flavors []Flavor `json:"flavors"`
+}
+
+// User represents a User in the OpenStack Identity Service.
+type User struct {
+    // DefaultProjectID is the ID of the default project of the user.
+    DefaultProjectID string `json:"default_project_id"`
+
+    // Description is the description of the user.
+    Description string `json:"description"`
+
+    // DomainID is the domain ID the user belongs to.
+    DomainID string `json:"domain_id"`
+
+    // Enabled is whether or not the user is enabled.
+    Enabled bool `json:"enabled"`
+
+    // ID is the unique ID of the user.
+    ID string `json:"id"`
+
+    // Links contains referencing links to the user.
+    Links map[string]any `json:"links"`
+
+    // Name is the name of the user.
+    Name string `json:"name"`
+}
+
+type UsersResponse struct {
+    Users []User `json:"users"`
+}
+
+// Project represents an OpenStack Identity Project.
+type Project struct {
+    // IsDomain indicates whether the project is a domain.
+    IsDomain bool `json:"is_domain"`
+
+    // Description is the description of the project.
+    Description string `json:"description"`
+
+    // DomainID is the domain ID the project belongs to.
+    DomainID string `json:"domain_id"`
+
+    // Enabled is whether or not the project is enabled.
+    Enabled bool `json:"enabled"`
+
+    // ID is the unique ID of the project.
+    ID string `json:"id"`
+
+    // Name is the name of the project.
+    Name string `json:"name"`
+
+    // ParentID is the parent_id of the project.
+    ParentID string `json:"parent_id"`
+
+    // Tags is the list of tags associated with the project.
+ Tags []string `json:"tags,omitempty"` +} + +type ProjectsResponse struct { + Projects []Project `json:"projects"` +} diff --git a/pkg/api/resource/slurm/manager.go b/pkg/api/resource/slurm/manager.go index c7cbcc62..1cb88651 100644 --- a/pkg/api/resource/slurm/manager.go +++ b/pkg/api/resource/slurm/manager.go @@ -156,7 +156,7 @@ func (s *slurmScheduler) fetchFromSacct(ctx context.Context, start time.Time, en // Parse sacct output and create BatchJob structs slice jobs, numJobs := parseSacctCmdOutput(string(sacctOutput), start, end) level.Info(s.logger). - Log("msg", "SLURM jobs fetched", "cluster_id", s.cluster.ID, "start", startTime, "end", endTime, "njobs", numJobs) + Log("msg", "SLURM jobs fetched", "cluster_id", s.cluster.ID, "start", start, "end", end, "num_jobs", numJobs) return jobs, nil } diff --git a/pkg/api/testdata/config.yml b/pkg/api/testdata/config.yml index 817907b3..2b95994a 100644 --- a/pkg/api/testdata/config.yml +++ b/pkg/api/testdata/config.yml @@ -27,6 +27,39 @@ clusters: cli: path: pkg/api/testdata + - id: os-0 + manager: openstack + updaters: + - tsdb-0 + web: + http_headers: + X-Auth-Token: + secrets: + - supersecrettoken + X-OpenStack-Nova-API-Version: + values: + - latest + extra_config: + compute_api_url: http://localhost:8080/v2.1 + identity_api_url: http://localhost:7070 + + - id: os-1 + manager: openstack + updaters: + - tsdb-0 + - tsdb-1 + web: + http_headers: + X-Auth-Token: + secrets: + - supersecrettoken + X-OpenStack-Nova-API-Version: + values: + - latest + extra_config: + compute_api_url: http://localhost:8080/v2.1 + identity_api_url: http://localhost:7070 + updaters: - id: tsdb-0 updater: tsdb diff --git a/pkg/api/testdata/openstack/compute/deleted.json b/pkg/api/testdata/openstack/compute/deleted.json new file mode 100644 index 00000000..2b17d1b8 --- /dev/null +++ b/pkg/api/testdata/openstack/compute/deleted.json @@ -0,0 +1,316 @@ +{ + "servers": [ + { + "id": "16af784a-4fa1-429e-953f-4ef5dc462960", + "name": "new-2", + "status": "DELETED", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 192, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.micro", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:32:25Z", + "updated": "2024-10-15T14:21:29Z", + "addresses": {}, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/16af784a-4fa1-429e-953f-4ef5dc462960" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/16af784a-4fa1-429e-953f-4ef5dc462960" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:32:45.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:21:29.000000", + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000c", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-rcywwpf9", + "OS-EXT-SRV-ATTR:launch_index": 1, + "OS-EXT-SRV-ATTR:hostname": "new-2", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "deleted", + "OS-EXT-STS:power_state": 0, + 
"os-extended-volumes:volumes_attached": [], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "" + }, + { + "id": "d0d60434-4bf1-4eb1-9469-d7b38083a88f", + "name": "new-vgpu-3", + "status": "DELETED", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 8, + "ram": 8192, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m10.vgpu", + "extra_specs": { + "hw_rng:allowed": "True", + "resources:VGPU": "1" + } + }, + "created": "2024-10-15T13:32:25Z", + "updated": "2024-10-15T14:32:10Z", + "addresses": {}, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/d0d60434-4bf1-4eb1-9469-d7b38083a88f" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/d0d60434-4bf1-4eb1-9469-d7b38083a88f" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:32:43.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:32:09.000000", + "OS-EXT-SRV-ATTR:host": "gpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000d", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "gpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-rcywwpf9", + "OS-EXT-SRV-ATTR:launch_index": 2, + "OS-EXT-SRV-ATTR:hostname": "new-vgpu-3", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "deleted", + "OS-EXT-STS:power_state": 0, + "os-extended-volumes:volumes_attached": [], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "" + }, + { + "id": "d8af1245-4639-4981-95db-ae097021401d", + "name": "tp-4-1", + "status": "DELETED", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "4223638a14e44980bf8557cd3ba14e76", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:17:53Z", + "updated": "2024-10-15T14:21:57Z", + "addresses": {}, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/d8af1245-4639-4981-95db-ae097021401d" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/d8af1245-4639-4981-95db-ae097021401d" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:18:08.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:21:57.000000", + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000008", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-63s45rdp", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-4-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + 
"OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "deleted", + "OS-EXT-STS:power_state": 0, + "os-extended-volumes:volumes_attached": [], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "" + }, + { + "id": "6330d17c-6fe8-419c-a044-58e590480e18", + "name": "tp-4-11", + "status": "DELETED", + "tenant_id": "b964a9e51c0046a4a84d3f83a135a97c", + "user_id": "dc87e591c0d247d5ac04e873bd8a1646", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:17:53Z", + "updated": "2024-10-15T14:21:57Z", + "addresses": {}, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/6330d17c-6fe8-419c-a044-58e590480e18" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/6330d17c-6fe8-419c-a044-58e590480e18" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:18:08.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:21:57.000000", + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000008", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-63s45rdp", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-4-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "deleted", + "OS-EXT-STS:power_state": 0, + "os-extended-volumes:volumes_attached": [], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "" + }, + { + "id": "38c14d24-2e8a-4cb6-ad5b-851612e800ab", + "name": "tp-4-12", + "status": "DELETED", + "tenant_id": "b964a9e51c0046a4a84d3f83a135a97c", + "user_id": "dc87e591c0d247d5ac04e873bd8a1646", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:17:53Z", + "updated": "2024-10-15T14:21:57Z", + "addresses": {}, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/38c14d24-2e8a-4cb6-ad5b-851612e800ab" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/38c14d24-2e8a-4cb6-ad5b-851612e800ab" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:18:08.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:21:57.000000", + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000008", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-63s45rdp", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-4-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + 
"OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "deleted", + "OS-EXT-STS:power_state": 0, + "os-extended-volumes:volumes_attached": [], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "" + } + ] +} + diff --git a/pkg/api/testdata/openstack/compute/flavors.json b/pkg/api/testdata/openstack/compute/flavors.json new file mode 100644 index 00000000..fa3871fb --- /dev/null +++ b/pkg/api/testdata/openstack/compute/flavors.json @@ -0,0 +1,316 @@ +{ + "flavors": [ + { + "id": "1", + "name": "m1.tiny", + "ram": 512, + "disk": 1, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/1" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/1" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "2", + "name": "m1.small", + "ram": 2048, + "disk": 20, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/2" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/2" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "3", + "name": "m1.medium", + "ram": 4096, + "disk": 40, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 2, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/3" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/3" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "4", + "name": "m1.large", + "ram": 8192, + "disk": 80, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 4, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/4" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/4" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "42", + "name": "m1.nano", + "ram": 128, + "disk": 1, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/42" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/42" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "5", + "name": "m1.xlarge", + "ram": 16384, + "disk": 160, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 8, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/5" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/5" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "84", + "name": 
"m1.micro", + "ram": 192, + "disk": 1, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/84" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/84" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "c1", + "name": "cirros256", + "ram": 256, + "disk": 1, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/c1" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/c1" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "d1", + "name": "ds512M", + "ram": 512, + "disk": 5, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/d1" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/d1" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "d2", + "name": "ds1G", + "ram": 1024, + "disk": 10, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 1, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/d2" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/d2" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "d3", + "name": "ds2G", + "ram": 2048, + "disk": 10, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 2, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/d3" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/d3" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + { + "id": "d4", + "name": "ds4G", + "ram": 4096, + "disk": 20, + "swap": 0, + "OS-FLV-EXT-DATA:ephemeral": 0, + "OS-FLV-DISABLED:disabled": false, + "vcpus": 4, + "os-flavor-access:is_public": true, + "rxtx_factor": 1.0, + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/flavors/d4" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/flavors/d4" + } + ], + "description": null, + "extra_specs": { + "hw_rng:allowed": "True" + } + } + ] +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/compute/servers.json b/pkg/api/testdata/openstack/compute/servers.json new file mode 100644 index 00000000..2bf8f428 --- /dev/null +++ b/pkg/api/testdata/openstack/compute/servers.json @@ -0,0 +1,1099 @@ +{ + "servers": [ + { + "id": "0687859c-b7b8-47ea-aa4c-74162f52fbfc", + "name": "newer-2", + "status": "ACTIVE", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": 
"cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T14:29:18Z", + "updated": "2024-10-15T14:29:34Z", + "addresses": { + "public": [ + { + "version": 4, + "addr": "172.24.4.50", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:94:07:bf" + }, + { + "version": 6, + "addr": "2001:db8::108", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:94:07:bf" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/0687859c-b7b8-47ea-aa4c-74162f52fbfc" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/0687859c-b7b8-47ea-aa4c-74162f52fbfc" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T14:29:34.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000010", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-fius3pcg", + "OS-EXT-SRV-ATTR:launch_index": 1, + "OS-EXT-SRV-ATTR:hostname": "newer-2", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "0dfe5359-4d0b-42f4-8795-7bdd53d3ddd6", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "c98235c9-d9c0-4b54-8782-0fd8d6312539", + "name": "newer-1", + "status": "ACTIVE", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T14:29:18Z", + "updated": "2024-10-15T14:29:32Z", + "addresses": { + "public": [ + { + "version": 4, + "addr": "172.24.4.162", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:0a:11:19" + }, + { + "version": 6, + "addr": "2001:db8::7b", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:0a:11:19" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/c98235c9-d9c0-4b54-8782-0fd8d6312539" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/c98235c9-d9c0-4b54-8782-0fd8d6312539" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T14:29:32.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000f", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-fius3pcg", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "newer-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + 
"OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "9885bd69-a061-4817-87cd-9471ffa2fc7e", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "1cef0381-0a5a-42e6-9e9b-3d88f84be971", + "name": "tp-gpu-5", + "status": "ACTIVE", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "03b060551ecc488b8756c9f27258d71e", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 8, + "ram": 8192, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m10.vgpu", + "extra_specs": { + "hw_rng:allowed": "True", + "resources:VGPU": "1" + } + }, + "created": "2024-10-15T14:28:38Z", + "updated": "2024-10-15T14:28:50Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.108", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:04:a3:9c" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/1cef0381-0a5a-42e6-9e9b-3d88f84be971" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/1cef0381-0a5a-42e6-9e9b-3d88f84be971" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T14:28:50.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "gpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000e", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "gpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-ct4kh3w1", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-gpu-5", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "d2fe6af9-de7d-4ec2-b36b-b9ba3f8abacb", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "3bc984e2-ff73-417c-b123-fdb365ddf241", + "name": "new-1", + "status": "ACTIVE", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 2, + "ram": 4192, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m3.medium", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:32:25Z", + "updated": "2024-10-15T13:32:41Z", + "addresses": { + "public": [ + { + "version": 4, + "addr": "172.24.4.48", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:9a:85:f1" + }, + { + "version": 6, + "addr": "2001:db8::205", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:9a:85:f1" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ 
+ { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/3bc984e2-ff73-417c-b123-fdb365ddf241" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/3bc984e2-ff73-417c-b123-fdb365ddf241" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:32:40.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000b", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-rcywwpf9", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "new-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "cbb86e08-615a-4e4e-a4b1-6d205b451e81", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "1c8ad46b-c4a5-42c5-81a0-194aa592f1e1", + "name": "tp-4-3", + "status": "SOFT_DELETED", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "4223638a14e44980bf8557cd3ba14e76", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 32, + "ram": 32156, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True", + "resources:NUMA": "NUMA:2" + } + }, + "created": "2024-10-15T13:17:53Z", + "updated": "2024-10-15T13:18:13Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.166", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:e9:8d:d8" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/1c8ad46b-c4a5-42c5-81a0-194aa592f1e1" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/1c8ad46b-c4a5-42c5-81a0-194aa592f1e1" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:18:13.000000", + "OS-SRV-USG:terminated_at": "2024-10-15T14:25:30.000000", + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-0000000a", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-63s45rdp", + "OS-EXT-SRV-ATTR:launch_index": 2, + "OS-EXT-SRV-ATTR:hostname": "tp-4-3", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "soft_deleted", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "777d623a-b805-48af-8aa2-3d3f28100da1", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "43c81538-d7ea-479f-b71f-934521a6f7bf", 
+ "name": "tp-4-2", + "status": "PAUSED", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "4223638a14e44980bf8557cd3ba14e76", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:17:53Z", + "updated": "2024-10-15T13:31:10Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.192", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:43:19:7b" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/43c81538-d7ea-479f-b71f-934521a6f7bf" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/43c81538-d7ea-479f-b71f-934521a6f7bf" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:18:10.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000009", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-63s45rdp", + "OS-EXT-SRV-ATTR:launch_index": 1, + "OS-EXT-SRV-ATTR:hostname": "tp-4-2", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "paused", + "OS-EXT-STS:power_state": 3, + "os-extended-volumes:volumes_attached": [ + { + "id": "474d0d8f-5915-4aed-8e4b-18a61eb2e5fc", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "242760b7-756f-4f13-a64f-e1a6e012f708", + "name": "tp-31", + "status": "ACTIVE", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "5fd1986befa042a4b866944f5adbefeb", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 512, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.tiny", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:17:03Z", + "updated": "2024-10-15T13:17:15Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.201", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:ac:87:e1" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/242760b7-756f-4f13-a64f-e1a6e012f708" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/242760b7-756f-4f13-a64f-e1a6e012f708" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:17:15.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000007", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-7oli7cmm", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-31", + 
"OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "b85865cd-bfd7-412e-8abf-6cf92f64ba3d", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3", + "name": "tp-21", + "status": "ACTIVE", + "tenant_id": "706f9e5f3e174feebcce4e7f08a7b7e3", + "user_id": "5fd1986befa042a4b866944f5adbefeb", + "metadata": {}, + "hostId": "85904f58bd5aafcee87e3ec6eb43409ee906957120f207c03f4bfb70", + "image": "", + "flavor": { + "vcpus": 128, + "ram": 192000, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.xl", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:16:44Z", + "updated": "2024-10-15T15:10:56Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.142", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:ef:ec:9b" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:16:55.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-big-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000006", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-big-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-9ak0uvk9", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-21", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "72765b41-5dfa-4c12-a004-95bd5fa7b5c6", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "b6eafae3-5c24-4f25-b297-5ef291d9487d", + "name": "tp-3", + "status": "SUSPENDED", + "tenant_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "user_id": "03b060551ecc488b8756c9f27258d71e", + "metadata": {}, + "hostId": "2113644da507f27b464f562cca634d3ad32bee8e2128555ee03c8e3e", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 128, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.nano", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:15:49Z", + "updated": "2024-10-15T13:31:21Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.53", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:4d:27:fd" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": 
"http://172.16.20.4/compute/v2.1/servers/b6eafae3-5c24-4f25-b297-5ef291d9487d" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/b6eafae3-5c24-4f25-b297-5ef291d9487d" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:16:00.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000005", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-ks8nrkb2", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-3", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "suspended", + "OS-EXT-STS:power_state": 4, + "os-extended-volumes:volumes_attached": [ + { + "id": "b44a0dc3-bdea-4076-8574-3a368e52ef54", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "1e3b7f2c-a648-41a8-b53e-4fa5bd2ae73c", + "name": "tp-2", + "status": "ACTIVE", + "tenant_id": "706f9e5f3e174feebcce4e7f08a7b7e3", + "user_id": "03b060551ecc488b8756c9f27258d71e", + "metadata": {}, + "hostId": "85904f58bd5aafcee87e3ec6eb43409ee906957120f207c03f4bfb70", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 256, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "cirros256", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:15:30Z", + "updated": "2024-10-15T13:15:42Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.254", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:97:7e:0e" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/1e3b7f2c-a648-41a8-b53e-4fa5bd2ae73c" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/1e3b7f2c-a648-41a8-b53e-4fa5bd2ae73c" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:15:42.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000004", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-tk530ak6", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-2", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "06100fc3-2651-4d82-96c5-6b09fdaec33e", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "7fe4fa04-e4ea-4b92-84f4-45c9e78b9520", + "name": "tp-1", + "status": "ACTIVE", + "tenant_id": "cca105ea0cff426e96f096887b7f4b82", + "user_id": 
"03b060551ecc488b8756c9f27258d71e", + "metadata": {}, + "hostId": "dfaff0e77b5f121149393b61974da06f1c6335973e2b62f73b729bf6", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 192, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.micro", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:14:59Z", + "updated": "2024-10-15T13:15:11Z", + "addresses": { + "shared": [ + { + "version": 4, + "addr": "192.168.233.113", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:b9:00:0d" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/7fe4fa04-e4ea-4b92-84f4-45c9e78b9520" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/7fe4fa04-e4ea-4b92-84f4-45c9e78b9520" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:15:11.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000003", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-ztao3fbf", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "tp-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "7578500c-0f7b-458c-ad2a-15aecfd4b8bc", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "83b7be64-daff-477c-8f3d-2ce880c44a53", + "name": "admin-instance-1", + "status": "SHELVED_OFFLOADED", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 128, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.nano", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:14:10Z", + "updated": "2024-10-15T13:31:56Z", + "addresses": { + "public": [ + { + "version": 4, + "addr": "172.24.4.82", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:4e:7c:68" + }, + { + "version": 6, + "addr": "2001:db8::1d7", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:4e:7c:68" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/83b7be64-daff-477c-8f3d-2ce880c44a53" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/83b7be64-daff-477c-8f3d-2ce880c44a53" + } + ], + "OS-DCF:diskConfig": "AUTO", + "OS-EXT-AZ:availability_zone": "", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:14:34.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": null, + "OS-EXT-SRV-ATTR:instance_name": "instance-00000001", + "OS-EXT-SRV-ATTR:hypervisor_hostname": null, + "OS-EXT-SRV-ATTR:reservation_id": "r-8z0c7fi4", + "OS-EXT-SRV-ATTR:launch_index": 0, + "OS-EXT-SRV-ATTR:hostname": "admin-instance-1", + "OS-EXT-SRV-ATTR:kernel_id": "", + 
"OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "shelved_offloaded", + "OS-EXT-STS:power_state": 4, + "os-extended-volumes:volumes_attached": [ + { + "id": "d8bd68e2-5d85-49a9-adc1-2abaa4e1e665", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "", + "security_groups": [ + { + "name": "default" + } + ] + }, + { + "id": "e119caae-1424-47de-9f64-11ac73ae0e75", + "name": "admin-instance-2", + "status": "ACTIVE", + "tenant_id": "066a633fd999424faa3409ab60221fbf", + "user_id": "adbc53ea724f4e2bb954e27725b6cf5b", + "metadata": {}, + "hostId": "328934663ff29af3f46e70889eb46bf2a367883d1025c02e6365aca1", + "image": "", + "flavor": { + "vcpus": 1, + "ram": 128, + "disk": 1, + "ephemeral": 0, + "swap": 0, + "original_name": "m1.nano", + "extra_specs": { + "hw_rng:allowed": "True" + } + }, + "created": "2024-10-15T13:14:10Z", + "updated": "2024-10-15T13:14:33Z", + "addresses": { + "public": [ + { + "version": 4, + "addr": "172.24.4.153", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:38:11:6c" + }, + { + "version": 6, + "addr": "2001:db8::e", + "OS-EXT-IPS:type": "fixed", + "OS-EXT-IPS-MAC:mac_addr": "fa:16:3e:38:11:6c" + } + ] + }, + "accessIPv4": "", + "accessIPv6": "", + "links": [ + { + "rel": "self", + "href": "http://172.16.20.4/compute/v2.1/servers/e119caae-1424-47de-9f64-11ac73ae0e75" + }, + { + "rel": "bookmark", + "href": "http://172.16.20.4/compute/servers/e119caae-1424-47de-9f64-11ac73ae0e75" + } + ], + "OS-DCF:diskConfig": "AUTO", + "progress": 0, + "OS-EXT-AZ:availability_zone": "nova", + "config_drive": "", + "key_name": null, + "OS-SRV-USG:launched_at": "2024-10-15T13:14:33.000000", + "OS-SRV-USG:terminated_at": null, + "OS-EXT-SRV-ATTR:host": "cpu-node-4", + "OS-EXT-SRV-ATTR:instance_name": "instance-00000002", + "OS-EXT-SRV-ATTR:hypervisor_hostname": "cpu-node-4", + "OS-EXT-SRV-ATTR:reservation_id": "r-8z0c7fi4", + "OS-EXT-SRV-ATTR:launch_index": 1, + "OS-EXT-SRV-ATTR:hostname": "admin-instance-2", + "OS-EXT-SRV-ATTR:kernel_id": "", + "OS-EXT-SRV-ATTR:ramdisk_id": "", + "OS-EXT-SRV-ATTR:root_device_name": "/dev/vda", + "OS-EXT-SRV-ATTR:user_data": null, + "OS-EXT-STS:task_state": null, + "OS-EXT-STS:vm_state": "active", + "OS-EXT-STS:power_state": 1, + "os-extended-volumes:volumes_attached": [ + { + "id": "ceee4a36-904f-4334-bed1-a39814b9ed3d", + "delete_on_termination": false + } + ], + "locked": false, + "locked_reason": null, + "description": null, + "tags": [], + "trusted_image_certificates": null, + "host_status": "UP", + "security_groups": [ + { + "name": "default" + } + ] + } + ] +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/03b060551ecc488b8756c9f27258d71e.json b/pkg/api/testdata/openstack/identity/03b060551ecc488b8756c9f27258d71e.json new file mode 100644 index 00000000..631bf29d --- /dev/null +++ b/pkg/api/testdata/openstack/identity/03b060551ecc488b8756c9f27258d71e.json @@ -0,0 +1,51 @@ +{ + "projects": [ + { + "id": "706f9e5f3e174feebcce4e7f08a7b7e3", + "name": "test-project-2", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/706f9e5f3e174feebcce4e7f08a7b7e3" + } + }, + { + "id": 
"bdb137e6ee6d427a899ac22de5d76b8c", + "name": "test-project-3", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/bdb137e6ee6d427a899ac22de5d76b8c" + } + }, + { + "id": "cca105ea0cff426e96f096887b7f4b82", + "name": "test-project-1", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/cca105ea0cff426e96f096887b7f4b82" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users/03b060551ecc488b8756c9f27258d71e/projects", + "previous": null + } +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/4223638a14e44980bf8557cd3ba14e76.json b/pkg/api/testdata/openstack/identity/4223638a14e44980bf8557cd3ba14e76.json new file mode 100644 index 00000000..64b152ef --- /dev/null +++ b/pkg/api/testdata/openstack/identity/4223638a14e44980bf8557cd3ba14e76.json @@ -0,0 +1,23 @@ +{ + "projects": [ + { + "id": "bdb137e6ee6d427a899ac22de5d76b8c", + "name": "test-project-3", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/bdb137e6ee6d427a899ac22de5d76b8c" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users/4223638a14e44980bf8557cd3ba14e76/projects", + "previous": null + } +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/5fd1986befa042a4b866944f5adbefeb.json b/pkg/api/testdata/openstack/identity/5fd1986befa042a4b866944f5adbefeb.json new file mode 100644 index 00000000..b049bf23 --- /dev/null +++ b/pkg/api/testdata/openstack/identity/5fd1986befa042a4b866944f5adbefeb.json @@ -0,0 +1,37 @@ +{ + "projects": [ + { + "id": "706f9e5f3e174feebcce4e7f08a7b7e3", + "name": "test-project-2", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/706f9e5f3e174feebcce4e7f08a7b7e3" + } + }, + { + "id": "bdb137e6ee6d427a899ac22de5d76b8c", + "name": "test-project-3", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/bdb137e6ee6d427a899ac22de5d76b8c" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users/5fd1986befa042a4b866944f5adbefeb/projects", + "previous": null + } +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/adbc53ea724f4e2bb954e27725b6cf5b.json b/pkg/api/testdata/openstack/identity/adbc53ea724f4e2bb954e27725b6cf5b.json new file mode 100644 index 00000000..522204bf --- /dev/null +++ b/pkg/api/testdata/openstack/identity/adbc53ea724f4e2bb954e27725b6cf5b.json @@ -0,0 +1,37 @@ +{ + "projects": [ + { + "id": "066a633fd999424faa3409ab60221fbf", + "name": "admin", + "domain_id": "default", + "description": "Bootstrap project for initializing the cloud.", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/066a633fd999424faa3409ab60221fbf" + } + }, + { + 
"id": "9d87d46f8af54da2adc3e7b94c9d3c30", + "name": "demo", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/9d87d46f8af54da2adc3e7b94c9d3c30" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users/adbc53ea724f4e2bb954e27725b6cf5b/projects", + "previous": null + } +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/dc87e591c0d247d5ac04e873bd8a1646.json b/pkg/api/testdata/openstack/identity/dc87e591c0d247d5ac04e873bd8a1646.json new file mode 100644 index 00000000..99ea2849 --- /dev/null +++ b/pkg/api/testdata/openstack/identity/dc87e591c0d247d5ac04e873bd8a1646.json @@ -0,0 +1,23 @@ +{ + "projects": [ + { + "id": "b964a9e51c0046a4a84d3f83a135a97c", + "name": "test-project-4", + "domain_id": "default", + "description": "", + "enabled": true, + "parent_id": "default", + "is_domain": false, + "tags": [], + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/projects/b964a9e51c0046a4a84d3f83a135a97c" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users/adbc53ea724f4e2bb954e27725b6cf5b/projects", + "previous": null + } +} \ No newline at end of file diff --git a/pkg/api/testdata/openstack/identity/users.json b/pkg/api/testdata/openstack/identity/users.json new file mode 100644 index 00000000..90508d53 --- /dev/null +++ b/pkg/api/testdata/openstack/identity/users.json @@ -0,0 +1,76 @@ +{ + "users": [ + { + "id": "adbc53ea724f4e2bb954e27725b6cf5b", + "name": "admin", + "domain_id": "default", + "enabled": true, + "password_expires_at": null, + "options": {}, + "links": { + "self": "http://172.16.20.4/identity/v3/users/adbc53ea724f4e2bb954e27725b6cf5b" + } + }, + { + "id": "03b060551ecc488b8756c9f27258d71e", + "name": "test-user-1", + "domain_id": "default", + "enabled": true, + "default_project_id": "cca105ea0cff426e96f096887b7f4b82", + "password_expires_at": null, + "options": { + "lock_password": false + }, + "links": { + "self": "http://172.16.20.4/identity/v3/users/03b060551ecc488b8756c9f27258d71e" + } + }, + { + "id": "5fd1986befa042a4b866944f5adbefeb", + "name": "test-user-2", + "domain_id": "default", + "enabled": true, + "default_project_id": "706f9e5f3e174feebcce4e7f08a7b7e3", + "password_expires_at": null, + "options": { + "lock_password": false + }, + "links": { + "self": "http://172.16.20.4/identity/v3/users/5fd1986befa042a4b866944f5adbefeb" + } + }, + { + "id": "4223638a14e44980bf8557cd3ba14e76", + "name": "test-user-3", + "domain_id": "default", + "enabled": true, + "default_project_id": "bdb137e6ee6d427a899ac22de5d76b8c", + "password_expires_at": null, + "options": { + "lock_password": false + }, + "links": { + "self": "http://172.16.20.4/identity/v3/users/4223638a14e44980bf8557cd3ba14e76" + } + }, + { + "id": "dc87e591c0d247d5ac04e873bd8a1646", + "name": "test-user-4", + "domain_id": "default", + "enabled": true, + "default_project_id": "b964a9e51c0046a4a84d3f83a135a97c", + "password_expires_at": null, + "options": { + "lock_password": false + }, + "links": { + "self": "http://172.16.20.4/identity/v3/users/dc87e591c0d247d5ac04e873bd8a1646" + } + } + ], + "links": { + "next": null, + "self": "http://172.16.20.4/identity/v3/users", + "previous": null + } +} diff --git a/pkg/api/testdata/output/e2e-test-api-server-admin-query-all-selected-fields.txt 
b/pkg/api/testdata/output/e2e-test-api-server-admin-query-all-selected-fields.txt index c52d13e5..afc70bd2 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-admin-query-all-selected-fields.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-admin-query-all-selected-fields.txt @@ -1 +1 @@ -{"status":"success","data":[{"uuid":"1009248","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"},{"uuid":"11508","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"},{"uuid":"14508","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"},{"uuid":"147975","started_at":"2023-02-21T14:37:07+0100","ended_at":"2023-02-21T15:26:29+0100"},{"uuid":"1479765","started_at":"2023-02-21T14:37:07+0100","ended_at":"2023-02-21T15:26:29+0100"},{"uuid":"1481508","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"},{"uuid":"1481510","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"},{"uuid":"81510","started_at":"2023-02-21T15:49:06+0100","ended_at":"2023-02-21T15:57:23+0100"}]} +{"status":"success","data":[{"uuid":"0687859c-b7b8-47ea-aa4c-74162f52fbfc","started_at":"2024-10-15T14:29:34+0200","ended_at":"N/A"},{"uuid":"16af784a-4fa1-429e-953f-4ef5dc462960","started_at":"2024-10-15T13:32:45+0200","ended_at":"2024-10-15T14:21:29+0200"},{"uuid":"1c8ad46b-c4a5-42c5-81a0-194aa592f1e1","started_at":"2024-10-15T13:18:13+0200","ended_at":"2024-10-15T14:25:30+0200"},{"uuid":"1cef0381-0a5a-42e6-9e9b-3d88f84be971","started_at":"2024-10-15T14:28:50+0200","ended_at":"N/A"},{"uuid":"1e3b7f2c-a648-41a8-b53e-4fa5bd2ae73c","started_at":"2024-10-15T13:15:42+0200","ended_at":"N/A"},{"uuid":"242760b7-756f-4f13-a64f-e1a6e012f708","started_at":"2024-10-15T13:17:15+0200","ended_at":"N/A"},{"uuid":"38c14d24-2e8a-4cb6-ad5b-851612e800ab","started_at":"2024-10-15T13:18:08+0200","ended_at":"2024-10-15T14:21:57+0200"},{"uuid":"3bc984e2-ff73-417c-b123-fdb365ddf241","started_at":"2024-10-15T13:32:40+0200","ended_at":"N/A"},{"uuid":"43c81538-d7ea-479f-b71f-934521a6f7bf","started_at":"2024-10-15T13:18:10+0200","ended_at":"N/A"},{"uuid":"6330d17c-6fe8-419c-a044-58e590480e18","started_at":"2024-10-15T13:18:08+0200","ended_at":"2024-10-15T14:21:57+0200"},{"uuid":"66c3eff0-52eb-45e2-a5da-5fe21c0ef3f3","started_at":"2024-10-15T13:16:55+0200","ended_at":"N/A"},{"uuid":"7fe4fa04-e4ea-4b92-84f4-45c9e78b9520","started_at":"2024-10-15T13:15:11+0200","ended_at":"N/A"},{"uuid":"83b7be64-daff-477c-8f3d-2ce880c44a53","started_at":"2024-10-15T13:14:34+0200","ended_at":"N/A"},{"uuid":"b6eafae3-5c24-4f25-b297-5ef291d9487d","started_at":"2024-10-15T13:16:00+0200","ended_at":"N/A"},{"uuid":"c98235c9-d9c0-4b54-8782-0fd8d6312539","started_at":"2024-10-15T14:29:32+0200","ended_at":"N/A"},{"uuid":"d0d60434-4bf1-4eb1-9469-d7b38083a88f","started_at":"2024-10-15T13:32:43+0200","ended_at":"2024-10-15T14:32:09+0200"},{"uuid":"d8af1245-4639-4981-95db-ae097021401d","started_at":"2024-10-15T13:18:08+0200","ended_at":"2024-10-15T14:21:57+0200"},{"uuid":"e119caae-1424-47de-9f64-11ac73ae0e75","started_at":"2024-10-15T13:14:33+0200","ended_at":"N/A"}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-cluster-admin-query.txt b/pkg/api/testdata/output/e2e-test-api-server-cluster-admin-query.txt index 63876adb..08d4610f 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-cluster-admin-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-cluster-admin-query.txt @@ -1 +1 @@ 
-{"status":"success","data":[{"id":"slurm-0","manager":"slurm"},{"id":"slurm-1","manager":"slurm"}]} +{"status":"success","data":[{"id":"os-0","manager":"openstack"},{"id":"os-1","manager":"openstack"},{"id":"slurm-0","manager":"slurm"},{"id":"slurm-1","manager":"slurm"}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-current-stats-admin-query.txt b/pkg/api/testdata/output/e2e-test-api-server-current-stats-admin-query.txt index a6c30645..7bea2037 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-current-stats-admin-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-current-stats-admin-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":10,"num_inactive_units":8,"num_active_units":2,"num_projects":5,"num_users":6}]} +{"status":"success","data":[{"cluster_id":"os-1","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-current-usage-admin-experimental-query.txt b/pkg/api/testdata/output/e2e-test-api-server-current-usage-admin-experimental-query.txt index ac791ea8..353627a6 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-current-usage-admin-experimental-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-current-usage-admin-experimental-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc1","groupname":"grp1","username":"usr1","total_time_seconds":{"alloc_cpumemtime":970588160,"alloc_cputime":23696,"alloc_gpumemtime":2962,"alloc_gputime":23696,"walltime":2962},"avg_cpu_usage":{"global":21.22149394},"avg_cpu_mem_usage":{"global":21.22149394},"total_cpu_energy_usage_kwh":{"total":21.22149394},"total_cpu_emissions_gms":{"emaps_total":21.22149394,"rte_total":21.22149394},"avg_gpu_usage":{"global":21.22149394},"avg_gpu_mem_usage":{"global":21.22149394},"total_gpu_energy_usage_kwh":{"total":21.22149394},"total_gpu_emissions_gms":{"emaps_total":21.22149394,"rte_total":21.22149394}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":2,"project":"acc1","groupname":"grp15","username":"usr15","total_time_seconds":{"alloc_cpumemtime":325713920,"alloc_cputime":15904,"alloc_gpumemtime":994,"alloc_gputime":7952,"walltime":994},"avg_cpu_usage":{"global":18.18507953},"avg_cpu_mem_usage":{"global":18.18507953},"total_cpu_energy_usage_kwh":{"total":36.37015906},"total_cpu_emissions_gms":{"emaps_total":36.37015906,"rte_total":36.37015906},"avg_gpu_usage":{"global":18.18507953},"avg_gpu_mem_usage":{"global":18.18507953},"total_gpu_energy_usage_kwh":{"total":36.37015906},"total_gpu_emissions_gms":{"emaps_total":36.37015906,"rte_total":36.37015906}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc4","groupname":"grp4","username":"usr4","total_time_seconds":{"alloc_cpumemtime":162856960,"alloc_cputime":7952,"alloc_gpumemtime":497,"alloc_gputime":3976,"walltime":497},"avg_cpu_usage":{"global":14.03205767},"avg_cpu_mem_usage":{"global":14.03205767},"total_cpu_energy_usage_kwh":{"total":14.03205767},"total_cpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767},"avg_gpu_usage":{"global":14.03205767},"avg_gpu_mem_usage":{"global":14.03205767},"total_gpu_energy_usage_kwh":{"total":14.03205767},"total_gpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc1","groupn
ame":"grp8","username":"usr8","total_time_seconds":{"alloc_cpumemtime":970588160,"alloc_cputime":23696,"alloc_gpumemtime":2962,"alloc_gputime":23696,"walltime":2962},"avg_cpu_usage":{"global":20.21483680},"avg_cpu_mem_usage":{"global":20.21483680},"total_cpu_energy_usage_kwh":{"total":20.21483680},"total_cpu_emissions_gms":{"emaps_total":20.21483680,"rte_total":20.21483680},"avg_gpu_usage":{"global":20.21483680},"avg_gpu_mem_usage":{"global":20.21483680},"total_gpu_energy_usage_kwh":{"total":20.21483680},"total_gpu_emissions_gms":{"emaps_total":20.21483680,"rte_total":20.21483680}}]} +{"status":"success","data":[{"cluster_id":"os-1","resource_manager":"openstack","num_units":2,"project":"test-project-4","groupname":"","username":"test-user-4","total_time_seconds":{"alloc_cpumemtime":0,"alloc_cputime":0,"alloc_gpumemtime":0,"alloc_gputime":0,"walltime":0},"avg_cpu_usage":{"global":0},"avg_cpu_mem_usage":{"global":0},"total_cpu_energy_usage_kwh":{"total":133.04426070},"total_cpu_emissions_gms":{"emaps_total":133.04426070,"rte_total":133.04426070},"avg_gpu_usage":{"global":0},"avg_gpu_mem_usage":{"global":0},"total_gpu_energy_usage_kwh":{"total":133.04426070},"total_gpu_emissions_gms":{"emaps_total":133.04426070,"rte_total":133.04426070},"total_io_write_stats":{"bytes":1330442607,"requests":13304426070},"total_io_read_stats":{"bytes":1330442607,"requests":13304426070},"total_ingress_stats":{"bytes":133044260700,"packets":1330442607000},"total_outgress_stats":{"bytes":133044260700,"packets":1330442607000}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc1","groupname":"grp1","username":"usr1","total_time_seconds":{"alloc_cpumemtime":970588160,"alloc_cputime":23696,"alloc_gpumemtime":2962,"alloc_gputime":23696,"walltime":2962},"avg_cpu_usage":{"global":21.22149394},"avg_cpu_mem_usage":{"global":21.22149394},"total_cpu_energy_usage_kwh":{"total":21.22149394},"total_cpu_emissions_gms":{"emaps_total":21.22149394,"rte_total":21.22149394},"avg_gpu_usage":{"global":21.22149394},"avg_gpu_mem_usage":{"global":21.22149394},"total_gpu_energy_usage_kwh":{"total":21.22149394},"total_gpu_emissions_gms":{"emaps_total":21.22149394,"rte_total":21.22149394},"total_io_write_stats":{"bytes":0,"requests":0},"total_io_read_stats":{"bytes":0,"requests":0},"total_ingress_stats":{"bytes":0,"packets":0},"total_outgress_stats":{"bytes":0,"packets":0}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":2,"project":"acc1","groupname":"grp15","username":"usr15","total_time_seconds":{"alloc_cpumemtime":325713920,"alloc_cputime":15904,"alloc_gpumemtime":994,"alloc_gputime":7952,"walltime":994},"avg_cpu_usage":{"global":18.18507953},"avg_cpu_mem_usage":{"global":18.18507953},"total_cpu_energy_usage_kwh":{"total":36.37015906},"total_cpu_emissions_gms":{"emaps_total":36.37015906,"rte_total":36.37015906},"avg_gpu_usage":{"global":18.18507953},"avg_gpu_mem_usage":{"global":18.18507953},"total_gpu_energy_usage_kwh":{"total":36.37015906},"total_gpu_emissions_gms":{"emaps_total":36.37015906,"rte_total":36.37015906},"total_io_write_stats":{"bytes":0,"requests":0},"total_io_read_stats":{"bytes":0,"requests":0},"total_ingress_stats":{"bytes":0,"packets":0},"total_outgress_stats":{"bytes":0,"packets":0}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc4","groupname":"grp4","username":"usr4","total_time_seconds":{"alloc_cpumemtime":162856960,"alloc_cputime":7952,"alloc_gpumemtime":497,"alloc_gputime":3976,"walltime":497},"avg_cpu_usage":{"global":14.03205767},"avg_c
pu_mem_usage":{"global":14.03205767},"total_cpu_energy_usage_kwh":{"total":14.03205767},"total_cpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767},"avg_gpu_usage":{"global":14.03205767},"avg_gpu_mem_usage":{"global":14.03205767},"total_gpu_energy_usage_kwh":{"total":14.03205767},"total_gpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767},"total_io_write_stats":{"bytes":0,"requests":0},"total_io_read_stats":{"bytes":0,"requests":0},"total_ingress_stats":{"bytes":0,"packets":0},"total_outgress_stats":{"bytes":0,"packets":0}},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc1","groupname":"grp8","username":"usr8","total_time_seconds":{"alloc_cpumemtime":970588160,"alloc_cputime":23696,"alloc_gpumemtime":2962,"alloc_gputime":23696,"walltime":2962},"avg_cpu_usage":{"global":20.21483680},"avg_cpu_mem_usage":{"global":20.21483680},"total_cpu_energy_usage_kwh":{"total":20.21483680},"total_cpu_emissions_gms":{"emaps_total":20.21483680,"rte_total":20.21483680},"avg_gpu_usage":{"global":20.21483680},"avg_gpu_mem_usage":{"global":20.21483680},"total_gpu_energy_usage_kwh":{"total":20.21483680},"total_gpu_emissions_gms":{"emaps_total":20.21483680,"rte_total":20.21483680},"total_io_write_stats":{"bytes":0,"requests":0},"total_io_read_stats":{"bytes":0,"requests":0},"total_ingress_stats":{"bytes":0,"packets":0},"total_outgress_stats":{"bytes":0,"packets":0}}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-current-usage-experimental-query.txt b/pkg/api/testdata/output/e2e-test-api-server-current-usage-experimental-query.txt index e9090d4e..366a5a25 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-current-usage-experimental-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-current-usage-experimental-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":1,"project":"acc4","groupname":"grp4","username":"usr4","total_time_seconds":{"alloc_cpumemtime":162856960,"alloc_cputime":7952,"alloc_gpumemtime":497,"alloc_gputime":3976,"walltime":497},"avg_cpu_usage":{"global":14.03205767},"avg_cpu_mem_usage":{"global":14.03205767},"total_cpu_energy_usage_kwh":{"total":14.03205767},"total_cpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767},"avg_gpu_usage":{"global":14.03205767},"avg_gpu_mem_usage":{"global":14.03205767},"total_gpu_energy_usage_kwh":{"total":14.03205767},"total_gpu_emissions_gms":{"emaps_total":14.03205767,"rte_total":14.03205767}}]} +{"status":"success","data":[{"cluster_id":"os-1","resource_manager":"openstack","num_units":2,"project":"test-project-4","groupname":"","username":"test-user-4","total_time_seconds":{"alloc_cpumemtime":0,"alloc_cputime":0,"alloc_gpumemtime":0,"alloc_gputime":0,"walltime":0},"avg_cpu_usage":{"global":0},"avg_cpu_mem_usage":{"global":0},"total_cpu_energy_usage_kwh":{"total":133.04426070},"total_cpu_emissions_gms":{"emaps_total":133.04426070,"rte_total":133.04426070},"avg_gpu_usage":{"global":0},"avg_gpu_mem_usage":{"global":0},"total_gpu_energy_usage_kwh":{"total":133.04426070},"total_gpu_emissions_gms":{"emaps_total":133.04426070,"rte_total":133.04426070},"total_io_write_stats":{"bytes":1330442607,"requests":13304426070},"total_io_read_stats":{"bytes":1330442607,"requests":13304426070},"total_ingress_stats":{"bytes":133044260700,"packets":1330442607000},"total_outgress_stats":{"bytes":133044260700,"packets":1330442607000}}]} diff --git 
a/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt b/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt index 01060f5f..9efcde66 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-global-stats-admin-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"slurm-0","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7}]} +{"status":"success","data":[{"cluster_id":"os-0","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"os-1","resource_manager":"openstack","num_units":18,"num_inactive_units":6,"num_active_units":12,"num_projects":5,"num_users":5},{"cluster_id":"slurm-0","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7},{"cluster_id":"slurm-1","resource_manager":"slurm","num_units":12,"num_inactive_units":10,"num_active_units":2,"num_projects":5,"num_users":7}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-project-admin-query.txt b/pkg/api/testdata/output/e2e-test-api-server-project-admin-query.txt index 1a2c8db4..a9d338df 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-project-admin-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-project-admin-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"cluster_id":"slurm-0","resource_manager":"slurm","name":"acc1","users":["usr1","usr15","usr8"]},{"cluster_id":"slurm-1","resource_manager":"slurm","name":"acc1","users":["usr1","usr15","usr8"]}]} +{"status":"success","data":[{"uid":"bdb137e6ee6d427a899ac22de5d76b8c","cluster_id":"os-0","resource_manager":"openstack","name":"test-project-3","users":["test-user-1","test-user-2","test-user-3"]},{"uid":"bdb137e6ee6d427a899ac22de5d76b8c","cluster_id":"os-1","resource_manager":"openstack","name":"test-project-3","users":["test-user-1","test-user-2","test-user-3"]}]} diff --git a/pkg/api/testdata/output/e2e-test-api-server-running-query.txt b/pkg/api/testdata/output/e2e-test-api-server-running-query.txt index 3d65fe9e..6b0c8169 100644 --- a/pkg/api/testdata/output/e2e-test-api-server-running-query.txt +++ b/pkg/api/testdata/output/e2e-test-api-server-running-query.txt @@ -1 +1 @@ -{"status":"success","data":[{"uuid":"147975","started_at":"2023-02-21T14:37:07+0100","state":"CANCELLED by 1003","allocation":{"billing":80,"cpus":8,"gpus":8,"mem":343597383680,"nodes":1},"tags":{"exit_code":"0:0","gid":1003,"nodelist":"compute-0","nodelistexp":"compute-0","partition":"part1","qos":"qos1","uid":1003,"workdir":"/home/usr3"}},{"uuid":"1481510","started_at":"2023-02-21T15:49:06+0100","state":"CANCELLED by 1003","allocation":{"billing":160,"cpus":16,"gpus":8,"mem":343597383680,"nodes":2},"tags":{"exit_code":"0:0","gid":1003,"nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","partition":"part1","qos":"qos1","uid":1003,"workdir":"/home/usr3"}},{"uuid":"2009248","started_at":"2023-02-21T15:49:06+0100","state":"RUNNING","allocation":{"billing":0,"cpus":0,"gpus":0,"mem":0,"nodes":2},"tags":{"exit_code":"0:0","gid":1003,"nodelist":"compute-[0-2]","nodelistexp":"compute-0|compute-1|compute-2","partition":"part2","qos":"qos3","uid":1003,"workdir":"/home/usr3"}}]} 
+{"status":"success","data":[{"uuid":"1cef0381-0a5a-42e6-9e9b-3d88f84be971","started_at":"2024-10-15T14:28:50+0200","state":"ACTIVE","allocation":{"disk":1,"extra_specs":{"hw_rng:allowed":"True","resources:VGPU":"1"},"mem":8192,"name":"m10.vgpu","swap":0,"vcpus":8},"tags":{"az":"nova","hypervisor":"gpu-node-4","metadata":{},"power_state":"RUNNING","reservation_id":"r-ct4kh3w1","server_groups":"","tags":[]}},{"uuid":"1e3b7f2c-a648-41a8-b53e-4fa5bd2ae73c","started_at":"2024-10-15T13:15:42+0200","state":"ACTIVE","allocation":{"disk":1,"extra_specs":{"hw_rng:allowed":"True"},"mem":256,"name":"cirros256","swap":0,"vcpus":1},"tags":{"az":"nova","hypervisor":"cpu-node-4","metadata":{},"power_state":"RUNNING","reservation_id":"r-tk530ak6","server_groups":"","tags":[]}},{"uuid":"7fe4fa04-e4ea-4b92-84f4-45c9e78b9520","started_at":"2024-10-15T13:15:11+0200","state":"ACTIVE","allocation":{"disk":1,"extra_specs":{"hw_rng:allowed":"True"},"mem":192,"name":"m1.micro","swap":0,"vcpus":1},"tags":{"az":"nova","hypervisor":"cpu-node-4","metadata":{},"power_state":"RUNNING","reservation_id":"r-ztao3fbf","server_groups":"","tags":[]}},{"uuid":"b6eafae3-5c24-4f25-b297-5ef291d9487d","started_at":"2024-10-15T13:16:00+0200","state":"SUSPENDED","allocation":{"disk":1,"extra_specs":{"hw_rng:allowed":"True"},"mem":128,"name":"m1.nano","swap":0,"vcpus":1},"tags":{"az":"nova","hypervisor":"cpu-node-4","metadata":{},"power_state":"SHUTDOWN","reservation_id":"r-ks8nrkb2","server_groups":"","tags":[]}}]} diff --git a/pkg/collector/gpu_test.go b/pkg/collector/gpu_test.go index 61976469..21e44fc5 100644 --- a/pkg/collector/gpu_test.go +++ b/pkg/collector/gpu_test.go @@ -603,6 +603,82 @@ func TestUpdateMdevs(t *testing.T) { assert.EqualValues(t, expectedDevs, updatedGPUDevs) } +func TestUpdateMdevsEviction(t *testing.T) { + nvidiaVGPULog := `GPU 00000000:10:00.0 + Active vGPUs : 1 + vGPU ID : 3251634213 + MDEV UUID : c73f1fa6-489e-4834-9476-d70dabd98c40 + GPU Instance ID : N/A + + ` + tempDir := t.TempDir() + nvidiaSMIPath := filepath.Join(tempDir, "nvidia-smi") + content := fmt.Sprintf(`#!/bin/bash +echo """%s""" +`, nvidiaVGPULog) + os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec + + _, err := CEEMSExporterApp.Parse( + []string{ + "--collector.gpu.nvidia-smi-path", nvidiaSMIPath, + }, + ) + require.NoError(t, err) + + // Set devices + devs := []Device{ + { + busID: BusID{0x0, 0x10, 0x0, 0x0}, + vgpuEnabled: true, + }, + } + + // Now updates gpuDevices with mdevs + updatedGPUDevs, err := updateGPUMdevs(devs) + require.NoError(t, err) + assert.EqualValues(t, []string{"c73f1fa6-489e-4834-9476-d70dabd98c40"}, updatedGPUDevs[0].mdevUUIDs) + + // Update nvidia-smi output to simulate a new mdev addition + nvidiaVGPULog = `GPU 00000000:10:00.0 + Active vGPUs : 2 + vGPU ID : 3251634213 + MDEV UUID : c73f1fa6-489e-4834-9476-d70dabd98c40 + GPU Instance ID : N/A + + vGPU ID : 3251634214 + MDEV UUID : 741ac383-27e9-49a9-9955-b513ad2e2e16 + GPU Instance ID : N/A + + ` + content = fmt.Sprintf(`#!/bin/bash +echo """%s""" +`, nvidiaVGPULog) + os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec + + // Now update gpuDevices again with mdevs + updatedGPUDevs, err = updateGPUMdevs(devs) + require.NoError(t, err) + assert.EqualValues(t, []string{"c73f1fa6-489e-4834-9476-d70dabd98c40", "741ac383-27e9-49a9-9955-b513ad2e2e16"}, updatedGPUDevs[0].mdevUUIDs) + + // Update nvidia-smi output to simulate removal of an existing mdev + nvidiaVGPULog = `GPU 00000000:10:00.0 + Active vGPUs : 1 + vGPU ID : 3251634214 + MDEV UUID : 
741ac383-27e9-49a9-9955-b513ad2e2e16 + GPU Instance ID : N/A + + ` + content = fmt.Sprintf(`#!/bin/bash +echo """%s""" +`, nvidiaVGPULog) + os.WriteFile(nvidiaSMIPath, []byte(content), 0o700) // #nosec + + // Now update gpuDevices again with mdevs + updatedGPUDevs, err = updateGPUMdevs(devs) + require.NoError(t, err) + assert.EqualValues(t, []string{"741ac383-27e9-49a9-9955-b513ad2e2e16"}, updatedGPUDevs[0].mdevUUIDs) +} + func TestParseBusIDPass(t *testing.T) { id := "00000000:AD:00.0" busID, err := parseBusID(id) diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 967889d2..0e61ab2c 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -516,10 +516,12 @@ then fi export PATH="${GOBIN:-}:${PATH}" - ./bin/mock_tsdb >> "${logfile}" 2>&1 & - PROMETHEUS_PID=$! + ./bin/mock_servers prom os-compute os-identity >> "${logfile}" 2>&1 & + MOCK_SERVERS_PID=$! waitport "9090" + waitport "8080" + waitport "7070" # Copy config file to tmpdir cp pkg/api/testdata/config.yml "${tmpdir}/config.yml" @@ -535,7 +537,7 @@ then --log.level="debug" >> "${logfile}" 2>&1 & CEEMS_API=$! - echo "${PROMETHEUS_PID} ${CEEMS_API}" > "${pidfile}" + echo "${MOCK_SERVERS_PID} ${CEEMS_API}" > "${pidfile}" sleep 2 waitport "${port}" @@ -548,7 +550,7 @@ then get -H "X-Grafana-User: usr1" "127.0.0.1:${port}/api/${api_version}/projects?project=acc3" > "${fixture_output}" elif [ "${scenario}" = "api-project-admin-query" ] then - get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/projects/admin?project=acc1" > "${fixture_output}" + get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/projects/admin?project=test-project-3" > "${fixture_output}" elif [ "${scenario}" = "api-user-query" ] then get -H "X-Grafana-User: usr1" "127.0.0.1:${port}/api/${api_version}/users" > "${fixture_output}" @@ -569,7 +571,7 @@ then get -H "X-Grafana-User: usr2" "127.0.0.1:${port}/api/${api_version}/units?cluster_id=slurm-0&from=1676934000&to=1677538800&field=uuiid" > "${fixture_output}" elif [ "${scenario}" = "api-running-query" ] then - get -H "X-Grafana-User: usr3" "127.0.0.1:${port}/api/${api_version}/units?running&cluster_id=slurm-1&from=1676934000&to=1677538800&field=uuid&field=state&field=started_at&field=allocation&field=tags" > "${fixture_output}" + get -H "X-Grafana-User: test-user-1" "127.0.0.1:${port}/api/${api_version}/units?running&cluster_id=os-1&field=uuid&field=state&field=started_at&field=allocation&field=tags" > "${fixture_output}" elif [ "${scenario}" = "api-admin-query" ] then get -H "X-Grafana-User: grafana" -H "X-Dashboard-User: usr3" "127.0.0.1:${port}/api/${api_version}/units?cluster_id=slurm-0&project=acc3&from=1676934000&to=1677538800" > "${fixture_output}" @@ -578,7 +580,7 @@ then get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/units/admin?cluster_id=slurm-1&from=1676934000&to=1677538800" > "${fixture_output}" elif [ "${scenario}" = "api-admin-query-all-selected-fields" ] then - get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/units/admin?cluster_id=slurm-0&from=1676934000&to=1677538800&field=uuid&field=started_at&field=ended_at&field=foo" > "${fixture_output}" + get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/units/admin?cluster_id=os-0&running&from=1728990800&to=1728995400&field=uuid&field=started_at&field=ended_at&field=foo" > "${fixture_output}" elif [ "${scenario}" = "api-admin-denied-query" ] then get -H "X-Grafana-User: usr1" "127.0.0.1:${port}/api/${api_version}/units/admin" > "${fixture_output}" 
@@ -587,7 +589,7 @@ then get -H "X-Grafana-User: usr1" "127.0.0.1:${port}/api/${api_version}/usage/current?cluster_id=slurm-1&from=1676934000&to=1677538800" > "${fixture_output}" elif [ "${scenario}" = "api-current-usage-experimental-query" ] then - get -H "X-Grafana-User: usr4" "127.0.0.1:${port}/api/${api_version}/usage/current?cluster_id=slurm-1&from=1676934000&experimental" > "${fixture_output}" + get -H "X-Grafana-User: test-user-4" "127.0.0.1:${port}/api/${api_version}/usage/current?cluster_id=os-1&from=1728990800&experimental" > "${fixture_output}" elif [ "${scenario}" = "api-global-usage-query" ] then get -H "X-Grafana-User: usr1" "127.0.0.1:${port}/api/${api_version}/usage/global?cluster_id=slurm-0&field=username&field=project&field=num_units" > "${fixture_output}" @@ -596,13 +598,13 @@ then get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/usage/current/admin?cluster_id=slurm-1&user=usr15&user=usr3&from=1676934000&to=1677538800" > "${fixture_output}" elif [ "${scenario}" = "api-current-usage-admin-experimental-query" ] then - get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/usage/current/admin?cluster_id=slurm-1&user=usr15&user=usr4&from=1676934000&experimental" > "${fixture_output}" + get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/usage/current/admin?cluster_id=slurm-1&user=usr15&user=usr4&cluster_id=os-1&user=test-user-4&from=1728990800&running&experimental" > "${fixture_output}" elif [ "${scenario}" = "api-global-usage-admin-query" ] then get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/usage/global/admin?cluster_id=slurm-0&field=username&field=project&field=num_units" > "${fixture_output}" elif [ "${scenario}" = "api-current-stats-admin-query" ] then - get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/stats/current/admin?cluster_id=slurm-1&from=1676934000&to=1677538800" > "${fixture_output}" + get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/stats/current/admin?cluster_id=os-1&from=1728994800&to=1729005000" > "${fixture_output}" elif [ "${scenario}" = "api-global-stats-admin-query" ] then get -H "X-Grafana-User: grafana" "127.0.0.1:${port}/api/${api_version}/stats/global/admin" > "${fixture_output}" @@ -729,6 +731,12 @@ then waitport "9090" + ./bin/mock_servers os-compute os-identity >> "${logfile}" 2>&1 & + MOCK_SERVERS_PID=$! + + waitport "8080" + waitport "7070" + # Copy config file to tmpdir cp pkg/api/testdata/config.yml "${tmpdir}/config.yml" @@ -750,7 +758,7 @@ then --log.level="debug" >> "${logfile}" 2>&1 & LB_PID=$! - echo "${PROMETHEUS_PID} ${CEEMS_API_PID} ${LB_PID}" > "${pidfile}" + echo "${PROMETHEUS_PID} ${MOCK_SERVERS_PID} ${CEEMS_API_PID} ${LB_PID}" > "${pidfile}" waitport "${port}" @@ -773,6 +781,12 @@ then waitport "9090" + ./bin/mock_servers os-compute os-identity >> "${logfile}" 2>&1 & + MOCK_SERVERS_PID=$! + + waitport "8080" + waitport "7070" + # Copy config file to tmpdir cp pkg/api/testdata/config.yml "${tmpdir}/config.yml" @@ -794,7 +808,7 @@ then --log.level="debug" >> "${logfile}" 2>&1 & LB_PID=$! 
- echo "${PROMETHEUS_PID} ${CEEMS_API_PID} ${LB_PID}" > "${pidfile}" + echo "${PROMETHEUS_PID} ${MOCK_SERVERS_PID} ${CEEMS_API_PID} ${LB_PID}" > "${pidfile}" waitport "${port}" diff --git a/website/cspell.json b/website/cspell.json index 71f4486c..622e2c1a 100644 --- a/website/cspell.json +++ b/website/cspell.json @@ -53,7 +53,9 @@ "memsw", "retrans", "Mellanox", - "blkio" + "blkio", + "tsbd", + "gpuuuid" ], // flagWords - list of words to be always considered incorrect // This is useful for offensive words and common spelling errors. diff --git a/website/docs/00-introduction.md b/website/docs/00-introduction.md index aab60c07..70818dce 100644 --- a/website/docs/00-introduction.md +++ b/website/docs/00-introduction.md @@ -23,6 +23,18 @@ of backward compatibility. ::: +## Features + +- Monitor energy, performance, IO and network metrics for different types of resource +managers (SLURM, Openstack, k8s) +- Support NVIDIA (MIG and vGPU) and AMD GPUs +- Realtime access to metrics *via* Grafana dashboards +- Access control to Prometheus datasource in Grafana +- Stores aggregated metrics in a separate DB that can be retained for long time +- CEEMS apps are [capability aware](https://tbhaxor.com/understanding-linux-capabilities/) + +## Components + CEEMS provide a set of components that enable operators to monitor the consumption of resources of the compute units of different resource managers like SLURM, Openstack and Kubernetes. diff --git a/website/docs/configuration/ceems-api-server.md b/website/docs/configuration/ceems-api-server.md index 7d5798be..a63c2baf 100644 --- a/website/docs/configuration/ceems-api-server.md +++ b/website/docs/configuration/ceems-api-server.md @@ -4,691 +4,546 @@ sidebar_position: 3 # CEEMS API Server -The following shows the reference for CEEMS API server config. A valid sample configuration -file can be found in the [repo](https://github.com/mahendrapaipuri/ceems/blob/main/build/config/ceems_api_server/ceems_api_server.yml) +CEEMS API server can be configured using a YAML file under the key `ceems_api_server`. +Along with the base configuration of the server, we need to provide the configurations +of clusters that we need to monitor and updaters that will be used to update the +compute unit metrics. They can be configured under keys `clusters` and `updaters`, +respectively. Thus a bare configuration file looks as below: ```yaml -# Configuration file to configure CEEMS API server -# -# This config file has following sections: -# - `ceems_api_server`: Core configuration of CEEMS API server -# - `clusters`: Configuration of clusters that are being monitored -# - `updaters`: Configuration of updaters -# ---- -# CEEMS API Server and data config -ceems_api_server: - # Data related configuration of the CEEMS API server. This config concerns with the - # locations where data will be saved, frequency of data updates, etc. - # - data: - [ ] +# CEEMS API Server configuration skeleton - # HTTP web admin related config for CEEMS API server - # - admin: - [ ] +ceems_api_server: - # HTTP web related config for CEEMS API server. - # - web: - # Maximum allowable query range, ie, the difference between `from` and `to` query - # parameters. - # - # This can be used to restrict the query range made by the users to avoid OOM errors - # when handling too much data. - # - # Default value `0s` means no restrictions are imposed on the query. - # - # Units Supported: y, w, d, h, m, s, ms. 
- # - [ max_query: | default: 0s ] - - # Number of requests allowed in ONE MINUTE per client identified by Real IP address. - # Request headers `True-Client-IP`, `X-Real-IP` and `X-Forwarded-For` are looked up - # to get the real IP address. - # - # This is to effectively impose a rate limit for the entire CEEMS server irrespective - # of URL path. We advise to set it to a value based on your needs to avoid DoS/DDoS - # attacks. - # - # Rate limiting is done using the Sliding Window Counter pattern inspired by - # CloudFlare https://blog.cloudflare.com/counting-things-a-lot-of-different-things/ - # - # Default value `0` means no rate limiting is applied. - # - [ requests_limit: | default: 0 ] - - # It will be used to prefix all HTTP endpoints served by CEEMS API server. - # For example, if CEEMS API server is served via a reverse proxy. - # - # Default is '/' - # - [ route_prefix: | default: / ] - -# A list of clusters from which CEEMS API server will fetch the compute units. -# -# Each cluster must provide an unique `id`. The `id` will enable CEEMS to identify -# different clusters in multi-cluster setup. This `id` must be consistent throughout -# all the CEEMS components. -# -clusters: - [ - ... ] - - -# A list of Updaters that will be used to update the compute unit metrics. This update -# step can be used to update the aggregate metrics of each compute unit in real time -# or to add complementary information to the compute units from on-premise third -# party services. -# -# Currently only TSDB updater is supported. The compute unit aggregate metrics can be -# updated from TSDB (Prometheus/VM) instances. -# -updaters: - [ - ... ] - +clusters: + +updaters: ``` -## `` +A complete reference on CEEMS API server configuration can be found in [Reference](./config-reference.md) +section. A valid sample configuration +file can be found in the [repo](https://github.com/mahendrapaipuri/ceems/blob/main/build/config/ceems_api_server/ceems_api_server.yml) + +## CEEMS API Server Configuration -A `data_config` allows configuring the DB settings of CEEMS API server. +This section guides on how to configure CEEMS API server. A sample configuration +file is shown below: ```yaml -# Path at which CEEMS API server data will be stored. -# If relative path is used, it will be resolved based on the current working directory. -# -[ path: | default = data ] - -# Units data will be fetched at this interval. CEEMS will pull the units from the -# underlying resource manager at this frequency into its own DB. -# -# Units Supported: y, w, d, h, m, s, ms. -# -[ update_interval: | default = 15m ] - -# The duration to retain the data in the DB. Units older than this duration will be -# purged from the DB. -# -# In the case of global usage stats, if the last activity on a given project/user -# combination is older than this period, those stats will be purged from the DB. -# -# Units Supported: y, w, d, h, m, s, ms. -# -[ retention_period: | default = 30d ] - -# CEEMS API server is capable of creating DB backups using SQLite backup API. Created -# DB backups will be saved to this path. NOTE that for huge DBs, this backup can take -# a considerable amount of time. -# -# Use a different disk device than `ceems_api_server.data.path` to achieve -# fault tolerance. -# -# If the path is empty, no backups will be created. -# -[ backup_path: ] - -# The interval at which DB back ups will be created. -# -# Minimum allowable interval is `1d`, ie, 1 day. -# -# Units Supported: y, w, d, h, m, s, ms. 
-#
-[ backup_interval: | default = 1d ]
+ceems_api_server:
+  data:
+    path: /path/to/ceems/data
+    update_interval: 15m
+    retention_period: 1y
+    backup_path: /path/to/backup/ceems/data
+    backup_interval: 1d
+  admin:
+    users:
+      - adm1
+      - adm2
+    grafana:
+      url: https://grafana.example.com
+      teams_ids:
+        - 1
+      authorization:
+        type: Bearer
+        credentials: mysupersecretgrafanaservicetoken
+
+  web:
+    max_query: 30d
+    requests_limit: 30
+    route_prefix: /ceems/
```
-## ``
-
-A `admin_config` allows configuring the admin users of CEEMS API server.
+The configuration for `ceems_api_server` has three sections, namely `data`, `admin` and `web`,
+for configuring different aspects of the API server. Some explanation of the `data`
+config is given below:
+
+- `data.path`: Path where all CEEMS related data will be stored.
+- `data.update_interval`: The frequency at which CEEMS API server will fetch compute units
+from the underlying cluster. Avoid very small intervals or high frequencies. `15m` is a
+sane default and should work in most production cases.
+- `data.retention_period`: CEEMS API server stores all the metadata of compute units along
+with their aggregated metrics in a SQLite relational DB. This config parameter can be used
+to configure the retention time of the compute unit data in SQLite. For example, when
+a value of `1y` is used, the compute units data of the last one year will be
+retained and the rest of the units data will be purged.
+- `data.backup_path`: It is possible to create backups of the SQLite DB at a configured interval
+set by `data.backup_interval` onto fault-tolerant storage.
+
+:::warning[WARNING]
+
+As the DB grows in size, the time taken to create a backup increases non-linearly
+and hence, use the backup option only if it is absolutely needed. A general
+advice is to use a continuous backup solution like
+[litestream](https://litestream.io/) instead of the native backup solution offered
+by CEEMS.
+
+:::
+
+CEEMS API server exposes admin endpoints in its API and the `admin` section can be used to
+configure which users can access those endpoints. More details on admin endpoints can be
+found in the [API Docs](https://mahendrapaipuri.github.io/ceems/api).
+
+- `admin.users`: A list of statically defined users that will have access to admin endpoints.
+- `admin.grafana`: Besides a static list of users, CEEMS API server can pull users from given
+Grafana teams and add them to the list of users that will be granted access to admin endpoints. This
+makes it possible to add admin users for CEEMS dynamically without having to re-configure
+and restart CEEMS API server. This section takes the client configuration of
+Grafana. All possible client configuration options can be consulted in the
+[Config Reference](./config-reference.md#grafana-config).
+
+Finally, the `web` section can be used to configure the HTTP server of CEEMS API server.
+
+- `web.max_query`: Maximum allowable query period. Configure this value appropriately
+based on your needs, as queries spanning too long a period can put considerable
+pressure on the DB.
+- `web.requests_limit`: Maximum number of requests per minute per client identified by
+remote IP address.
+- `web.route_prefix`: All the CEEMS API endpoints will be prefixed by this value. It
+is useful when serving CEEMS API server behind a reverse proxy at a given path, as
+sketched below.
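+
+For instance, a minimal sketch of just the `web` section is shown below. The values are
+illustrative only, and the endpoint path in the comment assumes API version `v1` (so
+endpoints are served under `/api/v1` by default):
+
+```yaml
+ceems_api_server:
+  web:
+    # Queries whose `from`/`to` range exceeds 30 days will be rejected
+    max_query: 30d
+    # Allow at most 30 requests per minute per client IP
+    requests_limit: 30
+    # With this prefix, e.g. the units endpoint is served at
+    # /ceems/api/v1/units instead of /api/v1/units
+    route_prefix: /ceems/
+```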
+
+## Clusters Configuration
+
+A sample clusters configuration section is shown below:
+
```yaml
-# List of users that will have admin privileges for accessing CEEMS API server
-#
-# These users will have full access to DB and can query stats of any user/project.
-#
-# In addition, it is possible to pull users from Grafana teams and add them to
-# admin users. Check `grafana` configuration on how to fetch users from Grafana.
-#
-users:
-  [ - ... ]
-
-# Besides setting a static list of admin users using `ceems_api_server.web.admin_users`,
-# it is possible to pull the users from a given Grafana instance and update the admin users
-# list of CEEMS API server. This allows operators to add new admins to CEEMS API server
-# without having to restart `ceems_api_server`.
-#
-# Typically, one or several Grafana team(s) can be created dedicated to CEEMS admins and
-# CEEMS API server will fetch the Grafana team members at the same frequency as compute
-# units.
-#
-# The web config of Grafana can be set in the following section:
-#
-grafana:
-  [ ]
+clusters:
+  - id: slurm-0
+    manager: slurm
+    updaters:
+      - tsdb-0
+      - tsdb-1
+    cli:
+      path: path/to/slurm/bin
+
+  - id: os-0
+    manager: openstack
+    updaters:
+      - tsdb-0
+    web:
+      http_headers:
+        X-Auth-Token:
+          secrets:
+            - supersecrettoken
+        X-OpenStack-Nova-API-Version:
+          values:
+            - latest
+    extra_config:
+      compute_api_url: https://openstack-nova.example.com/v2.1
+      identity_api_url: https://openstack-keystone.example.com
```
-## ``
-
-A `grafana_config` allows configuring the Grafana client config to fetch members of
-Grafana teams to be added to admin users of CEEMS API server.
+Essentially, it is a list of objects where each object describes a cluster.
+
+- `id`: A unique identifier for each cluster. The identifier must stay consistent across
+CEEMS components, especially for CEEMS LB. More details can be found in the
+[Configuring CEEMS LB](./ceems-lb.md) section.
+- `manager`: Resource manager kind. Currently only **SLURM** and **Openstack** are
+supported.
+- `updaters`: List of updaters to be used to update the aggregate metrics of the
+compute units. The order is important, as compute units are updated in the same order
+as provided here. For example, with the current sample file, operators must
+ensure that the metrics updated by `tsdb-0` are not overridden by `tsdb-1`.
+More details on updaters can be found in [Updaters Configuration](#updaters-configuration).
+- `cli`: If the resource manager uses CLI tools to fetch compute units, configuration related
+to those CLI tools can be provided here. For example, currently CEEMS API server supports
+fetching SLURM jobs only via the `sacct` command and hence, it is essential to provide the
+path to the `bin` folder where the `sacct` command can be found. More options for the `cli`
+section can be found in [Cluster Configuration Reference](./config-reference.md#cluster_config).
+- `web`: If the resource manager supports fetching compute units through an API, the client
+configuration to access the API endpoints can be provided here. In this particular example,
+the Openstack cluster's authentication is configured using the `web.http_headers` section.
+All available options for the `web` configuration can be found in
+[Web Client Configuration Reference](./config-reference.md#web_client_config).
+- `extra_config`: Any extra configuration required by a particular resource manager can be
+provided here.
Currently, the Openstack resource manager uses this section to configure the API
+URLs of the compute and identity services, which are used to fetch compute units, users and projects data.
+
+:::important[IMPORTANT]
+
+Configure the compute and identity API URLs exactly as displayed by the
+service catalog of the Openstack cluster. For instance, CEEMS API server supports
+only identity API version `v3` and it adds `v3` to the URL path when making API requests.
+However, it expects the configured compute API URL to contain the API version `v2.1`,
+as shown in the above config.
+
+:::
+
+## Updaters Configuration
+
+A sample updater config is shown below:
+
```yaml
-# Web URL of the Grafana instance
-#
-url:
-
-# List of IDs of the Grafana teams from which the members will be synchronized
-# with CEEMS admin users
-#
-teams_ids:
-  - ...
-
-# Sets the `Authorization` header on every API request with the
-# configured username and password.
-# password and password_file are mutually exclusive.
-#
-basic_auth:
-  [ username: ]
-  [ password: ]
-  [ password_file: ]
-
-# Sets the `Authorization` header on every API request with
-# the configured credentials.
-#
-authorization:
-  # Sets the authentication type of the request.
-  [ type: | default: Bearer ]
-  # Sets the credentials of the request. It is mutually exclusive with
-  # `credentials_file`.
-  [ credentials: ]
-  # Sets the credentials of the request with the credentials read from the
-  # configured file. It is mutually exclusive with `credentials`.
-  [ credentials_file: ]
-
-# Optional OAuth 2.0 configuration.
-# Cannot be used at the same time as basic_auth or authorization.
-#
-oauth2:
-  [ ]
-
-# Configure whether scrape requests follow HTTP 3xx redirects.
-[ follow_redirects: | default = true ]
-
-# Whether to enable HTTP2.
-[ enable_http2: | default: true ]
-
-# Configures the API request's TLS settings.
-#
-tls_config:
-  [ ]
-
-# List of headers that will be passed in the API requests to the server.
-# Authentication related headers may be configured in this section. Header name
-# must be configured as key and header value supports three different types of
-# headers: values, secrets and files.
-#
-# The difference between values and secrets is that secret will be redacted
-# in server logs where as values will be emitted in the logs.
-#
-# Values are regular headers with values, secrets are headers that pass secret
-# information like tokens and files pass the file content in the headers.
-#
-http_headers:
-  [ ]
+updaters:
+  - id: tsdb-0
+    updater: tsdb
+    web:
+      url: http://localhost:9090
+    extra_config:
+      cutoff_duration: 5m
+      query_batch_size: 1000
+      queries:
+        # Average CPU utilisation
+        avg_cpu_usage:
+          global: |
+            avg_over_time(
+              avg by (uuid) (
+                (
+                  irate(ceems_compute_unit_cpu_user_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+                  +
+                  irate(ceems_compute_unit_cpu_system_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+                )
+                /
+                ceems_compute_unit_cpus{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            ) * 100
```
+
+Similar to `clusters`, `updaters` is also a list of objects where each object
+describes an `updater`. Currently only the **TSDB** updater is supported, which updates
+compute unit metrics from a PromQL-compliant TSDB server like Prometheus or Victoria
+Metrics.
+
+- `id`: A unique identifier for the updater. This identifier must be used in the
+`updaters` section of `clusters`, as shown in the [Clusters Configuration](#clusters-configuration)
+section.
+- `updater`: Name of the updater. Currently only `tsdb` is allowed.
+- `web`: Web client configuration of the updater server.
+- `extra_config`: Allows further configuration of the TSDB updater.
+  - `extra_config.cutoff_duration`: The time series data of compute units that have a
+    total elapsed time less than this period will be purged from the TSDB to decrease
+    cardinality. This is useful to remove time series data of failed compute units
+    or compute units that lasted a very short duration and, in turn, keep the cardinality of
+    the TSDB under check. For this feature to work, Prometheus needs to be started with the
+    `--web.enable-admin-api` CLI flag, which enables the admin API endpoints.
+  - `extra_config.query_batch_size`: In order not to hit the TSDB server's API response
+    limits, the queries used to estimate aggregate metrics of compute units are batched
+    with this size.
+  - `extra_config.queries`: This defines the queries made to the TSDB to estimate
+    the aggregate metrics of each compute unit. The example config shows the query
+    used to estimate the average CPU usage of a compute unit. All the supported queries can
+    be consulted in the [Updaters Configuration Reference](./config-reference.md#updater_config).
+
+## Examples
+
+The following configuration shows a basic config needed to fetch batch jobs from
+the SLURM resource manager:
+
```yaml
-# Identifier of the cluster. Must be unique for each cluster
-#
-# Use an id that end users can identify, for instance, name of the cluster.
-#
-id:
-
-# Resource manager of the cluster. Currently only `slurm` is supported. In future,
-# `openstack` will be supported
-#
-manager:
-
-# List of updater IDs to run on the compute units of current cluster. The updaters
-# will be run in the same order as provided in the list.
-#
-# ID of each updater is set in the `updaters` section of the config. If an unknown
-# ID is provided here, it will be ignored during the update step.
-#
-updaters:
-  [- ... ]
-
-# CLI tool configuration.
-#
-# If the resource manager supports fetching compute units data from a CLI tool,
-# this section can be used to configure the tool. This can be mainly used to configure
-# SLURM CLI utility tools that can be used to fetch job data.
-#
-# When SLURM resource manager is configured to fetch job data using `sacct` command,
-# execution mode of the command will be decided as follows:
-#
-# - If the current user running `ceems_api_server` is `root` or `slurm` user, `sacct`
-# command will be executed natively as that user.
-#
-# - If above check fails, `sacct` command will be attempted to execute as `slurm` user.
-# If the `ceems_api_server` process have enough privileges setup using Linux capabilities
-# in the systemd unit file, this will succeed and `sacct` will be always executed
-# as `slurm` user.
-#
-# - If above check fails as well, we attempt to execute `sacct` with `sudo` prefix. If
-# the current user running `ceems_api_server` is in the list of sudoers, this check
-# will pass and `sacct` will be always executed as `sudo sacct ` to fetch jobs.
-#
-# If none of the above checks, pass, `sacct` will be executed as the current user
-# which might not give job data of _all_ users in the cluster.
-#
-# If the operators are unsure which method to use, there is a default systemd
-# unit file provided in the repo that uses Linux capabilities.
Use that file as -# starting point and modify the CLI args accordingly -# -# If no `cli` and no `web` config is found, `ceems_api_server` will check -# if CLI utilities like `sacct` exist on `PATH` and if found, will use them. -# -# Systemd Unit File: -# https://github.com/mahendrapaipuri/ceems/blob/main/build/package/ceems_api_server/ceems_api_server.service -# -cli: - # Path to the binaries of the CLI utilities. - # - [ path: ] - - # An object of environment variables that will be injected while executing the - # CLI utilities to fetch compute unit data. - # - # This is handy when executing CLI tools like `keystone` for openstack or `kubectl` - # for k8s needs to source admin credentials. Those credentials can be set manually - # here in this section. - # - environment_variables: - [ : ... ] - -# If the resource manager supports API server, configure the REST API -# server details here. -# -# When configured, REST API server is always preferred over CLI utilities for -# fetching compute units -# -# Most of the web configuration has been inspired from Prometheus `scrape_config` -# and its utility functions are used to create HTTP client using the configuration -# set below. -# -web: - # Web client config of resource manager's cluster - # - [ ] - -# Any other configuration needed to reach API server of the resource manager -# can be configured in this section. -# -# Currently this section is used for both SLURM and Openstack resource managers -# to configure API versions -# -# For example, for SLURM if your API endpoints are of form `/slurm/v0.0.40/diag`, -# the version is `v0.0.40`. -# Docs: https://slurm.schedmd.com/rest_api.html -# SLURM's REST API version can be set as `slurm: v0.0.40` -# -# In the case of Openstack, we need to fetch from different sources like identity, -# compute and they use different versioning of API. They can be configured using -# this section as well -# -# Example: -# -# slurm: v0.0.40 # SLURM -# identity: v3 # Openstack -# compute: v2.1 # Openstack -# -extra_config: - [ : ... ] -``` - -## `` +ceems_api_server: + data: + path: /var/lib/ceems + update_interval: 15m -A `updater_config` allows configuring updaters of CEEMS API server. + admin: + users: + - adm1 + + web: + requests_limit: 30 -```yaml -# Identifier of the updater. Must be unique for each updater -# -# This identifier should be used in the `updaters` section inside each -# `clusters` config to update the compute units of that resource manager with a -# given updater. -# -id: - -# Updater kind. Currently only `tsdb` is supported. -# -updater: - -# Web Config of the updater. -# -web: - # Web client config of updater instance - # - [ ] - -# Any other configuration needed for the updater instance can be configured -# in this section. -# Currently this section is used for `tsdb` updater to configure the queries that -# will be used to aggregate the compute unit metrics. -# -extra_config: - # Query batch size when making TSDB queries. - # CEEMS making queries in batches in order to avoid OOM errors on TSDB. - # This parameter can be used to configure the number of compute units queried - # in a single query. - # - # Set this value based on your `--query.max-samples` parameter set to TSDB and - # scrape interval. For instance, at a given time, if there are 80k compute units - # running, TSDB is scrapping at a rate of 5sec and CEEMS is updating for every - # 60 min. In this case, a given metric for 20k compute units will have - # 80,000 * (60 * 60) / 5 = 57600000 samples in the query. 
The default value - # used by Prometheus for `--query.max-samples` is 50000000 which is less than - # what we got in the calculation in the example. Thus, we need to make multiple - # queries by batching the compute units. In the current example, using a batch - # size of 40k should work, however, we recommend using much lesser batch sizes - # to protect TSDB from over consuming the memory. - # - # Default value is 1000 and it should work in most of the cases - # - [ query_batch_size: | default: 1000 ] - - # Compute units that have total life time less than this value will be deleted from - # TSDB to reduce number of labels and cardinality - # - # Default value `0s` means no compute units will be purged. - # - # Units Supported: y, w, d, h, m, s, ms. - # - [ cutoff_duration: | default: 0s ] - - # List of labels to delete from TSDB. These labels should be valid matchers for TSDB - # More information of delete API of Prometheus https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series - # - # TSDB must be started with --web.enable-admin-api flag for this to work - # - labels_to_drop: - [ - ... ] - - # Define queries that are used to estimate aggregate metrics of each compute unit - # These queries will be passed to golang's text/template package to build them - # Available template variables - # - UUIDs -> UUIDs string delimited by "|", eg, 123|345|567 - # - ScrapeInterval -> Scrape interval of TSDB in time.Duration format eg 15s, 1m - # - ScrapeIntervalMilli -> Scrape interval of TSDB in milli seconds eg 15000, 60000 - # - EvaluationInterval -> Evaluation interval of TSDB in time.Duration format eg 15s, 1m - # - EvaluationIntervalMilli -> Evaluation interval of TSDB in milli seconds eg 15s, 1m - # - RateInterval -> Rate interval in time.Duration format. It is estimated based on Scrape interval as 4*scrape_interval - # - Range -> Duration of interval where aggregation is being made in time.Duration format - # - queries: - [ ] +clusters: + - id: slurm-0 + manager: slurm + cli: + path: /usr/bin ``` -## `` - -A `queries_config` allows configuring PromQL queries for TSDB updater of CEEMS API server. +Both SLURM and openstack clusters can be monitored using a single instance of +CEEMS API server using a config as below: ```yaml -# -# It is possible to define multiple "sub-metrics" for each parent metric. -# For instance, for the case of `total_cpu_energy_usage_kwh`, we wish to store -# energy usage from different sources like RAPL, IPMI, we can do so using following -# config: -# -# total_cpu_energy_usage_kwh: -# rapl_total: -# ipmi_total: -# -# With the above configuration, the server response from API server will contain -# energy usage from both RAPL and IPMI using the same keys as we used in the -# sub query. For instance, an example response can be: -# -# `{"total_cpu_energy_usage_kwh": {"rapl_total": 100, "ipmi_total": 120}}` -# -# This approach will let the operators to define the metrics freely according to -# their deployments. This will also allow to fetch metrics from third party -# DBs outside of CEEMS components without hassle. 
-# -# The placeholder queries shown below should work out-of-the-box with CEEMS -# exporter and operators are free to deploy more exporters of their own and use -# the metrics from them to estimate aggregated metrics of each compute unit -# -# Average CPU utilisation -# -# Example of valid query: -# -# global: -# avg_over_time( -# avg by (uuid) ( -# ( -# rate(ceems_compute_unit_cpu_user_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}]) -# + -# rate(ceems_compute_unit_cpu_system_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}]) -# ) -# / -# ceems_compute_unit_cpus{uuid=~"{{.UUIDs}}"} -# )[{{.Range}}:] -# ) * 100 -avg_cpu_usage: - [ : ... ] - - -# Average CPU Memory utilisation -# -# Example of valid query: -# -# global: -# avg_over_time( -# avg by (uuid) ( -# ceems_compute_unit_memory_used_bytes{uuid=~"{{.UUIDs}}"} -# / -# ceems_compute_unit_memory_total_bytes{uuid=~"{{.UUIDs}}"} -# )[{{.Range}}:] -# ) * 100 -avg_cpu_mem_usage: - [ : ... ] - +ceems_api_server: + data: + path: /var/lib/ceems + update_interval: 15m -# Total CPU energy usage in kWh -# -# Example of valid query: -# -# total: -# sum_over_time( -# sum by (uuid) ( -# unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9 -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -total_cpu_energy_usage_kwh: - [ : ... ] + admin: + users: + - adm1 + web: + requests_limit: 30 -# Total CPU emissions in gms -# -# Example of valid query: -# -# rte_total: -# sum_over_time( -# sum by (uuid) ( -# label_replace( -# unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9, -# "common_label", -# "mock", -# "hostname", -# "(.*)" -# ) -# * on (common_label) group_left () -# label_replace( -# ceems_emissions_gCo2_kWh{provider="rte",country_code="fr"}, -# "common_label", -# "mock", -# "hostname", -# "(.*)" -# ) -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -# emaps_total: -# sum_over_time( -# sum by (uuid) ( -# label_replace( -# unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9, -# "common_label", -# "mock", -# "hostname", -# "(.*)" -# ) -# * on (common_label) group_left () -# label_replace( -# ceems_emissions_gCo2_kWh{provider="emaps",country_code="fr"}, -# "common_label", -# "mock", -# "hostname", -# "(.*)" -# ) -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -total_cpu_emissions_gms: - [ : ... ] - +clusters: + - id: slurm-0 + manager: slurm + cli: + path: /usr/bin + + - id: os-0 + manager: openstack + web: + http_headers: + X-Auth-Token: + secrets: + - supersecrettoken + X-OpenStack-Nova-API-Version: + values: + - latest + extra_config: + compute_api_url: https://openstack-nova.example.com/v2.1 + identity_api_url: https://openstack-keystone.example.com +``` -# Average GPU utilization -# -# Example of valid query: -# -# global: -# avg_over_time( -# avg by (uuid) ( -# DCGM_FI_DEV_GPU_UTIL -# * on (gpuuuid) group_right () -# ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"} -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -avg_gpu_usage: - [ : ... 
] - +Assuming CEEMS exporter is deployed on the compute nodes of both SLURM +and Openstack clusters and metrics are scrapped by a Prometheus running +at `https://prometheus.example.com`, we can add updater config to +the above examples as follows: -# Average GPU memory utilization -# -# Example of valid query: -# -# global: -# avg_over_time( -# avg by (uuid) ( -# DCGM_FI_DEV_MEM_COPY_UTIL -# * on (gpuuuid) group_right () -# ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"} -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -avg_gpu_mem_usage: - [ : ... ] - +```yaml +ceems_api_server: + data: + path: /var/lib/ceems + update_interval: 15m -# Total GPU energy usage in kWh -# -# Example of valid query: -# -# total: -# sum_over_time( -# sum by (uuid) ( -# instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e9 -# * on (gpuuuid) group_right() -# ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"} -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -total_gpu_energy_usage_kwh: - [ : ... ] + admin: + users: + - adm1 + web: + requests_limit: 30 -# Total GPU emissions in gms -# -# Example of valid query: -# -# rte_total: -# sum_over_time( -# sum by (uuid) ( -# label_replace( -# instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09 -# * on (gpuuuid) group_right () -# ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}, -# "common_label", -# "mock", -# "instance", -# "(.*)" -# ) -# * on (common_label) group_left () -# label_replace( -# ceems_emissions_gCo2_kWh{provider="rte",country_code="fr"}, -# "common_label", -# "mock", -# "instance", -# "(.*)" -# ) -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -# emaps_total: -# sum_over_time( -# sum by (uuid) ( -# label_replace( -# instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09 -# * on (gpuuuid) group_right () -# ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}, -# "common_label", -# "mock", -# "instance", -# "(.*)" -# ) -# * on (common_label) group_left () -# label_replace( -# ceems_emissions_gCo2_kWh{provider="emaps",country_code="fr"}, -# "common_label", -# "mock", -# "instance", -# "(.*)" -# ) -# )[{{.Range}}:{{.ScrapeInterval}}] -# ) -total_gpu_emissions_gms: - [ : ... ] - +clusters: + - id: slurm-0 + manager: slurm + updaters: + - tsdb-0 + cli: + path: /usr/bin + + - id: os-0 + manager: openstack + updaters: + - tsdb-0 + web: + http_headers: + X-Auth-Token: + secrets: + - supersecrettoken + X-OpenStack-Nova-API-Version: + values: + - latest + extra_config: + compute_api_url: https://openstack-nova.example.com/v2.1 + identity_api_url: https://openstack-keystone.example.com -# Total IO write in GB stats -# -# Currently CEEMS exporter do not scrape this metric. Operators can configure -# this metric from third party exporters, if and when available -# -total_io_write_stats: - [ : ... ] - -# Total IO read in GB stats -# -# Currently CEEMS exporter do not scrape this metric. Operators can configure -# this metric from third party exporters, if and when available -# -total_io_read_stats: - [ : ... ] - -# Total ingress traffic stats -# -# Currently CEEMS exporter do not scrape this metric. Operators can configure -# this metric from third party exporters, if and when available -# -total_ingress_stats: - [ : ... ] - -# Total outgress traffic stats -# -# Currently CEEMS exporter do not scrape this metric. Operators can configure -# this metric from third party exporters, if and when available -# -total_outgress_stats: - [ : ... 
]
+updaters:
+  - id: tsdb-0
+    updater: tsdb
+    web:
+      url: http://tsdb-0
+    extra_config:
+      cutoff_duration: 5m
+      query_batch_size: 1000
+      queries:
+        # Average CPU utilisation
+        avg_cpu_usage:
+          global: |
+            avg_over_time(
+              avg by (uuid) (
+                (
+                  irate(ceems_compute_unit_cpu_user_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+                  +
+                  irate(ceems_compute_unit_cpu_system_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+                )
+                /
+                ceems_compute_unit_cpus{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            ) * 100
+
+        # Average CPU memory utilisation
+        avg_cpu_mem_usage:
+          global: |
+            avg_over_time(
+              avg by (uuid) (
+                ceems_compute_unit_memory_used_bytes{uuid=~"{{.UUIDs}}"}
+                /
+                ceems_compute_unit_memory_total_bytes{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            ) * 100
+
+        # Total CPU energy usage in kWh
+        total_cpu_energy_usage_kwh:
+          total: |
+            sum_over_time(
+              sum by (uuid) (
+                unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+        # Total CPU emissions in gms
+        total_cpu_emissions_gms:
+          rte_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="rte",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+          emaps_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="emaps",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+          owid_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9,
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="owid",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "hostname",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+        # Average GPU utilization
+        avg_gpu_usage:
+          global: |
+            avg_over_time(
+              avg by (uuid) (
+                DCGM_FI_DEV_GPU_UTIL
+                * on (gpuuuid) group_right ()
+                ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+        # Average GPU memory utilization
+        avg_gpu_mem_usage:
+          global: |
+            avg_over_time(
+              avg by (uuid) (
+                DCGM_FI_DEV_MEM_COPY_UTIL
+                * on (gpuuuid) group_right ()
+                ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+        # Total GPU energy usage in kWh
+        total_gpu_energy_usage_kwh:
+          total: |
+            sum_over_time(
+              sum by (uuid) (
+                instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e9
+                * on (gpuuuid) group_right()
+                ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+        # Total GPU emissions in gms
+        total_gpu_emissions_gms:
+          rte_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09
+                  * on (gpuuuid) group_right ()
+                  ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="rte",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+          emaps_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09
+                  * on (gpuuuid) group_right ()
+                  ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="emaps",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+
+          owid_total: |
+            sum_over_time(
+              sum by (uuid) (
+                label_replace(
+                  instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09
+                  * on (gpuuuid) group_right ()
+                  ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+                * on (common_label) group_left ()
+                label_replace(
+                  ceems_emissions_gCo2_kWh{provider="owid",country_code="FR"},
+                  "common_label",
+                  "mock",
+                  "instance",
+                  "(.*)"
+                )
+              )[{{.Range}}:{{.ScrapeInterval}}]
+            )
+```
+
+The above configuration assumes that the GPU compute nodes possess NVIDIA GPUs and
+[dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) is running alongside the CEEMS
+exporter to export the metrics of the GPUs.
diff --git a/website/docs/configuration/ceems-lb.md b/website/docs/configuration/ceems-lb.md
index bc85a1c0..f4e0180e 100644
--- a/website/docs/configuration/ceems-lb.md
+++ b/website/docs/configuration/ceems-lb.md
@@ -4,125 +4,246 @@ sidebar_position: 4
 
 # CEEMS Load Balancer
 
-The following shows the reference for CEEMS load balancer config. A valid sample
-configuration file can be found in the
+CEEMS Load Balancer configuration has one main section and two optional
+sections. A basic skeleton of the configuration is as follows:
+
+```yaml
+# CEEMS Load Balancer configuration skeleton
+
+ceems_lb:
+
+# Optional section
+ceems_api_server:
+
+# Optional section
+clusters:
+```
+
+CEEMS LB uses the same `ceems_api_server` and
+`clusters` configuration sections and hence, it is **possible to merge the config
+files** of the CEEMS API server and CEEMS LB. Each component will read the
+necessary config from the same file.
+
+A valid sample
+configuration file can be found in the
 [repo](https://github.com/mahendrapaipuri/ceems/blob/main/build/config/ceems_lb/ceems_lb.yml).
 
+## CEEMS Load Balancer Configuration
+
+A sample CEEMS LB config file is shown below:
+
 ```yaml
-# Configuration file to configure CEEMS Load Balancer
-#
-# This config file has following sections:
-# - `ceems_lb`: Core configuration of CEEMS LB
-# - `ceems_api_server`: Client configuration of CEEMS API server
-# - `clusters`: This is optional config which can be used to validate backends IDs
-#
----
+
 ceems_lb:
-  # Load balancing strategy. Three possibilites
-  #
-  # - round-robin
-  # - least-connection
-  # - resource-based
-  #
-  # Round robin and least connection are classic strategies.
-  # Resource based works based on the query range in the TSDB query. The
-  # query will be proxied to the backend that covers the query_range
-  #
-  [ strategy: | default = round-robin ]
-
-  # List of backends for each cluster
-  #
+  strategy: resource-based
   backends:
-    [ - ]
-
-
-# CEEMS API server config.
-# This config is essential to enable access control on the TSDB. By excluding
-# this config, no access control is imposed on the TSDB and a basic load balancing
-# based on the chosen strategy will be made.
-#
-# Essentially, basic access control is implemented by checking the ownership of the
-# queried unit. Users that belong to the same project can query the units belong
-# to that project.
-#
-# For example, if there is a unit U that belongs to User A and
-# Project P. Any user that belongs to same project P can query for the metrics of unit U
-# but not users from other projects.
-#
+    - id: slurm-0
+      tsdb_urls:
+        - http://localhost:9090
+
+    - id: slurm-1
+      tsdb_urls:
+        - http://localhost:9090
+```
+
+- `strategy`: Load balancing strategy. Besides the classical `round-robin` and
+`least-connection` strategies, a custom `resource-based` strategy is supported.
+In the `resource-based` strategy, the query will be proxied to the TSDB instance
+that has the data for the time period in the query.
+- `backends`: A list of objects describing each TSDB backend.
+  - `backends.id`: It is **important**
+  that the `id` of the backend is the same `id` used in the
+  [Clusters Configuration](./ceems-api-server.md#clusters-configuration). This
+  is how CEEMS LB will know which cluster to target.
+  - `backends.tsdb_urls`: A list of TSDB servers that scrape metrics from the
+  cluster identified by `id`.
+
+:::warning[WARNING]
+
+CEEMS LB is meant to be deployed in the same DMZ as the TSDB servers and hence, it
+does not support TLS for the backends.
+
+:::
+
+### Matching `backends.id` with `clusters.id`
+
+This is the tricky part of the configuration, which can be better explained with
+an example. Consider that we are running the CEEMS API server with the following
+configuration:
+
+```yaml
 ceems_api_server:
-  # The DB contains the information of user and projet units and LB will verify
-  # if user/project is the owner of the uuid under request to decide whether to
-  # proxy request to backend or not.
-  #
-  # To identify the current user, X-Grafana-User header will be used that Grafana
-  # is capable of sending to the datasource. Grafana essenatially adds this header
-  # on the backend server and hence it is not possible for the users to spoof this
-  # header from the browser.
-  # In order to enable this feature, it is essential to set `send_user_header = true`
-  # in Grafana config file.
-  #
-  # If both CEEMS API and CEEMS LB is running on the same host, it is preferable to
-  # use the DB directly using `data.path` as DB query is way faster than a API request
-  # If both apps are deployed on the same host, ensure that the user running `ceems_lb`
-  # has permissions to open CEEMS API data files
-  #
   data:
-    [ ]
-
-  # In the case where CEEMS API and ceems LB are deployed on different hosts, we can
-  # still perform access control using CEEMS API server by making a API request to
-  # check the ownership of the queried unit. This method should be only preferred when
-  # DB cannot be access directly as API request has additional latency than querying DB
-  # directly.
-  #
-  # If both `data.path` and `web.url` are provided, DB will be preferred as it has lower
-  # latencies.
-  #
+    path: /var/lib/ceems
+    update_interval: 15m
+
+clusters:
+  - id: slurm-0
+    manager: slurm
+    updaters:
+      - tsdb-0
+    cli:
+
+
+  - id: slurm-1
+    manager: slurm
+    updaters:
+      - tsdb-1
+    cli:
+
+
+updaters:
+  - id: tsdb-0
+    updater: tsdb
+    web:
+      url: http://tsdb-0
+    extra_config:
+
+
+  - id: tsdb-1
+    updater: tsdb
+    web:
+      url: http://tsdb-1
+    extra_config:
+
+```
+
+Here we are monitoring two SLURM clusters: `slurm-0` and `slurm-1`.
+There are two different TSDB servers, `tsdb-0`
+and `tsdb-1`, where `tsdb-0` is scraping metrics from `slurm-0`
+and `tsdb-1` is scraping metrics from only `slurm-1`. Assuming
+`tsdb-0` is replicating data onto `tsdb-0-replica` and `tsdb-1`
+onto `tsdb-1-replica`, we need to use the following config for
+`ceems_lb`:
+
+```yaml
+ceems_lb:
+  strategy: resource-based
+  backends:
+    - id: slurm-0
+      tsdb_urls:
+        - http://tsdb-0
+        - http://tsdb-0-replica
+
+    - id: slurm-1
+      tsdb_urls:
+        - http://tsdb-1
+        - http://tsdb-1-replica
+
+```
+
+As the metrics data of `slurm-0` only exists in either `tsdb-0` or
+`tsdb-0-replica`, we need to set `backends.id` to `slurm-0` for
+these TSDB backends.
+
+Effectively, we will use CEEMS LB as a Prometheus datasource in
+Grafana and, while doing so, we need to target the correct cluster
+using a path parameter. For instance, for the `slurm-0` cluster the
+datasource URL must be configured as `http://ceems-lb:9030/slurm-0`,
+assuming `ceems_lb` is running on port `9030`. Now CEEMS LB
+knows which cluster to target (in this case `slurm-0`), strips
+the path parameter `slurm-0` from the path and proxies the request
+to one of the configured backends. This allows a single instance
+of CEEMS LB to load balance across different clusters.
+
+## CEEMS API Server Configuration
+
+This is an optional config which, when provided, will enforce access
+control for the backend TSDBs. A sample config file is given
+below:
+
+```yaml
+ceems_api_server:
   web:
-    [ ]
+    url: http://localhost:9020
 ```
 
-## `<backend_config>`
-
+- `web.url`: Address at which the CEEMS API server is running. CEEMS LB
+will make a request to the CEEMS API server to verify the ownership of
+the compute unit before proxying the request to the TSDB. All the possible
+configuration parameters for `web` can be found in the
+[Web Client Configuration Reference](./config-reference.md#web_client_config).
+
+If both the CEEMS API server and CEEMS LB have access to the CEEMS data path,
+it is possible to use `ceems_api_server.data.path` as well to
+query the DB directly instead of making an API request. This will have
+much lower latency and higher performance.
+
+## Clusters Configuration
+
+The same configuration as discussed in
+[CEEMS API Server's Cluster Configuration](./ceems-api-server.md#clusters-configuration)
+can be provided as an optional configuration to verify the `backends` configuration.
+This is not mandatory and, if not provided, CEEMS LB will verify the backend
+`ids` by making an API request to the CEEMS API server.
 
-A `backend_config` allows configuring backend TSDB servers for load balancer.
+## Example configuration files
+
+As it is clear from the above sections, there is a lot of common configuration
+between the CEEMS API server and CEEMS LB. Thus, when it is possible, it is
+advised to merge the two configurations into one file.
+
+Taking one of the [examples](./ceems-api-server.md#examples) in the CEEMS API
+server section, we can add the CEEMS LB config as follows:
+
 ```yaml
-# Identifier of the cluster
-#
-# This ID must match with the ones defined in `clusters` config. CEEMS API server
-# will tag each compute unit from that cluster with this ID and when verifying
-# for compute unit ownership, CEEMS LB will use the ID to query for the compute
-# units of that cluster.
-#
-# This identifier needs to be in the path parameter for requests to CEEMS LB
-# to target correct cluster. For instance there are two different clusters,
-# say `cluster-0` and `cluster-1`, that have different TSDBs configured. Using CEEMS
-# LB we can load balance the traffic for these two clusters using a single CEEMS LB
-# deployement. However, we need to tell CEEMS LB which cluster to target for the
-# incoming traffic. This is done via path parameter.
-#
-# If CEEMS LB is running at http://localhost:9030, then the `cluster-0` is reachable at
-# `http://localhost:9030/cluster-0` and `cluster-1` at `http://localhost:9030/cluster-1`.
-# Internally, CEEMS will strip the first part in the URL path, use it to identify
-# cluster and proxy the rest of URL path to underlying TSDB backend.
-# Thus, all the requests to `http://localhost:9030/cluster-0` will be load
-# balanced across TSDB backends of `cluster-0`.
-#
-id:
-
-# List of TSDBs for this cluster. Load balancing between these TSDBs will be
-# made based on the strategy chosen.
-#
-# TLS is not supported for backends. CEEMS LB supports TLS and TLS terminates
-# at the LB and requests are proxied to backends on HTTP.
-#
-# LB and backend servers are meant to be in the same DMZ so that we do not need
-# to encrypt communications. Backends however support basic auth and they can
-# be configured in URL with usual syntax.
-#
-# An example of configuring the basic auth username and password with backend
-# - http://alice:password@localhost:9090
-#
-tsdb_urls:
-  [ - ]
+ceems_api_server:
+  data:
+    path: /var/lib/ceems
+    update_interval: 15m
+
+  admin:
+    users:
+      - adm1
+
+  web:
+    url: http://localhost:9020
+    requests_limit: 30
+
+clusters:
+  - id: slurm-0
+    manager: slurm
+    updaters:
+      - tsdb-0
+    cli:
+
+
+  - id: os-0
+    manager: openstack
+    updaters:
+      - tsdb-1
+    web:
+
+
+updaters:
+  - id: tsdb-0
+    updater: tsdb
+    web:
+      url: http://tsdb-0
+    extra_config:
+
+
+  - id: tsdb-1
+    updater: tsdb
+    web:
+      url: http://tsdb-1
+    extra_config:
+
+
+ceems_lb:
+  strategy: resource-based
+  backends:
+    - id: slurm-0
+      tsdb_urls:
+        - http://tsdb-0
+        - http://tsdb-0-replica
+
+    - id: os-0
+      tsdb_urls:
+        - http://tsdb-1
+        - http://tsdb-1-replica
 ```
+
+This config assumes `tsdb-0` is replicating data to `tsdb-0-replica`,
+`tsdb-1` to `tsdb-1-replica` and the CEEMS API server is running on
+port `9020` on the same host as CEEMS LB.
diff --git a/website/docs/configuration/config-reference.md b/website/docs/configuration/config-reference.md
index 78c162ce..8b2147f4 100644
--- a/website/docs/configuration/config-reference.md
+++ b/website/docs/configuration/config-reference.md
@@ -37,6 +37,823 @@ character in the source label should be converted to an underscore
 
 The other placeholders are specified separately.
 
+## `<ceems_api_server>`
+
+The following shows the reference for CEEMS API server config.
+A valid sample configuration file can be found in the
+[repo](https://github.com/mahendrapaipuri/ceems/blob/main/build/config/ceems_api_server/ceems_api_server.yml).
+
+```yaml
+# Configuration file to configure CEEMS API server
+#
+# This config file has the following sections:
+# - `ceems_api_server`: Core configuration of CEEMS API server
+# - `clusters`: Configuration of clusters that are being monitored
+# - `updaters`: Configuration of updaters
+#
+---
+# CEEMS API Server and data config
+ceems_api_server:
+  # Data related configuration of the CEEMS API server. This config concerns the
+  # locations where data will be saved, frequency of data updates, etc.
+  #
+  data:
+    [ <data_config> ]
+
+  # HTTP web admin related config for CEEMS API server
+  #
+  admin:
+    [ <admin_config> ]
+
+  # HTTP web related config for CEEMS API server.
+  #
+  web:
+    # Maximum allowable query range, i.e., the difference between `from` and `to` query
+    # parameters.
+    #
+    # This can be used to restrict the query range made by the users to avoid OOM errors
+    # when handling too much data.
+    #
+    # Default value `0s` means no restrictions are imposed on the query.
+    #
+    # Units Supported: y, w, d, h, m, s, ms.
+    #
+    [ max_query: <duration> | default: 0s ]
+
+    # Number of requests allowed in ONE MINUTE per client identified by Real IP address.
+    # Request headers `True-Client-IP`, `X-Real-IP` and `X-Forwarded-For` are looked up
+    # to get the real IP address.
+    #
+    # This is to effectively impose a rate limit for the entire CEEMS server irrespective
+    # of URL path. We advise to set it to a value based on your needs to avoid DoS/DDoS
+    # attacks.
+    #
+    # Rate limiting is done using the Sliding Window Counter pattern inspired by
+    # CloudFlare https://blog.cloudflare.com/counting-things-a-lot-of-different-things/
+    #
+    # Default value `0` means no rate limiting is applied.
+    #
+    [ requests_limit: <int> | default: 0 ]
+
+    # It will be used to prefix all HTTP endpoints served by CEEMS API server.
+    # For example, if CEEMS API server is served via a reverse proxy.
+    #
+    # Default is '/'
+    #
+    [ route_prefix: <path> | default: / ]
+
+# A list of clusters from which CEEMS API server will fetch the compute units.
+#
+# Each cluster must provide a unique `id`. The `id` will enable CEEMS to identify
+# different clusters in a multi-cluster setup. This `id` must be consistent throughout
+# all the CEEMS components.
+#
+clusters:
+  [ - <cluster_config> ... ]
+
+
+# A list of updaters that will be used to update the compute unit metrics. This update
+# step can be used to update the aggregate metrics of each compute unit in real time
+# or to add complementary information to the compute units from on-premise third
+# party services.
+#
+# Currently only the TSDB updater is supported. The compute unit aggregate metrics can be
+# updated from TSDB (Prometheus/VM) instances.
+#
+updaters:
+  [ - <updater_config> ... ]
+
+```
+
+### `<data_config>`
+
+A `data_config` allows configuring the DB settings of CEEMS API server.
+
+```yaml
+# Path at which CEEMS API server data will be stored.
+# If a relative path is used, it will be resolved based on the current working directory.
+#
+[ path: <path> | default = data ]
+
+# Units data will be fetched at this interval. CEEMS will pull the units from the
+# underlying resource manager at this frequency into its own DB.
+#
+# Units Supported: y, w, d, h, m, s, ms.
+#
+[ update_interval: <duration> | default = 15m ]
+
+# The duration to retain the data in the DB. Units older than this duration will be
+# purged from the DB.
+#
+# In the case of global usage stats, if the last activity on a given project/user
+# combination is older than this period, those stats will be purged from the DB.
+#
+# Units Supported: y, w, d, h, m, s, ms.
+#
+[ retention_period: <duration> | default = 30d ]
+
+# CEEMS API server is capable of creating DB backups using the SQLite backup API. Created
+# DB backups will be saved to this path. NOTE that for huge DBs, this backup can take
+# a considerable amount of time.
+#
+# Use a different disk device than `ceems_api_server.data.path` to achieve
+# fault tolerance.
+#
+# If the path is empty, no backups will be created.
+#
+[ backup_path: <path> ]
+
+# The interval at which DB backups will be created.
+#
+# Minimum allowable interval is `1d`, i.e., 1 day.
+#
+# Units Supported: y, w, d, h, m, s, ms.
+#
+[ backup_interval: <duration> | default = 1d ]
+
+```
+
+### `<admin_config>`
+
+An `admin_config` allows configuring the admin users of CEEMS API server.
+
+```yaml
+# List of users that will have admin privileges for accessing CEEMS API server
+#
+# These users will have full access to DB and can query stats of any user/project.
+#
+# In addition, it is possible to pull users from Grafana teams and add them to
+# admin users. Check `grafana` configuration on how to fetch users from Grafana.
+#
+users:
+  [ - <string> ... ]
+
+# Besides setting a static list of admin users using `ceems_api_server.web.admin_users`,
+# it is possible to pull the users from a given Grafana instance and update the admin users
+# list of CEEMS API server. This allows operators to add new admins to CEEMS API server
+# without having to restart `ceems_api_server`.
+#
+# Typically, one or several Grafana team(s) can be created dedicated to CEEMS admins and
+# CEEMS API server will fetch the Grafana team members at the same frequency as compute
+# units.
+#
+# The web config of Grafana can be set in the following section:
+#
+grafana:
+  [ <grafana_config> ]
+```
+
+### `<grafana_config>`
+
+A `grafana_config` allows configuring the Grafana client config to fetch members of
+Grafana teams to be added to the admin users of CEEMS API server.
+
+```yaml
+# Web URL of the Grafana instance
+#
+url: <host>
+
+# List of IDs of the Grafana teams from which the members will be synchronized
+# with CEEMS admin users
+#
+teams_ids:
+  - <string> ...
+
+# Sets the `Authorization` header on every API request with the
+# configured username and password.
+# password and password_file are mutually exclusive.
+#
+basic_auth:
+  [ username: <string> ]
+  [ password: <secret> ]
+  [ password_file: <string> ]
+
+# Sets the `Authorization` header on every API request with
+# the configured credentials.
+#
+authorization:
+  # Sets the authentication type of the request.
+  [ type: <string> | default: Bearer ]
+  # Sets the credentials of the request. It is mutually exclusive with
+  # `credentials_file`.
+  [ credentials: <secret> ]
+  # Sets the credentials of the request with the credentials read from the
+  # configured file. It is mutually exclusive with `credentials`.
+  [ credentials_file: <filename> ]
+
+# Optional OAuth 2.0 configuration.
+# Cannot be used at the same time as basic_auth or authorization.
+#
+oauth2:
+  [ <oauth2> ]
+
+# Configure whether scrape requests follow HTTP 3xx redirects.
+[ follow_redirects: <boolean> | default = true ]
+
+# Whether to enable HTTP2.
+[ enable_http2: <boolean> | default: true ]
+
+# Configures the API request's TLS settings.
+#
+tls_config:
+  [ <tls_config> ]
+
+# List of headers that will be passed in the API requests to the server.
+# Authentication related headers may be configured in this section. Header name
+# must be configured as key and header value supports three different types of
+# headers: values, secrets and files.
+#
+# The difference between values and secrets is that secrets will be redacted
+# in server logs whereas values will be emitted in the logs.
+#
+# Values are regular headers with values, secrets are headers that pass secret
+# information like tokens and files pass the file content in the headers.
+#
+http_headers:
+  [ <http_headers> ]
+```
+
+## `<cluster_config>`
+
+A `cluster_config` allows configuring a cluster of CEEMS API server.
+
+```yaml
+# Identifier of the cluster. Must be unique for each cluster
+#
+# Use an id that end users can identify, for instance, the name of the cluster.
+#
+id: <string>
+
+# Resource manager of the cluster. Currently `slurm` and `openstack` are
+# supported.
+#
+manager: <string>
+
+# List of updater IDs to run on the compute units of the current cluster. The updaters
+# will be run in the same order as provided in the list.
+#
+# The ID of each updater is set in the `updaters` section of the config. If an unknown
+# ID is provided here, it will be ignored during the update step.
+#
+updaters:
+  [ - <string> ... ]
+
+# CLI tool configuration.
+#
+# If the resource manager supports fetching compute units data from a CLI tool,
+# this section can be used to configure the tool. This can be mainly used to configure
+# SLURM CLI utility tools that can be used to fetch job data.
+#
+# When the SLURM resource manager is configured to fetch job data using the `sacct` command,
+# the execution mode of the command will be decided as follows:
+#
+# - If the current user running `ceems_api_server` is the `root` or `slurm` user, the `sacct`
+#   command will be executed natively as that user.
+#
+# - If the above check fails, an attempt will be made to execute the `sacct` command as the `slurm` user.
+#   If the `ceems_api_server` process has enough privileges set up using Linux capabilities
+#   in the systemd unit file, this will succeed and `sacct` will always be executed
+#   as the `slurm` user.
+#
+# - If the above check fails as well, we attempt to execute `sacct` with a `sudo` prefix. If
+#   the current user running `ceems_api_server` is in the list of sudoers, this check
+#   will pass and `sacct` will always be executed as `sudo sacct` to fetch jobs.
+#
+# If none of the above checks pass, `sacct` will be executed as the current user,
+# which might not give job data of _all_ users in the cluster.
+#
+# If the operators are unsure which method to use, there is a default systemd
+# unit file provided in the repo that uses Linux capabilities. Use that file as a
+# starting point and modify the CLI args accordingly.
+#
+# If no `cli` and no `web` config is found, `ceems_api_server` will check
+# if CLI utilities like `sacct` exist on `PATH` and, if found, will use them.
+#
+# Systemd Unit File:
+# https://github.com/mahendrapaipuri/ceems/blob/main/build/package/ceems_api_server/ceems_api_server.service
+#
+cli:
+  # Path to the binaries of the CLI utilities.
+  #
+  [ path: <path> ]
+
+  # An object of environment variables that will be injected while executing the
+  # CLI utilities to fetch compute unit data.
+  #
+  # This is handy when executing CLI tools like `keystone` for Openstack or `kubectl`
+  # for k8s that need to source admin credentials. Those credentials can be set manually
+  # in this section.
+  #
+  environment_variables:
+    [ <string>: <string> ... ]
+
+# If the resource manager supports an API server, configure the REST API
+# server details here.
+#
+# When configured, the REST API server is always preferred over CLI utilities for
+# fetching compute units.
+#
+# Most of the web configuration has been inspired by Prometheus `scrape_config`
+# and its utility functions are used to create the HTTP client using the configuration
+# set below.
+#
+web:
+  # Web client config of the resource manager's cluster
+  #
+  [ <web_client_config> ]
+
+# Any other configuration needed to reach the API server of the resource manager
+# can be configured in this section.
+#
+# Currently this section is used for both SLURM and Openstack resource managers
+# to configure API versions.
+#
+# For example, for SLURM, if your API endpoints are of the form `/slurm/v0.0.40/diag`,
+# the version is `v0.0.40`.
+# Docs: https://slurm.schedmd.com/rest_api.html
+# SLURM's REST API version can be set as `slurm: v0.0.40`
+#
+# In the case of Openstack, we need to fetch from different sources like identity and
+# compute, and they use different API versioning. They can be configured using
+# this section as well.
+#
+# Example:
+#
+# slurm: v0.0.40 # SLURM
+# identity: v3 # Openstack
+# compute: v2.1 # Openstack
+#
+extra_config:
+  [ <string>: <string> ... ]
+```
+
+## `<updater_config>`
+
+An `updater_config` allows configuring the updaters of CEEMS API server.
+
+```yaml
+# Identifier of the updater. Must be unique for each updater
+#
+# This identifier should be used in the `updaters` section inside each
+# `clusters` config to update the compute units of that resource manager with a
+# given updater.
+#
+id: <string>
+
+# Updater kind. Currently only `tsdb` is supported.
+#
+updater: <string>
+
+# Web Config of the updater.
+#
+web:
+  # Web client config of the updater instance
+  #
+  [ <web_client_config> ]
+
+# Any other configuration needed for the updater instance can be configured
+# in this section.
+# Currently this section is used for the `tsdb` updater to configure the queries that
+# will be used to aggregate the compute unit metrics.
+#
+extra_config:
+  # Query batch size when making TSDB queries.
+  # CEEMS makes queries in batches in order to avoid OOM errors on the TSDB.
+  # This parameter can be used to configure the number of compute units queried
+  # in a single query.
+  #
+  # Set this value based on the `--query.max-samples` parameter set on the TSDB and
+  # the scrape interval. For instance, at a given time, if there are 80k compute units
+  # running, the TSDB is scraping every 5 sec and CEEMS is updating every
+  # 60 min, a given metric for those 80k compute units will have
+  # 80,000 * (60 * 60) / 5 = 57,600,000 samples in the query. The default value
+  # used by Prometheus for `--query.max-samples` is 50,000,000, which is less than
+  # what we got in the calculation in the example. Thus, we need to make multiple
+  # queries by batching the compute units. In the current example, using a batch
+  # size of 40k should work; however, we recommend using much smaller batch sizes
+  # to protect the TSDB from overconsuming memory.
+  #
+  # Default value is 1000 and it should work in most of the cases.
+  #
+  [ query_batch_size: <int> | default: 1000 ]
+
+  # Compute units that have a total lifetime less than this value will be deleted from
+  # the TSDB to reduce the number of labels and cardinality.
+  #
+  # Default value `0s` means no compute units will be purged.
+  #
+  # Units Supported: y, w, d, h, m, s, ms.
+  #
+  [ cutoff_duration: <duration> | default: 0s ]
+
+  # List of labels to delete from the TSDB. These labels should be valid matchers for the TSDB.
+  # More information on the delete API of Prometheus: https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series
+  #
+  # The TSDB must be started with the --web.enable-admin-api flag for this to work.
+  #
+  labels_to_drop:
+    [ - <string> ... ]
+
+  # Define queries that are used to estimate aggregate metrics of each compute unit.
+  # These queries will be passed to golang's text/template package to build them.
+  # Available template variables:
+  # - UUIDs -> UUIDs string delimited by "|", eg, 123|345|567
+  # - ScrapeInterval -> Scrape interval of TSDB in time.Duration format, eg 15s, 1m
+  # - ScrapeIntervalMilli -> Scrape interval of TSDB in milliseconds, eg 15000, 60000
+  # - EvaluationInterval -> Evaluation interval of TSDB in time.Duration format, eg 15s, 1m
+  # - EvaluationIntervalMilli -> Evaluation interval of TSDB in milliseconds, eg 15000, 60000
+  # - RateInterval -> Rate interval in time.Duration format. It is estimated based on the scrape interval as 4*scrape_interval
+  # - Range -> Duration of the interval where the aggregation is being made, in time.Duration format
+  #
+  queries:
+    [ <queries_config> ]
+```
+
+### `<queries_config>`
+
+A `queries_config` allows configuring the PromQL queries for the TSDB updater of CEEMS API server.
+
+```yaml
+#
+# It is possible to define multiple "sub-metrics" for each parent metric.
+# For instance, for the case of `total_cpu_energy_usage_kwh`, if we wish to store
+# energy usage from different sources like RAPL and IPMI, we can do so using the following
+# config:
+#
+# total_cpu_energy_usage_kwh:
+#   rapl_total:
+#   ipmi_total:
+#
+# With the above configuration, the response from the API server will contain
+# energy usage from both RAPL and IPMI using the same keys as we used in the
+# sub query. For instance, an example response can be:
+#
+# `{"total_cpu_energy_usage_kwh": {"rapl_total": 100, "ipmi_total": 120}}`
+#
+# This approach lets the operators define the metrics freely according to
+# their deployments. This will also allow fetching metrics from third party
+# DBs outside of CEEMS components without hassle.
+#
+# The placeholder queries shown below should work out-of-the-box with the CEEMS
+# exporter and operators are free to deploy more exporters of their own and use
+# the metrics from them to estimate aggregated metrics of each compute unit.
+#
+# Average CPU utilisation
+#
+# Example of valid query:
+#
+# global:
+#   avg_over_time(
+#     avg by (uuid) (
+#       (
+#         rate(ceems_compute_unit_cpu_user_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+#         +
+#         rate(ceems_compute_unit_cpu_system_seconds_total{uuid=~"{{.UUIDs}}"}[{{.RateInterval}}])
+#       )
+#       /
+#       ceems_compute_unit_cpus{uuid=~"{{.UUIDs}}"}
+#     )[{{.Range}}:]
+#   ) * 100
+avg_cpu_usage:
+  [ <string>: <string> ... ]
+
+
+# Average CPU memory utilisation
+#
+# Example of valid query:
+#
+# global:
+#   avg_over_time(
+#     avg by (uuid) (
+#       ceems_compute_unit_memory_used_bytes{uuid=~"{{.UUIDs}}"}
+#       /
+#       ceems_compute_unit_memory_total_bytes{uuid=~"{{.UUIDs}}"}
+#     )[{{.Range}}:]
+#   ) * 100
+avg_cpu_mem_usage:
+  [ <string>: <string> ... ]
+
+
+# Total CPU energy usage in kWh
+#
+# Example of valid query:
+#
+# total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+total_cpu_energy_usage_kwh:
+  [ <string>: <string> ... ]
+
+
+# Total CPU emissions in gms
+#
+# Example of valid query:
+#
+# rte_total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       label_replace(
+#         unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9,
+#         "common_label",
+#         "mock",
+#         "hostname",
+#         "(.*)"
+#       )
+#       * on (common_label) group_left ()
+#       label_replace(
+#         ceems_emissions_gCo2_kWh{provider="rte",country_code="FR"},
+#         "common_label",
+#         "mock",
+#         "hostname",
+#         "(.*)"
+#       )
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+# emaps_total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       label_replace(
+#         unit:ceems_compute_unit_cpu_energy_usage:sum{uuid=~"{{.UUIDs}}"} * {{.ScrapeIntervalMilli}} / 3.6e9,
+#         "common_label",
+#         "mock",
+#         "hostname",
+#         "(.*)"
+#       )
+#       * on (common_label) group_left ()
+#       label_replace(
+#         ceems_emissions_gCo2_kWh{provider="emaps",country_code="FR"},
+#         "common_label",
+#         "mock",
+#         "hostname",
+#         "(.*)"
+#       )
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+total_cpu_emissions_gms:
+  [ <string>: <string> ... ]
+
+
+# Average GPU utilization
+#
+# Example of valid query:
+#
+# global:
+#   avg_over_time(
+#     avg by (uuid) (
+#       DCGM_FI_DEV_GPU_UTIL
+#       * on (gpuuuid) group_right ()
+#       ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+avg_gpu_usage:
+  [ <string>: <string> ... ]
+
+
+# Average GPU memory utilization
+#
+# Example of valid query:
+#
+# global:
+#   avg_over_time(
+#     avg by (uuid) (
+#       DCGM_FI_DEV_MEM_COPY_UTIL
+#       * on (gpuuuid) group_right ()
+#       ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+avg_gpu_mem_usage:
+  [ <string>: <string> ... ]
+
+
+# Total GPU energy usage in kWh
+#
+# Example of valid query:
+#
+# total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e9
+#       * on (gpuuuid) group_right()
+#       ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"}
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+total_gpu_energy_usage_kwh:
+  [ <string>: <string> ... ]
+
+
+# Total GPU emissions in gms
+#
+# Example of valid query:
+#
+# rte_total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       label_replace(
+#         instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09
+#         * on (gpuuuid) group_right ()
+#         ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"},
+#         "common_label",
+#         "mock",
+#         "instance",
+#         "(.*)"
+#       )
+#       * on (common_label) group_left ()
+#       label_replace(
+#         ceems_emissions_gCo2_kWh{provider="rte",country_code="FR"},
+#         "common_label",
+#         "mock",
+#         "instance",
+#         "(.*)"
+#       )
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+# emaps_total:
+#   sum_over_time(
+#     sum by (uuid) (
+#       label_replace(
+#         instance:DCGM_FI_DEV_POWER_USAGE:pue_avg * {{.ScrapeIntervalMilli}} / 3.6e+09
+#         * on (gpuuuid) group_right ()
+#         ceems_compute_unit_gpu_index_flag{uuid=~"{{.UUIDs}}"},
+#         "common_label",
+#         "mock",
+#         "instance",
+#         "(.*)"
+#       )
+#       * on (common_label) group_left ()
+#       label_replace(
+#         ceems_emissions_gCo2_kWh{provider="emaps",country_code="FR"},
+#         "common_label",
+#         "mock",
+#         "instance",
+#         "(.*)"
+#       )
+#     )[{{.Range}}:{{.ScrapeInterval}}]
+#   )
+total_gpu_emissions_gms:
+  [ <string>: <string> ... ]
+
+
+# Total IO write in GB stats
+#
+# Currently the CEEMS exporter does not scrape this metric. Operators can configure
+# this metric from third party exporters, if and when available.
+#
+total_io_write_stats:
+  [ <string>: <string> ... ]
+
+# Total IO read in GB stats
+#
+# Currently the CEEMS exporter does not scrape this metric. Operators can configure
+# this metric from third party exporters, if and when available.
+#
+total_io_read_stats:
+  [ <string>: <string> ... ]
+
+# Total ingress traffic stats
+#
+# Currently the CEEMS exporter does not scrape this metric. Operators can configure
+# this metric from third party exporters, if and when available.
+#
+total_ingress_stats:
+  [ <string>: <string> ... ]
+
+# Total outgress traffic stats
+#
+# Currently the CEEMS exporter does not scrape this metric. Operators can configure
+# this metric from third party exporters, if and when available.
+#
+total_outgress_stats:
+  [ <string>: <string> ... ]
+```
+
+## `<ceems_lb>`
+
+The following shows the reference for CEEMS load balancer config. A valid sample
+configuration file can be found in the
+[repo](https://github.com/mahendrapaipuri/ceems/blob/main/build/config/ceems_lb/ceems_lb.yml).
+
+```yaml
+# Configuration file to configure CEEMS Load Balancer
+#
+# This config file has the following sections:
+# - `ceems_lb`: Core configuration of CEEMS LB
+# - `ceems_api_server`: Client configuration of CEEMS API server
+# - `clusters`: This is an optional config which can be used to validate backend IDs
+#
+---
+ceems_lb:
+  # Load balancing strategy. Three possibilities:
+  #
+  # - round-robin
+  # - least-connection
+  # - resource-based
+  #
+  # Round robin and least connection are classic strategies.
+  # Resource based works based on the query range in the TSDB query. The
+  # query will be proxied to the backend that covers the query_range.
+  #
+  [ strategy: <string> | default = round-robin ]
+
+  # List of backends for each cluster
+  #
+  backends:
+    [ - <backend_config> ]
+
+
+# CEEMS API server config.
+# This config is essential to enable access control on the TSDB. By excluding
+# this config, no access control is imposed on the TSDB and a basic load balancing
+# based on the chosen strategy will be made.
+#
+# Essentially, basic access control is implemented by checking the ownership of the
+# queried unit. Users that belong to the same project can query the units belonging
+# to that project.
+#
+# For example, if there is a unit U that belongs to User A and
+# Project P, any user that belongs to the same project P can query the metrics of unit U
+# but not users from other projects.
+#
+ceems_api_server:
+  # The DB contains the information of user and project units and the LB will verify
+  # if the user/project is the owner of the uuid under request to decide whether to
+  # proxy the request to the backend or not.
+  #
+  # To identify the current user, the X-Grafana-User header, which Grafana
+  # is capable of sending to the datasource, will be used. Grafana essentially adds this header
+  # on the backend server and hence it is not possible for the users to spoof this
+  # header from the browser.
+  # In order to enable this feature, it is essential to set `send_user_header = true`
+  # in the Grafana config file.
+  #
+  # If both CEEMS API and CEEMS LB are running on the same host, it is preferable to
+  # use the DB directly using `data.path`, as a DB query is way faster than an API request.
+  # If both apps are deployed on the same host, ensure that the user running `ceems_lb`
+  # has permissions to open the CEEMS API data files.
+  #
+  data:
+    [ <data_config> ]
+
+  # In the case where CEEMS API and CEEMS LB are deployed on different hosts, we can
+  # still perform access control using the CEEMS API server by making an API request to
+  # check the ownership of the queried unit. This method should only be preferred when
+  # the DB cannot be accessed directly, as an API request has higher latency than querying
+  # the DB directly.
+  #
+  # If both `data.path` and `web.url` are provided, the DB will be preferred as it has lower
+  # latencies.
+  #
+  web:
+    [ <web_client_config> ]
+```
+
+### `<backend_config>`
+
+A `backend_config` allows configuring backend TSDB servers for the load balancer.
+
+```yaml
+# Identifier of the cluster
+#
+# This ID must match the ones defined in the `clusters` config. The CEEMS API server
+# will tag each compute unit from that cluster with this ID and, when verifying
+# compute unit ownership, CEEMS LB will use the ID to query for the compute
+# units of that cluster.
+#
+# This identifier needs to be in the path parameter for requests to CEEMS LB
+# to target the correct cluster. For instance, suppose there are two different clusters,
+# say `cluster-0` and `cluster-1`, that have different TSDBs configured. Using CEEMS
+# LB we can load balance the traffic for these two clusters using a single CEEMS LB
+# deployment. However, we need to tell CEEMS LB which cluster to target for the
+# incoming traffic. This is done via a path parameter.
+#
+# If CEEMS LB is running at http://localhost:9030, then `cluster-0` is reachable at
+# `http://localhost:9030/cluster-0` and `cluster-1` at `http://localhost:9030/cluster-1`.
+# Internally, CEEMS LB will strip the first part of the URL path, use it to identify the
+# cluster and proxy the rest of the URL path to the underlying TSDB backend.
+# Thus, all the requests to `http://localhost:9030/cluster-0` will be load
+# balanced across the TSDB backends of `cluster-0`.
+#
+id: <string>
+
+# List of TSDBs for this cluster. Load balancing between these TSDBs will be
+# made based on the strategy chosen.
+#
+# TLS is not supported for backends. CEEMS LB supports TLS and TLS terminates
+# at the LB and requests are proxied to backends on HTTP.
+#
+# LB and backend servers are meant to be in the same DMZ so that we do not need
+# to encrypt communications. Backends however support basic auth and it can
+# be configured in the URL with the usual syntax.
+#
+# An example of configuring the basic auth username and password with a backend:
+# - http://alice:password@localhost:9090
+#
+tsdb_urls:
+  [ - <host> ]
+```
+
 ## `<web_client_config>`
 
 A `web_client_config` allows configuring HTTP clients.
diff --git a/website/docs/configuration/grafana.md b/website/docs/configuration/grafana.md
new file mode 100644
index 00000000..96294143
--- /dev/null
+++ b/website/docs/configuration/grafana.md
@@ -0,0 +1,115 @@
+---
+sidebar_position: 8
+---
+
+# Grafana
+
+When using CEEMS LB to provide access control and load balancing for
+TSDB servers, the Prometheus datasource in Grafana must be configured
+slightly differently than a regular native Prometheus server. As
+discussed in [CEEMS LB Configuration](./ceems-lb.md#matching-backendsid-with-clustersid),
+a path parameter corresponding to the cluster must be appended to the CEEMS LB server URL.
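+
+In other words, the datasource URL for each cluster takes the form
+`http://<ceems-lb-host>:<ceems-lb-port>/<cluster-id>` (the host and port here
+are placeholders for your own deployment, not defaults).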
+
+For instance, if the CEEMS API server and CEEMS LB have the following configuration:
+
+```yaml
+ceems_api_server:
+  data:
+    path: /var/lib/ceems
+    update_interval: 15m
+
+  admin:
+    users:
+      - adm1
+
+  web:
+    url: http://localhost:9020
+    requests_limit: 30
+
+clusters:
+  - id: slurm-0
+    manager: slurm
+    updaters:
+      - tsdb-0
+    cli:
+
+
+  - id: os-0
+    manager: openstack
+    updaters:
+      - tsdb-1
+    web:
+
+
+updaters:
+  - id: tsdb-0
+    updater: tsdb
+    web:
+      url: http://tsdb-0
+    extra_config:
+
+
+  - id: tsdb-1
+    updater: tsdb
+    web:
+      url: http://tsdb-1
+    extra_config:
+
+
+ceems_lb:
+  strategy: resource-based
+  backends:
+    - id: slurm-0
+      tsdb_urls:
+        - http://tsdb-0
+        - http://tsdb-0-replica
+
+    - id: os-0
+      tsdb_urls:
+        - http://tsdb-1
+        - http://tsdb-1-replica
+```
+
+it is clear that there are two different clusters, `slurm-0` and `os-0`,
+and each cluster has its own TSDB server, `tsdb-0` and `tsdb-1`, respectively.
+In Grafana, a Prometheus datasource for each cluster must be configured to
+present the metrics of each cluster separately. Thus, the following
+provisioning config can be used to configure the datasources of each cluster:
+
+```yaml
+datasources:
+  - name: SLURM-TSDB
+    type: prometheus
+    access: proxy
+    # Notice the path parameter `slurm-0` at the end.
+    # IT IS IMPORTANT TO HAVE IT
+    url: http://ceems-lb:9030/slurm-0
+    basicAuth: true
+    basicAuthUser:
+    secureJsonData:
+      basicAuthPassword:
+
+  - name: OS-TSDB
+    type: prometheus
+    access: proxy
+    # Notice the path parameter `os-0` at the end.
+    # IT IS IMPORTANT TO HAVE IT
+    url: http://ceems-lb:9030/os-0
+    basicAuth: true
+    basicAuthUser:
+    secureJsonData:
+      basicAuthPassword:
+```
+
+Internally, CEEMS LB will strip the path parameter and forward the request
+to the correct backend group based on the provided path parameter. This ensures
+that we can use a single instance of CEEMS LB to load balance across multiple
+clusters.
+
+:::important[IMPORTANT]
+
+Even if there is only one cluster and one TSDB instance for that cluster, we need
+to configure the datasource in Grafana as explained above if we wish to use
+CEEMS LB. This is the only way for CEEMS LB to know which cluster to target.
+
+:::
diff --git a/website/docs/configuration/prometheus.md b/website/docs/configuration/prometheus.md
new file mode 100644
index 00000000..7db1ee75
--- /dev/null
+++ b/website/docs/configuration/prometheus.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 7
+---
+
+# Prometheus
+
+In order to use the dashboards provided in the repository, a minor
+[`metric_relabel_configs`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs)
+must be provided for all the target groups that have NVIDIA GPUs where
+`dcgm-exporter` is exporting the metrics of the GPUs to Prometheus.
+
+The following shows an example scrape config where the target nodes
+contain NVIDIA GPUs:
+
+```yaml
+scrape_configs:
+  - job_name: "gpu-node-group"
+    metric_relabel_configs:
+      - source_labels: [UUID]
+        regex: (.*)
+        target_label: gpuuuid
+        replacement: $1
+        action: replace
+      - regex: UUID
+        action: labeldrop
+      - regex: modelName
+        action: labeldrop
+    static_configs:
+      - targets: ["gpu-0:9400", "gpu-1:9400", ...]
+```
+
+The `metric_relabel_configs` replaces the label `UUID`, which is
+the UUID of the GPU, with `gpuuuid`, which is compatible with the CEEMS
+exporter. Moreover, the config also drops the now-unused `UUID` and `modelName`
+labels to reduce storage and cardinality.
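+
+Once relabelled, the DCGM metrics can be joined with the CEEMS exporter's
+`ceems_compute_unit_gpu_index_flag` metric on the shared `gpuuuid` label. As a
+quick sanity check, a query along these lines (a sketch that assumes the default
+`dcgm-exporter` metric names) should return GPU utilisation per compute unit:
+
+```promql
+# Attribute GPU utilisation to compute unit UUIDs via the gpuuuid label
+avg by (uuid) (
+  DCGM_FI_DEV_GPU_UTIL
+  * on (gpuuuid) group_right ()
+  ceems_compute_unit_gpu_index_flag
+)
+```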
diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts
index b258e6b4..c8da2d3b 100644
--- a/website/docusaurus.config.ts
+++ b/website/docusaurus.config.ts
@@ -10,7 +10,7 @@ const projectName = "ceems";
 const config: Config = {
   title: "Compute Energy & Emissions Monitoring Stack (CEEMS)",
   tagline:
-    "Monitor the energy consumption and equivalent emissions of your workloads in realtime",
+    "Monitor the energy consumption and carbon footprint of your workloads in realtime",
   favicon: "img/favicon.ico",
 
   // Set the production url of your site here
diff --git a/website/src/components/HomepageFeatures/index.tsx b/website/src/components/HomepageFeatures/index.tsx
index 51bd93f5..25353f8f 100644
--- a/website/src/components/HomepageFeatures/index.tsx
+++ b/website/src/components/HomepageFeatures/index.tsx
@@ -20,6 +20,17 @@ const FeatureList: FeatureItem[] = [
     ),
   },
+  {
+    title: "Uses eBPF for perf metrics",
+    Svg: require("@site/static/img/ebpf.svg").default,
+    description: (
+      <>
+        Besides energy and carbon footprint, CEEMS monitors and reports
+        performance, IO and network metrics for user workloads using
+        eBPF.
+      </>
+    ),
+  },
   {
     title: "ML/AI workloads",
     Svg: require("@site/static/img/ml_ai.svg").default,
     description: (
@@ -45,7 +56,7 @@ const FeatureList: FeatureItem[] = [
 
 function Feature({title, Svg, description}: FeatureItem) {
   return (
-    <div className={clsx("col col--4")}>
+    <div className={clsx("col col--3")}>
diff --git a/website/static/img/ebpf.svg b/website/static/img/ebpf.svg
new file mode 100644
index 00000000..8cf716f8
--- /dev/null
+++ b/website/static/img/ebpf.svg
@@ -0,0 +1,67 @@
+<!-- eBPF feature icon: 67 lines of SVG markup (vector image data) -->