Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SMP] Establish experiment naming prefix convention for Quality Gates #30273

Merged
merged 1 commit into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
auth_token_file_path: /tmp/agent-auth-token

# Both intake URLs point at loopback addresses.
# NOTE(review): ports 9091 and 9092 match the two HTTP blackhole listeners in
# the quality_gate_idle lading.yaml; presumably these must stay in sync.
dd_url: http://127.0.0.1:9091
process_config.process_dd_url: http://localhost:9092

# Disable cloud detection. This stops the Agent from poking around the
# execution environment & network. This is particularly important if the target
# has network access.
cloud_provider_metadata: []

# Turn Agent telemetry on; '*' is quoted because a bare * would be read as a
# YAML alias. NOTE(review): presumably '*' selects every check - confirm.
telemetry.enabled: true
telemetry.checks: '*'
39 changes: 39 additions & 0 deletions test/regression/cases/quality_gate_idle/experiment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Agent 'out of the box' idle experiment. Represents an agent install with the
# default configuration and no active workload.

optimization_goal: memory
erratic: false

# The process under test and the environment it is started with.
target:
  name: datadog-agent
  command: /bin/entrypoint.sh

  environment:
    # Quoted so the leading zeros survive YAML integer resolution.
    DD_API_KEY: "00000001"
    DD_HOSTNAME: smp-regression

  profiling_environment:
    DD_INTERNAL_PROFILING_BLOCK_PROFILE_RATE: 10000
    DD_INTERNAL_PROFILING_CPU_DURATION: 1m
    DD_INTERNAL_PROFILING_DELTA_PROFILES: true
    DD_INTERNAL_PROFILING_ENABLED: true
    DD_INTERNAL_PROFILING_ENABLE_GOROUTINE_STACKTRACES: true
    DD_INTERNAL_PROFILING_MUTEX_PROFILE_FRACTION: 10
    DD_INTERNAL_PROFILING_PERIOD: 1m
    DD_INTERNAL_PROFILING_UNIX_SOCKET: /var/run/datadog/apm.socket
    DD_PROFILING_EXECUTION_TRACE_ENABLED: true
    DD_PROFILING_EXECUTION_TRACE_PERIOD: 1m
    DD_PROFILING_WAIT_PROFILE: true

    # The tag value is this experiment's name.
    DD_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_idle

# Bounds checks evaluated for this experiment.
checks:
  - name: memory_usage
    description: "Memory usage quality gate. This puts a bound on the total agent memory usage."
    bounds:
      series: total_rss_bytes
      upper_bound: "430.0 MiB"

# The {{ ... }} placeholders in the link are left for the consumer to fill in;
# the value is quoted so YAML does not read them as flow mappings.
report_links:
  - text: "bounds checks dashboard"
    link: "https://app.datadoghq.com/dashboard/vz3-jd5-bdi?fromUser=true&refresh_mode=paused&tpl_var_experiment%5B0%5D={{ experiment }}&tpl_var_job_id%5B0%5D={{ job_id }}&tpl_var_run-id%5B0%5D={{ job_id }}&view=spans&from_ts={{ start_time_ms }}&to_ts={{ end_time_ms }}&live=false"
176 changes: 176 additions & 0 deletions test/regression/cases/quality_gate_idle/lading/lading.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# No load generators are configured for this case.
generator: []

# HTTP blackhole listeners on two loopback ports.
blackhole:
  - http:
      binding_addr: "127.0.0.1:9091"
  - http:
      binding_addr: "127.0.0.1:9092"

# Telemetry endpoints scraped from the target; each source is tagged with the
# sub-agent it belongs to.
target_metrics:
  - prometheus:  # core agent telemetry
      uri: "http://127.0.0.1:5000/telemetry"
      tags:
        sub_agent: "core"
  - prometheus:  # process agent telemetry
      uri: "http://127.0.0.1:6062/telemetry"
      tags:
        sub_agent: "process"
  - expvar:  # trace agent telemetry
      uri: "http://127.0.0.1:5012/debug/vars"
      vars:
        - "/Event"
        - "/ServiceCheck"
        - "/check_run_v1"
        - "/cmdline"
        - "/compressor/BytesIn"
        - "/compressor/BytesOut"
        - "/compressor/TotalCompressCycles"
        - "/compressor/TotalPayloads"
        - "/connections"
        - "/container"
        - "/events_v2"
        - "/forwarder/APIKeyFailure"
        - "/forwarder/APIKeyStatus"
        - "/forwarder/FileStorage/CurrentSizeInBytes"
        - "/forwarder/FileStorage/DeserializeCount"
        - "/forwarder/FileStorage/DeserializeErrorsCount"
        - "/forwarder/FileStorage/DeserializeTransactionsCount"
        - "/forwarder/FileStorage/FileSize"
        - "/forwarder/FileStorage/FilesCount"
        - "/forwarder/FileStorage/FilesRemovedCount"
        - "/forwarder/FileStorage/PointsDroppedCount"
        - "/forwarder/FileStorage/SerializeCount"
        - "/forwarder/FileStorage/StartupReloadedRetryFilesCount"
        - "/forwarder/RemovalPolicy/FilesFromUnknownDomainCount"
        - "/forwarder/RemovalPolicy/NewRemovalPolicyCount"
        - "/forwarder/RemovalPolicy/OutdatedFilesCount"
        - "/forwarder/RemovalPolicy/RegisteredDomainCount"
        - "/forwarder/TransactionContainer/CurrentMemSizeInBytes"
        - "/forwarder/TransactionContainer/ErrorsCount"
        - "/forwarder/TransactionContainer/PointsDroppedCount"
        - "/forwarder/TransactionContainer/TransactionsCount"
        - "/forwarder/TransactionContainer/TransactionsDroppedCount"
        - "/forwarder/Transactions/Cluster"
        - "/forwarder/Transactions/ClusterRole"
        - "/forwarder/Transactions/ClusterRoleBinding"
        - "/forwarder/Transactions/ConnectionEvents/ConnectSuccess"
        - "/forwarder/Transactions/ConnectionEvents/DNSSuccess"
        - "/forwarder/Transactions/CronJob"
        - "/forwarder/Transactions/CustomResource"
        - "/forwarder/Transactions/CustomResourceDefinition"
        - "/forwarder/Transactions/DaemonSet"
        - "/forwarder/Transactions/Deployment"
        - "/forwarder/Transactions/Dropped"
        - "/forwarder/Transactions/DroppedByEndpoint"
        - "/forwarder/Transactions/ECSTask"
        - "/forwarder/Transactions/Errors"
        - "/forwarder/Transactions/ErrorsByType/ConnectionErrors"
        - "/forwarder/Transactions/ErrorsByType/DNSErrors"
        - "/forwarder/Transactions/ErrorsByType/SentRequestErrors"
        - "/forwarder/Transactions/ErrorsByType/TLSErrors"
        - "/forwarder/Transactions/ErrorsByType/WroteRequestErrors"
        - "/forwarder/Transactions/HTTPErrors"
        - "/forwarder/Transactions/HTTPErrorsByCode"
        - "/forwarder/Transactions/HighPriorityQueueFull"
        - "/forwarder/Transactions/HorizontalPodAutoscaler"
        - "/forwarder/Transactions/Ingress"
        - "/forwarder/Transactions/InputBytesByEndpoint"
        - "/forwarder/Transactions/InputCountByEndpoint"
        - "/forwarder/Transactions/Job"
        - "/forwarder/Transactions/LimitRange"
        - "/forwarder/Transactions/Namespace"
        - "/forwarder/Transactions/NetworkPolicy"
        - "/forwarder/Transactions/Node"
        - "/forwarder/Transactions/OrchestratorManifest"
        - "/forwarder/Transactions/PersistentVolume"
        - "/forwarder/Transactions/PersistentVolumeClaim"
        - "/forwarder/Transactions/Pod"
        - "/forwarder/Transactions/ReplicaSet"
        - "/forwarder/Transactions/Requeued"
        - "/forwarder/Transactions/RequeuedByEndpoint"
        - "/forwarder/Transactions/Retried"
        - "/forwarder/Transactions/RetriedByEndpoint"
        - "/forwarder/Transactions/RetryQueueSize"
        - "/forwarder/Transactions/Role"
        - "/forwarder/Transactions/RoleBinding"
        - "/forwarder/Transactions/Service"
        - "/forwarder/Transactions/ServiceAccount"
        - "/forwarder/Transactions/StatefulSet"
        - "/forwarder/Transactions/StorageClass"
        - "/forwarder/Transactions/Success"
        - "/forwarder/Transactions/SuccessByEndpoint/check_run_v1"
        - "/forwarder/Transactions/SuccessByEndpoint/connections"
        - "/forwarder/Transactions/SuccessByEndpoint/container"
        - "/forwarder/Transactions/SuccessByEndpoint/events_v2"
        - "/forwarder/Transactions/SuccessByEndpoint/host_metadata_v2"
        - "/forwarder/Transactions/SuccessByEndpoint/intake"
        - "/forwarder/Transactions/SuccessByEndpoint/orchestrator"
        - "/forwarder/Transactions/SuccessByEndpoint/process"
        - "/forwarder/Transactions/SuccessByEndpoint/rtcontainer"
        - "/forwarder/Transactions/SuccessByEndpoint/rtprocess"
        - "/forwarder/Transactions/SuccessByEndpoint/series_v1"
        - "/forwarder/Transactions/SuccessByEndpoint/series_v2"
        - "/forwarder/Transactions/SuccessByEndpoint/services_checks_v2"
        - "/forwarder/Transactions/SuccessByEndpoint/sketches_v1"
        - "/forwarder/Transactions/SuccessByEndpoint/sketches_v2"
        - "/forwarder/Transactions/SuccessByEndpoint/validate_v1"
        - "/forwarder/Transactions/SuccessBytesByEndpoint"
        - "/forwarder/Transactions/VerticalPodAutoscaler"
        - "/host_metadata_v2"
        - "/hostname/errors"
        - "/hostname/provider"
        - "/intake"
        - "/jsonstream/CompressorLocks"
        - "/jsonstream/ItemDrops"
        - "/jsonstream/PayloadFulls"
        - "/jsonstream/TotalCalls"
        - "/jsonstream/TotalItems"
        - "/jsonstream/TotalLockTime"
        - "/jsonstream/TotalSerializationTime"
        - "/jsonstream/WriteItemErrors"
        - "/kubeletQueries"
        - "/orchestrator"
        - "/pid"
        - "/process"
        - "/rtcontainer"
        - "/rtprocess"
        - "/serializer/SendEventsErrItemTooBigs"
        - "/serializer/SendEventsErrItemTooBigsFallback"
        - "/series"
        - "/series_v1"
        - "/series_v2"
        - "/services_checks_v2"
        - "/sketch_series/ItemTooBig"
        - "/sketch_series/PayloadFull"
        - "/sketch_series/UnexpectedItemDrops"
        - "/sketches_v1"
        - "/sketches_v2"
        - "/splitter/NotTooBig"
        - "/splitter/PayloadDrops"
        - "/splitter/TooBig"
        - "/splitter/TotalLoops"
        - "/stats_writer/Bytes"
        - "/stats_writer/ClientPayloads"
        - "/stats_writer/Errors"
        - "/stats_writer/Payloads"
        - "/stats_writer/Retries"
        - "/stats_writer/Splits"
        - "/stats_writer/StatsBuckets"
        - "/stats_writer/StatsEntries"
        - "/trace_writer/Bytes"
        - "/trace_writer/BytesUncompressed"
        - "/trace_writer/Errors"
        - "/trace_writer/Events"
        - "/trace_writer/Payloads"
        - "/trace_writer/Retries"
        - "/trace_writer/SingleMaxSize"
        - "/trace_writer/Spans"
        - "/trace_writer/Traces"
        - "/uptime"
        - "/validate_v1"
        - "/version/Version"
        - "/version/GitCommit"
        - "/watchdog/CPU/UserAvg"
        - "/watchdog/Mem/Alloc"
      tags:
        sub_agent: "trace"
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
auth_token_file_path: /tmp/agent-auth-token
hostname: smp-regression

# Intake URL points at a loopback address.
dd_url: http://127.0.0.1:9092

confd_path: /etc/datadog-agent/conf.d

# Disable cloud detection. This stops the Agent from poking around the
# execution environment & network. This is particularly important if the target
# has network access.
cloud_provider_metadata: []

dogstatsd_socket: '/tmp/dsd.socket'

logs_enabled: true

apm_config:
  enabled: true

process_config:
  process_collection:
    enabled: true
  container_collection:
    enabled: true

network_path:
  connections_monitoring:
    enabled: true

runtime_security_config:
  ## Set to true to enable Threat Detection
  enabled: true

cluster_checks:
  enabled: true

otlp_config:
  metrics:
    enabled: true
  traces:
    enabled: true
  logs:
    enabled: true

system_probe_config:
  enabled: true

network_config:
  enabled: true

# Per Cloud Security Management setup documentation
# https://docs.datadoghq.com/security/cloud_security_management/setup/agent/linux/
remote_configuration:
  # SMP environment does not support remote config currently.
  enabled: false

compliance_config:
  ## Set to true to enable CIS benchmarks for Misconfigurations.
  enabled: true
  host_benchmarks:
    enabled: true

# Vulnerabilities are evaluated and scanned against your containers and hosts every hour.
sbom:
  enabled: true
  # Set to true to enable Container Vulnerability Management
  container_image:
    enabled: true
  # Set to true to enable Host Vulnerability Management
  host:
    enabled: true

container_image:
  enabled: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Per https://docs.datadoghq.com/security/cloud_security_management/setup/agent/linux/
runtime_security_config:
  ## @param enabled - boolean - optional - default: false
  ## Set to true to enable Threat Detection
  enabled: true

compliance_config:
  ## @param enabled - boolean - optional - default: false
  ## Set to true to enable CIS benchmarks for Misconfigurations.
  #
  enabled: true
  host_benchmarks:
    enabled: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Per https://docs.datadoghq.com/security/cloud_security_management/setup/agent/linux/

runtime_security_config:
  ## @param enabled - boolean - optional - default: false
  ## Set to true to enable Threat Detection
  enabled: true

# NOTE(review): the all-features Agent config in this change sets
# remote_configuration.enabled to false with the note that the SMP environment
# does not support remote config - confirm that enabling it here is intended.
remote_configuration:
  ## @param enabled - boolean - optional - default: false
  enabled: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Agent 'all features enabled' idle experiment. Represents an agent install with
# all sub-agents enabled in configuration and no active workload.
Comment on lines +1 to +2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about adding a blurb about quality gates and a link to our docs in these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I plan to add this once the corresponding Confluence docs exist.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👀 🙈


optimization_goal: memory
erratic: false

# The process under test and the environment it is started with.
target:
  name: datadog-agent
  command: /bin/entrypoint.sh

  environment:
    DD_TELEMETRY_ENABLED: true
    # Quoted so the leading zeros survive YAML integer resolution.
    DD_API_KEY: "00000001"
    DD_HOSTNAME: smp-regression
    DD_DD_URL: http://127.0.0.1:9092

  # NOTE(review): the system-probe variables below mix two prefixes,
  # DD_SYSTEM_PROBE_INTERNAL_PROFILING_* and
  # DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_* - confirm both forms are
  # recognised by the Agent.
  profiling_environment:
    # internal profiling
    DD_INTERNAL_PROFILING_ENABLED: true
    DD_SYSTEM_PROBE_INTERNAL_PROFILING_ENABLED: true
    # run all the time
    DD_SYSTEM_PROBE_INTERNAL_PROFILING_PERIOD: 1m
    DD_INTERNAL_PROFILING_PERIOD: 1m
    DD_SYSTEM_PROBE_INTERNAL_PROFILING_CPU_DURATION: 1m
    DD_INTERNAL_PROFILING_CPU_DURATION: 1m
    # destination
    DD_INTERNAL_PROFILING_UNIX_SOCKET: /var/run/datadog/apm.socket
    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_UNIX_SOCKET: /var/run/datadog/apm.socket
    # tags
    DD_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_idle_all_features
    DD_SYSTEM_PROBE_CONFIG_INTERNAL_PROFILING_EXTRA_TAGS: experiment:quality_gate_idle_all_features

    DD_INTERNAL_PROFILING_BLOCK_PROFILE_RATE: 10000
    DD_INTERNAL_PROFILING_DELTA_PROFILES: true
    DD_INTERNAL_PROFILING_ENABLE_GOROUTINE_STACKTRACES: true
    DD_INTERNAL_PROFILING_MUTEX_PROFILE_FRACTION: 10

    # ddprof options
    DD_PROFILING_EXECUTION_TRACE_ENABLED: true
    DD_PROFILING_EXECUTION_TRACE_PERIOD: 1m
    DD_PROFILING_WAIT_PROFILE: true

# Bounds checks evaluated for this experiment.
checks:
  - name: memory_usage
    description: "Memory usage quality gate. This puts a bound on the total agent memory usage."
    bounds:
      series: total_rss_bytes
      upper_bound: "785.0 MiB"

# The {{ ... }} placeholders in the link are left for the consumer to fill in;
# the value is quoted so YAML does not read them as flow mappings.
report_links:
  - text: "bounds checks dashboard"
    link: "https://app.datadoghq.com/dashboard/vz3-jd5-bdi?fromUser=true&refresh_mode=paused&tpl_var_experiment%5B0%5D={{ experiment }}&tpl_var_job_id%5B0%5D={{ job_id }}&tpl_var_run-id%5B0%5D={{ job_id }}&view=spans&from_ts={{ start_time_ms }}&to_ts={{ end_time_ms }}&live=false"
Loading
Loading