DataDog · Guillaume-Barrier · Oct 11, 2024 · Oct 11, 2024 · Oct 14, 2024 · Oct 14, 2024
@@ -1091,6 +1091,23 @@ func TestLoadEnv(t *testing.T) {
 		assert.Equal(t, 12.3, cfg.OTLPReceiver.ProbabilisticSampling)
 	})
 
+	env = "DD_APM_ERROR_TRACKING_STANDALONE_ENABLED"
+	t.Run(env, func(t *testing.T) {
+		t.Setenv(env, "true")
+
+		config := fxutil.Test[Component](t, fx.Options(
+			corecomp.MockModule(),
+			fx.Replace(corecomp.MockParams{
+				Params: corecomp.Params{ConfFilePath: "./testdata/undocumented.yaml"},
+			}),
+			MockModule(),
+		))
+		cfg := config.Object()
+
+		assert.NotNil(t, cfg)
+		assert.Equal(t, true, cfg.ErrorTrackingStandalone)
+	})
+
 	for _, envKey := range []string{
 		"DD_IGNORE_RESOURCE", // deprecated
 		"DD_APM_IGNORE_RESOURCES",

@@ -274,6 +274,10 @@ func applyDatadogConfig(c *config.AgentConfig, core corecompcfg.Component) error
 		c.ProbabilisticSamplerHashSeed = uint32(core.GetInt("apm_config.probabilistic_sampler.hash_seed"))
 	}
 
+	if core.IsSet("apm_config.error_tracking_standalone.enabled") {
+		c.ErrorTrackingStandalone = core.GetBool("apm_config.error_tracking_standalone.enabled")
+	}
+
 	if core.IsSet("apm_config.max_remote_traces_per_second") {
 		c.MaxRemoteTPS = core.GetFloat64("apm_config.max_remote_traces_per_second")
 	}

@@ -1452,6 +1452,14 @@ api_key:
   ##            collectors using the probabilistic sampler to ensure consistent sampling.
   #  hash_seed: 0
 
+  ## @param error_tracking_standalone - object - optional
+  ## Enables Error Tracking Standalone
+  ##
+  #error_tracking_standalone:
+  ## @env DD_APM_ERROR_TRACKING_STANDALONE_ENABLED - boolean - optional - default: false
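+  ## Enables or disables Error Tracking Standalone. When enabled, the trace agent runs only its
+  ## error sampler (all other samplers are bypassed) and drops trace chunks that contain no errors.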
+  # enabled: false
+
 
   {{- if .InternalProfiling -}}
   ## @param profiling - custom object - optional

@@ -112,6 +112,7 @@ func setupAPM(config pkgconfigmodel.Setup) {
 	config.BindEnv("apm_config.probabilistic_sampler.enabled", "DD_APM_PROBABILISTIC_SAMPLER_ENABLED")
 	config.BindEnv("apm_config.probabilistic_sampler.sampling_percentage", "DD_APM_PROBABILISTIC_SAMPLER_SAMPLING_PERCENTAGE")
 	config.BindEnv("apm_config.probabilistic_sampler.hash_seed", "DD_APM_PROBABILISTIC_SAMPLER_HASH_SEED")
+	config.BindEnv("apm_config.error_tracking_standalone.enabled", "DD_APM_ERROR_TRACKING_STANDALONE_ENABLED")
 
 	config.BindEnv("apm_config.max_memory", "DD_APM_MAX_MEMORY")
 	config.BindEnv("apm_config.max_cpu_percent", "DD_APM_MAX_CPU_PERCENT")

@@ -561,8 +561,15 @@ func (a *Agent) ProcessStats(in *pb.ClientStatsPayload, lang, tracerVersion stri
 	a.ClientStatsAggregator.In <- a.processStats(in, lang, tracerVersion)
 }
 
-// sample performs all sampling on the processedTrace modifying it as needed and returning if the trace should be kept and the number of events in the trace
+// sample performs all sampling on the processedTrace modifying it as needed and returning if the trace should be kept
+// and the number of events in the trace
 func (a *Agent) sample(now time.Time, ts *info.TagStats, pt *traceutil.ProcessedTrace) (keep bool, numEvents int) {
+	// If the agent is set for Error Tracking Standalone only the ErrorSampler is run (bypasses all other samplers).
+	// Trace chunks that don't contain errors are dropped.
+	if a.conf.ErrorTrackingStandalone {
+		return a.errorSampling(now, ts, pt)
+	}
+
 	// We have a `keep` that is different from pt's `DroppedTrace` field as `DroppedTrace` will be sent to intake.
 	// For example: We want to maintain the overall trace level sampling decision for a trace with Analytics Events
 	// where a trace might be marked as DroppedTrace true, but we still sent analytics events in that ProcessedTrace.
@@ -605,6 +612,48 @@ func isManualUserDrop(pt *traceutil.ProcessedTrace) bool {
 	return dm == manualSampling
 }
 
+// errorSampling reports trace chunks with errors and tags spans as Error Tracking Backend Standalone.
+// Also sets "DroppedTrace" on the chunk.
+func (a *Agent) errorSampling(now time.Time, ts *info.TagStats, pt *traceutil.ProcessedTrace) (keep bool, numEvents int) {
+	sampled := a.runErrorSampler(now, *pt)
+	numEvents = len(a.getAnalyzedEvents(pt, ts))
+	if sampled {
+		for _, span := range pt.TraceChunk.Spans {
+			if span.Error != 0 || spanContainsExceptionSpanEvent(span) {
+				span.Meta["_dd.error_tracking_backend_standalone.error"] = "true"
+			} else {
+				span.Meta["_dd.error_tracking_backend_standalone.error"] = "false"
+			}
+		}
+	}
+	pt.TraceChunk.DroppedTrace = !sampled
+	return sampled, numEvents
+}
+
+// spanContainsExceptionSpanEvent reports whether span carries the "_dd.span_events.has_exception"
+// meta tag with the value "true". A missing tag reads as the empty string and therefore
+// reports false.
+func spanContainsExceptionSpanEvent(span *pb.Span) bool {
+	return span.Meta["_dd.span_events.has_exception"] == "true"
+}
+
+// runErrorSampler returns the decision of the agent's ErrorsSampler for pt. The sampler is only
+// consulted when at least one span of the chunk has an error or an exception span event;
+// otherwise the chunk is not kept.
+func (a *Agent) runErrorSampler(now time.Time, pt traceutil.ProcessedTrace) (keep bool) {
+	spans := pt.TraceChunk.Spans
+	if !traceContainsErrorOrExceptionSpanEvent(spans) {
+		return false
+	}
+	return a.ErrorsSampler.Sample(now, spans, pt.Root, pt.TracerEnv)
+}
+
+// traceContainsErrorOrExceptionSpanEvent reports whether any span of trace has a non-zero Error
+// field or is flagged as containing an exception span event.
+func traceContainsErrorOrExceptionSpanEvent(trace pb.Trace) bool {
+	for i := range trace {
+		s := trace[i]
+		if s.Error == 0 && !spanContainsExceptionSpanEvent(s) {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
 // traceSampling reports whether the chunk should be kept as a trace, setting "DroppedTrace" on the chunk
 func (a *Agent) traceSampling(now time.Time, ts *info.TagStats, pt *traceutil.ProcessedTrace) (keep bool, checkAnalyticsEvents bool) {
 	sampled, check := a.runSamplers(now, ts, *pt)

@@ -1273,7 +1273,7 @@ func TestSampling(t *testing.T) {
 	}
 }
 
-func TestSample(t *testing.T) {
+func TestSampleTrace(t *testing.T) {
 	now := time.Now()
 	cfg := &config.AgentConfig{TargetTPS: 5, ErrorTPS: 1000, Features: make(map[string]struct{})}
 	genSpan := func(decisionMaker string, priority sampler.SamplingPriority, err int32) traceutil.ProcessedTrace {
@@ -1357,6 +1357,133 @@ func TestSample(t *testing.T) {
 	}
 }
 
+// TestSample exercises Agent.sample with Error Tracking Standalone (ETS) both disabled and
+// enabled. Each case is sampled twice: once with the default feature set and once with the
+// "error_rare_sample_tracer_drop" feature turned on.
+func TestSample(t *testing.T) {
+	now := time.Now()
+	cfg := &config.AgentConfig{TargetTPS: 5, ErrorTPS: 1000, Features: make(map[string]struct{})}
+	// genSpan builds a single-span processed trace with the given decision maker tag, sampling
+	// priority, error flag and (optionally) the exception span event marker on the root span.
+	genSpan := func(decisionMaker string, priority sampler.SamplingPriority, err int32, exceptionInSpanEvent bool) traceutil.ProcessedTrace {
+		root := &pb.Span{
+			Service:  "serv1",
+			Start:    now.UnixNano(),
+			Duration: (100 * time.Millisecond).Nanoseconds(),
+			Metrics:  map[string]float64{"_top_level": 1},
+			Error:    err, // If 1, the Error Sampler will keep the trace, if 0, it will not be sampled
+			Meta:     map[string]string{},
+		}
+		if exceptionInSpanEvent {
+			root.Meta["_dd.span_events.has_exception"] = "true" // the Error Sampler will keep the trace
+		}
+		chunk := testutil.TraceChunkWithSpan(root)
+		if decisionMaker != "" {
+			chunk.Tags["_dd.p.dm"] = decisionMaker
+		}
+		pt := traceutil.ProcessedTrace{TraceChunk: chunk, Root: root}
+		pt.TraceChunk.Priority = int32(priority)
+		return pt
+	}
+	statsd := &statsd.NoOpClient{}
+	tests := map[string]struct {
+		trace           traceutil.ProcessedTrace
+		etsEnabled      bool // value assigned to cfg.ErrorTrackingStandalone for the case
+		keep            bool // expected decision with the default feature set
+		keepWithFeature bool // expected decision with "error_rare_sample_tracer_drop" enabled
+	}{
+		"userdrop-error-manual-dm-unsampled": {
+			trace:           genSpan("-4", sampler.PriorityUserDrop, 1, false),
+			keep:            false,
+			keepWithFeature: false,
+		},
+		"userkeep-error-no-dm-sampled": {
+			trace:           genSpan("", sampler.PriorityUserKeep, 1, false),
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"userkeep-error-agent-dm-sampled": {
+			trace:           genSpan("-1", sampler.PriorityUserKeep, 1, false),
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"autodrop-error-sampled": {
+			trace:           genSpan("", sampler.PriorityAutoDrop, 1, false),
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"autodrop-not-sampled": {
+			trace:           genSpan("", sampler.PriorityAutoDrop, 0, false),
+			keep:            false,
+			keepWithFeature: false,
+		},
+		"ets-userdrop-error-manual-dm-unsampled": {
+			trace:           genSpan("-4", sampler.PriorityUserDrop, 1, false),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-userdrop-errorspanevent-manual-dm-unsampled": {
+			// No error flag on the span: only the exception span event marks it as an error.
+			trace:           genSpan("-4", sampler.PriorityUserDrop, 0, true),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-userdrop-manual-dm-unsampled": {
+			trace:           genSpan("-4", sampler.PriorityUserDrop, 0, false),
+			etsEnabled:      true,
+			keep:            false,
+			keepWithFeature: false,
+		},
+		"ets-userkeep-error-no-dm-sampled": {
+			trace:           genSpan("", sampler.PriorityUserKeep, 1, false),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-userkeep-error-agent-dm-sampled": {
+			trace:           genSpan("-1", sampler.PriorityUserKeep, 1, false),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-autodrop-error-sampled": {
+			trace:           genSpan("", sampler.PriorityAutoDrop, 1, false),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-autodrop-errorspanevent-sampled": {
+			trace:           genSpan("", sampler.PriorityAutoDrop, 0, true),
+			etsEnabled:      true,
+			keep:            true,
+			keepWithFeature: true,
+		},
+		"ets-autodrop-not-sampled": {
+			trace:           genSpan("", sampler.PriorityAutoDrop, 0, false),
+			etsEnabled:      true,
+			keep:            false,
+			keepWithFeature: false,
+		},
+	}
+	for name, tt := range tests {
+		cfg.ErrorTrackingStandalone = tt.etsEnabled
+		a := &Agent{
+			NoPrioritySampler: sampler.NewNoPrioritySampler(cfg, statsd),
+			ErrorsSampler:     sampler.NewErrorsSampler(cfg, statsd),
+			PrioritySampler:   sampler.NewPrioritySampler(cfg, &sampler.DynamicConfig{}, statsd),
+			RareSampler:       sampler.NewRareSampler(config.New(), statsd),
+			EventProcessor:    newEventProcessor(cfg, statsd),
+			conf:              cfg,
+		}
+		t.Run(name, func(t *testing.T) {
+			keep, _ := a.sample(now, info.NewReceiverStats().GetTagStats(info.Tags{}), &tt.trace)
+			assert.Equal(t, tt.keep, keep)
+			assert.Equal(t, !tt.keep, tt.trace.TraceChunk.DroppedTrace)
+			cfg.Features["error_rare_sample_tracer_drop"] = struct{}{}
+			defer delete(cfg.Features, "error_rare_sample_tracer_drop")
+			keep, _ = a.sample(now, info.NewReceiverStats().GetTagStats(info.Tags{}), &tt.trace)
+			assert.Equal(t, tt.keepWithFeature, keep)
+			assert.Equal(t, !tt.keepWithFeature, tt.trace.TraceChunk.DroppedTrace)
+		})
+	}
+}
+
 func TestSampleManualUserDropNoAnalyticsEvents(t *testing.T) {
 	// This test exists to confirm previous behavior where we did not extract nor tag analytics events on
 	// user manual drop traces

@@ -517,6 +517,12 @@ func (o *OTLPReceiver) convertSpan(rattr map[string]string, lib pcommon.Instrume
 	if in.Events().Len() > 0 {
 		transform.SetMetaOTLP(span, "events", transform.MarshalEvents(in.Events()))
 	}
+	for i := range in.Events().Len() {
+		if in.Events().At(i).Name() == "exception" {
+			span.Meta["_dd.span_events.has_exception"] = "true"
+			break
+		}
+	}
 	if in.Links().Len() > 0 {
 		transform.SetMetaOTLP(span, "_dd.span_links", transform.MarshalLinks(in.Links()))
 	}