Skip to content

Commit acb94fa

Browse files
authored
feat(metrics): Track memory footprint of metrics buckets (#1284)
To be able to limit the memory footprint of metrics buckets in the aggregator, we need to keep track of the number of elements we store. Instead of measuring the actual memory consumption, we apply a simple model, roughly measuring the bytes needed to encode a bucket: - counter buckets: 8 bytes (f64) - set buckets: number of unique elements * 4 (u32) - distribution buckets: number of unique elements * 12 (f64 + u32) - gauge: 40 bytes (4 * f64 + 1 * u64) To avoid iterating over all the buckets every time we want to query the memory footprint, we keep a map of counters per project key (plus one total count) that is incremented with the footprint delta on every insert.
1 parent 904625c commit acb94fa

File tree

3 files changed

+215
-3
lines changed

3 files changed

+215
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
- Add support for profile outcomes. ([#1272](https://github.com/getsentry/relay/pull/1272))
2222
- Avoid potential panics when scrubbing minidumps. ([#1282](https://github.com/getsentry/relay/pull/1282))
2323
- Fix typescript profile validation. ([#1283](https://github.com/getsentry/relay/pull/1283))
24+
- Track memory footprint of metrics buckets. ([#1284](https://github.com/getsentry/relay/pull/1284))
2425

2526
## 22.5.0
2627

relay-metrics/src/aggregation.rs

Lines changed: 205 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,21 @@ impl BucketValue {
520520
Self::Distribution(m) => m.internal_size(),
521521
}
522522
}
523+
524+
/// Estimates the number of bytes needed to encode the bucket.
525+
/// Note that this does not necessarily match the exact memory footprint of the bucket,
526+
/// because data structures might have a memory overhead.
527+
///
528+
/// This is very similar to [`BucketValue::relative_size`], which can possibly be removed.
529+
pub fn cost(&self) -> usize {
530+
match self {
531+
Self::Counter(_) => 8,
532+
Self::Set(s) => 4 * s.len(),
533+
Self::Gauge(_) => 5 * 8,
534+
// Distribution values are stored as maps of (f64, u32) pairs
535+
Self::Distribution(m) => 12 * m.internal_size(),
536+
}
537+
}
523538
}
524539

525540
impl From<MetricValue> for BucketValue {
@@ -537,7 +552,7 @@ impl From<MetricValue> for BucketValue {
537552
///
538553
/// Currently either a [`MetricValue`] or another `BucketValue`.
539554
trait MergeValue: Into<BucketValue> {
540-
/// Merges `self` into the given `bucket_value`.
555+
/// Merges `self` into the given `bucket_value`. The caller computes the added storage cost by comparing [`BucketValue::cost`] before and after the merge.
541556
///
542557
/// Aggregation is performed according to the rules documented in [`BucketValue`].
543558
fn merge_into(self, bucket_value: &mut BucketValue) -> Result<(), AggregateMetricsError>;
@@ -1018,6 +1033,48 @@ enum AggregatorState {
10181033
ShuttingDown,
10191034
}
10201035

1036+
#[derive(Debug, Default)]
1037+
struct CostTracker {
1038+
total_cost: usize,
1039+
// Choosing a BTreeMap instead of a HashMap here, under the assumption that a BTreeMap
1040+
// is still more efficient for the number of project keys we store.
1041+
cost_per_project_key: BTreeMap<ProjectKey, usize>,
1042+
}
1043+
1044+
impl CostTracker {
1045+
fn add_cost(&mut self, project_key: ProjectKey, cost: usize) {
1046+
self.total_cost += cost;
1047+
let project_cost = self.cost_per_project_key.entry(project_key).or_insert(0);
1048+
*project_cost += cost;
1049+
}
1050+
1051+
fn subtract_cost(&mut self, project_key: ProjectKey, cost: usize) {
1052+
match self.cost_per_project_key.entry(project_key) {
1053+
btree_map::Entry::Vacant(_) => {
1054+
relay_log::error!(
1055+
"Trying to subtract cost for a project key that has not been tracked"
1056+
);
1057+
}
1058+
btree_map::Entry::Occupied(mut entry) => {
1059+
// Handle per-project cost:
1060+
let project_cost = entry.get_mut();
1061+
if cost > *project_cost {
1062+
relay_log::error!("Subtracting a project cost higher than what we tracked");
1063+
self.total_cost = self.total_cost.saturating_sub(*project_cost);
1064+
*project_cost = 0;
1065+
} else {
1066+
*project_cost -= cost;
1067+
self.total_cost = self.total_cost.saturating_sub(cost);
1068+
}
1069+
if *project_cost == 0 {
1070+
// Remove this project_key from the map
1071+
entry.remove();
1072+
}
1073+
}
1074+
};
1075+
}
1076+
}
1077+
10211078
/// A collector of [`Metric`] submissions.
10221079
///
10231080
/// # Aggregation
@@ -1074,6 +1131,7 @@ pub struct Aggregator {
10741131
buckets: HashMap<BucketKey, QueuedBucket>,
10751132
receiver: Recipient<FlushBuckets>,
10761133
state: AggregatorState,
1134+
cost_tracker: CostTracker,
10771135
}
10781136

10791137
impl Aggregator {
@@ -1087,6 +1145,7 @@ impl Aggregator {
10871145
buckets: HashMap::new(),
10881146
receiver,
10891147
state: AggregatorState::Running,
1148+
cost_tracker: CostTracker::default(),
10901149
}
10911150
}
10921151

@@ -1200,14 +1259,19 @@ impl Aggregator {
12001259

12011260
let key = Self::validate_bucket_key(key, &self.config)?;
12021261

1262+
let added_cost;
12031263
match self.buckets.entry(key) {
12041264
Entry::Occupied(mut entry) => {
12051265
relay_statsd::metric!(
12061266
counter(MetricCounters::MergeHit) += 1,
12071267
metric_type = entry.key().metric_type.as_str(),
12081268
metric_name = &entry.key().metric_name
12091269
);
1210-
value.merge_into(&mut entry.get_mut().value)?;
1270+
let bucket_value = &mut entry.get_mut().value;
1271+
let cost_before = bucket_value.cost();
1272+
value.merge_into(bucket_value)?;
1273+
let cost_after = bucket_value.cost();
1274+
added_cost = cost_after.saturating_sub(cost_before);
12111275
}
12121276
Entry::Vacant(entry) => {
12131277
relay_statsd::metric!(
@@ -1222,10 +1286,14 @@ impl Aggregator {
12221286
);
12231287

12241288
let flush_at = self.config.get_flush_time(timestamp, project_key);
1225-
entry.insert(QueuedBucket::new(flush_at, value.into()));
1289+
let bucket = value.into();
1290+
added_cost = bucket.cost();
1291+
entry.insert(QueuedBucket::new(flush_at, bucket));
12261292
}
12271293
}
12281294

1295+
self.cost_tracker.add_cost(project_key, added_cost);
1296+
12291297
Ok(())
12301298
}
12311299

@@ -1299,18 +1367,32 @@ impl Aggregator {
12991367
pub fn pop_flush_buckets(&mut self) -> HashMap<ProjectKey, Vec<Bucket>> {
13001368
relay_statsd::metric!(gauge(MetricGauges::Buckets) = self.buckets.len() as u64);
13011369

1370+
// We only emit statsd metrics for the cost on flush (and not when merging the buckets),
1371+
// assuming that this gives us more than enough data points.
1372+
relay_statsd::metric!(
1373+
gauge(MetricGauges::BucketsCost) = self.cost_tracker.total_cost as u64
1374+
);
1375+
for cost in self.cost_tracker.cost_per_project_key.values() {
1376+
relay_statsd::metric!(
1377+
histogram(MetricHistograms::BucketsCostPerProjectKey) = *cost as f64
1378+
);
1379+
}
1380+
13021381
let mut buckets = HashMap::<ProjectKey, Vec<Bucket>>::new();
13031382

13041383
let force = matches!(&self.state, AggregatorState::ShuttingDown);
13051384

13061385
relay_statsd::metric!(timer(MetricTimers::BucketsScanDuration), {
13071386
let bucket_interval = self.config.bucket_interval;
1387+
let cost_tracker = &mut self.cost_tracker;
13081388
self.buckets.retain(|key, entry| {
13091389
if force || entry.elapsed() {
13101390
// Take the value and leave a placeholder behind. It'll be removed right after.
13111391
let value = std::mem::replace(&mut entry.value, BucketValue::Counter(0.0));
1392+
cost_tracker.subtract_cost(key.project_key, value.cost());
13121393
let bucket = Bucket::from_parts(key.clone(), bucket_interval, value);
13131394
buckets.entry(key.project_key).or_default().push(bucket);
1395+
13141396
false
13151397
} else {
13161398
true
@@ -1883,6 +1965,24 @@ mod tests {
18831965
);
18841966
}
18851967

1968+
#[test]
1969+
fn test_bucket_value_cost() {
1970+
let counter = BucketValue::Counter(123.0);
1971+
assert_eq!(counter.cost(), 8);
1972+
let set = BucketValue::Set(vec![1, 2, 3, 4, 5].into_iter().collect());
1973+
assert_eq!(set.cost(), 20);
1974+
let distribution = BucketValue::Distribution(dist![1., 2., 3.]);
1975+
assert_eq!(distribution.cost(), 36);
1976+
let gauge = BucketValue::Gauge(GaugeValue {
1977+
max: 43.,
1978+
min: 42.,
1979+
sum: 85.,
1980+
last: 43.,
1981+
count: 2,
1982+
});
1983+
assert_eq!(gauge.cost(), 40);
1984+
}
1985+
18861986
#[test]
18871987
fn test_aggregator_merge_counters() {
18881988
relay_test::setup();
@@ -2059,6 +2159,108 @@ mod tests {
20592159
assert_eq!(aggregator.buckets.len(), 2);
20602160
}
20612161

2162+
#[test]
2163+
fn test_cost_tracker() {
2164+
let project_key1 = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fed").unwrap();
2165+
let project_key2 = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fee").unwrap();
2166+
let project_key3 = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fef").unwrap();
2167+
let mut cost_tracker = CostTracker::default();
2168+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2169+
CostTracker {
2170+
total_cost: 0,
2171+
cost_per_project_key: {},
2172+
}
2173+
"###);
2174+
cost_tracker.add_cost(project_key1, 100);
2175+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2176+
CostTracker {
2177+
total_cost: 100,
2178+
cost_per_project_key: {
2179+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fed"): 100,
2180+
},
2181+
}
2182+
"###);
2183+
cost_tracker.add_cost(project_key2, 200);
2184+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2185+
CostTracker {
2186+
total_cost: 300,
2187+
cost_per_project_key: {
2188+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fed"): 100,
2189+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fee"): 200,
2190+
},
2191+
}
2192+
"###);
2193+
// Unknown project: Will log error, but not crash
2194+
cost_tracker.subtract_cost(project_key3, 666);
2195+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2196+
CostTracker {
2197+
total_cost: 300,
2198+
cost_per_project_key: {
2199+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fed"): 100,
2200+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fee"): 200,
2201+
},
2202+
}
2203+
"###);
2204+
// Subtract too much: Will log error, but not crash
2205+
cost_tracker.subtract_cost(project_key1, 666);
2206+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2207+
CostTracker {
2208+
total_cost: 200,
2209+
cost_per_project_key: {
2210+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fee"): 200,
2211+
},
2212+
}
2213+
"###);
2214+
cost_tracker.subtract_cost(project_key2, 20);
2215+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2216+
CostTracker {
2217+
total_cost: 180,
2218+
cost_per_project_key: {
2219+
ProjectKey("a94ae32be2584e0bbd7a4cbb95971fee"): 180,
2220+
},
2221+
}
2222+
"###);
2223+
cost_tracker.subtract_cost(project_key2, 180);
2224+
insta::assert_debug_snapshot!(cost_tracker, @r###"
2225+
CostTracker {
2226+
total_cost: 0,
2227+
cost_per_project_key: {},
2228+
}
2229+
"###);
2230+
}
2231+
2232+
#[test]
2233+
fn test_aggregator_cost_tracking() {
2234+
// Make sure that the right cost is added / subtracted
2235+
let receiver = TestReceiver::start_default().recipient();
2236+
let mut aggregator = Aggregator::new(test_config(), receiver);
2237+
let project_key = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fed").unwrap();
2238+
2239+
let mut metric = Metric {
2240+
name: "c:foo".to_owned(),
2241+
unit: MetricUnit::None,
2242+
value: MetricValue::Counter(42.),
2243+
timestamp: UnixTimestamp::from_secs(999994711),
2244+
tags: BTreeMap::new(),
2245+
};
2246+
for (metric_value, expected_total_cost) in [
2247+
(MetricValue::Counter(42.), 8),
2248+
(MetricValue::Counter(42.), 8), // counters have constant size
2249+
(MetricValue::Set(123), 12), // 8 + 1*4
2250+
(MetricValue::Set(123), 12), // Same element in set, no change
2251+
(MetricValue::Set(456), 16), // Different element in set -> +4
2252+
(MetricValue::Distribution(1.0), 28), // 1 unique element -> +12
2253+
(MetricValue::Distribution(1.0), 28), // no new element
2254+
(MetricValue::Distribution(2.0), 40), // 1 new element -> +12
2255+
(MetricValue::Gauge(0.3), 80),
2256+
(MetricValue::Gauge(0.2), 80), // gauge has constant size
2257+
] {
2258+
metric.value = metric_value;
2259+
aggregator.insert(project_key, metric.clone()).unwrap();
2260+
assert_eq!(aggregator.cost_tracker.total_cost, expected_total_cost);
2261+
}
2262+
}
2263+
20622264
#[test]
20632265
fn test_flush_bucket() {
20642266
relay_test::setup();

relay-metrics/src/statsd.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ pub enum MetricHistograms {
103103
/// - `backdated`: A flag indicating whether the metric was reported within the `initial_delay`
104104
/// time period (`false`) or after the initial delay has expired (`true`).
105105
BucketsDelay,
106+
107+
/// The storage cost of metrics buckets stored in Relay's metrics aggregator, for a project key.
108+
///
109+
/// See also [`MetricGauges::BucketsCost`].
110+
BucketsCostPerProjectKey,
106111
}
107112

108113
impl HistogramMetric for MetricHistograms {
@@ -112,6 +117,7 @@ impl HistogramMetric for MetricHistograms {
112117
Self::BucketsFlushedPerProject => "metrics.buckets.flushed_per_project",
113118
Self::BucketRelativeSize => "metrics.buckets.relative_bucket_size",
114119
Self::BucketsDelay => "metrics.buckets.delay",
120+
Self::BucketsCostPerProjectKey => "metrics.buckets.cost_per_project_key",
115121
}
116122
}
117123
}
@@ -120,12 +126,15 @@ impl HistogramMetric for MetricHistograms {
120126
pub enum MetricGauges {
121127
/// The total number of metric buckets in Relay's metrics aggregator.
122128
Buckets,
129+
/// The total storage cost of metric buckets in Relay's metrics aggregator.
130+
BucketsCost,
123131
}
124132

125133
impl GaugeMetric for MetricGauges {
126134
fn name(&self) -> &'static str {
127135
match *self {
128136
Self::Buckets => "metrics.buckets",
137+
Self::BucketsCost => "metrics.buckets.cost",
129138
}
130139
}
131140
}

0 commit comments

Comments
 (0)