1
//! Macros to make it easier to work with rate-limited logs
2
//!
3

            
4
/// Log a rate-limited failure message.
5
///
6
/// This macro looks at a single `Result<T,E>`,
7
/// and keeps track of how many times the `Ok` and `Err` branches are seen.
8
/// After a delay, it reports via [`tracing::event!`]
9
/// how many errors it has seen since its last report.
10
/// (It only reports an absence of errors
11
/// when such an absence has followed
12
/// an error report.)
13
///
14
/// ## A simple example
15
///
16
/// ```
17
/// # use std::num::ParseIntError;
18
/// # let s = "1234";
19
/// # let source = "the cache";
20
/// use tor_log_ratelim::log_ratelim;
21
///  let r: Result<u8, ParseIntError> = s.parse();
22
///
23
/// log_ratelim!(
24
///   // The activity we were doing.
25
///   "Parsing a value from {}", source;
26
///   // The result we got.
27
///   r;
28
/// );
29
/// ```
30
///
31
/// This invocation could report errors like
32
/// ```text
33
/// WARN: Parsing a value from the cache: error (occurred 9/12 times in the last 5 minutes): number too large to fit in target type
34
/// ```
35
///
36
/// After a while without errors, it might log:
37
/// ```text
38
/// WARN: Parsing a value from the cache: now working (occurred 0/100 times in th last hour)
39
/// ```
40
///
41
/// ## Important concept: Activities
42
///
43
/// Every invocation of `log_ratelim!` defines a _set_ of rate limits
44
/// with respect to a collection of **activities**.  
45
/// Each separate **activity** value gets its own rate limit.
46
/// This lets you have separate rate limits for different operations,
47
/// such as connecting to different parties,
48
/// or invoking different programs.
49
///
50
/// Typical activities might be
51
/// `"Connecting to port {}", p`
52
/// or
53
/// `"Trying to start program {}", p`
54
///
55
/// (These activities should be described using a verb ending with "-ing",
56
/// to make the output nice.)
57
///
58
/// ## Requirements on the error type.
59
///
60
/// The error type for each `Result` passed to this macro must implement:
61
///  * [`Clone`]
62
///  * [`Send`]
63
///  * [`Error`](std::error::Error)
64
///
65
/// ## Reports are representative
66
///
67
/// No matter how many failures are seen between log messages,
68
/// `log_ratelim!` only records and reports
69
/// one error for each time it logs.
70
///
71
/// Its current behavior is to record and report
72
/// the _first_ error for each logged interval,
73
/// and discard the others.  
74
/// This can lead to confusing results if the error is not representative.
75
///
76
/// ## Advanced syntax
77
///
78
/// The `log_ratelim` macro can record additional information for its
79
/// representative error report,
80
/// and can log information on successes as well.
81
///
82
/// A full invocation of `log_ratelim!` looks like this:
83
///
84
/// ```
85
/// # use std::num::ParseIntError;
86
/// # let s = "1234";
87
/// # let source = "the cache";
88
/// # let more_information = |_| "";
89
/// use tor_log_ratelim::log_ratelim;
90
/// let r: Result<u8, ParseIntError> = s.parse();
91
/// log_ratelim!(
92
///   "Parsing a value from {}", source;
93
///   r;
94
///   Err(x) => WARN, "The problem was {}", more_information(x);
95
///   Ok(v) => TRACE, "Parsed {} successfully", v;
96
/// );
97
/// ```
98
///
99
/// Here the clause starting with `Err(x)`
100
/// tells the logger to include a message along with the error report,
101
/// and we explicitly specifies the level at which
102
/// to log our failure reports.
103
///
104
/// Note that the error itself is **always** reported;
105
/// there is no need to say
106
/// `Err(e) => WARN, "{}", e`.
107
/// In fact, doing so will create a redundant report of
108
/// the error.
109
//
110
// TODO: I don't think it makes sense to have an Ok() logger.
111
// Instead, we could just say
112
//    log_ratelim!("Parsing a value from {}", source; r;);
113
//    trace!("parsed value from {}: {:?}", source, r);
114
// This is probably better, since it logs a trace for every occurrence.
115
//
116
/// The clause starting with `Ok(v)` tells the logger what to do on success:
117
/// each individual success causes a _non-rate-limited_
118
/// message at TRACE level.
119
///
120
/// The `Ok() ...` clause
121
/// and the `Err() ...` clause
122
/// are both optional.
123
///
124
/// Within the Err() clause,
125
/// the format string and its arguments
126
/// are optional.
127
//
128
// TODO performance notes:
129
//
130
// There are many opportunities for possibly making this code go faster:
131
//  - Optimize the case where there is only one activity.
132
//  - Use a non-string key to distinguish activities, to avoid formatting
133
//    the activity string needlessly.
134
//  - Limit copies (of activity and of error).
135
//  - Use Event and Metadata from the tracing crate to defer formatting
136
//  - Check Metadata early for the case where we don't want to report the
137
//    event at all.
138
//
139
// Let's not pursue any of these until we know that this code actually
140
// shows up in a critical path.
141
#[macro_export]
142
macro_rules! log_ratelim {
143
  // ====
144
  // Actual implementation for rate-limited logging.
145
  // ====
146

            
147
  // Nobody invokes this syntax directly; it's used as a common body for the
148
  // various other syntaxes.
149
  {
150
    @impl activity_format: ( $act_fmt:literal $(, $act_arg:expr)* ) ;
151
          result: ($result:expr ) ;
152
          on_error: (Err($err_pat:pat), $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? );
153
          $( on_ok: (Ok($ok_pat:pat), $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr)*  ); )?
154
  } => {
155
    #[allow(clippy::redundant_closure_call)]
156
166
    (||{
157
    use $crate::macro_prelude::*;
158
166
    let Some(runtime) = rt_support() else {
159
      // Nobody has called `install_runtime()`: we should just log whatever
160
      // happened and not worry about the rate-limiting.
161
166
      match &$result {
162
        #[allow(clippy::redundant_pattern)]
163
        Err(the_error @ $err_pat) => {
164
          tracing::event!(
165
            tracing::Level::$err_level,
166
            concat!($act_fmt, $(": ", $err_fmt, )? ": {}"),
167
            $($act_arg,)*
168
            $( $($err_arg, )* )?
169
            the_error.report()
170
          );
171
        }
172
        $(Ok($ok_pat) => {
173
          tracing::event!(
174
            tracing::Level::$ok_level,
175
            $ok_fmt
176
            $(, $ok_arg)*
177
          );
178
        })?
179
        #[allow(unreachable_patterns)]
180
166
        Ok(_) => {}
181
      }
182
166
      return;
183
    };
184

            
185
    /// An implementation of Loggable for this log message.
186
    //
187
    // We use a separate implementation here so that the tracing metadata will get
188
    // constructed correctly.  If we called tracing::event! from a location in
189
    // `tor-log-ratelim`, all the messages would appear to originate from there.
190
    //
191
    // (TODO: We could use tracing::Metadata explicitly, perhaps? That might be hard.)
192
    struct Lg {
193
        /// Internal state of the log.
194
        state: LogState,
195
        /// The activity status the last time we flushed.
196
        last_activity: Option<Activity>,
197
    }
198
    impl Loggable for Lg {
199
        fn flush(&mut self, summarizing: std::time::Duration) -> Activity {
200
            let activity = self.state.activity();
201
            match activity {
202
               Activity::Active => {
203
                  tracing::event!(
204
                      tracing::Level::$err_level,
205
                      "{}",
206
                      self.state.display_problem(summarizing)
207
                  );
208
               }
209
               Activity::AppearsResolved => {
210
                   // Don't want to log an update if we're continuing to see successes and no
211
                   // failures.
212
                   if self.last_activity != Some(Activity::AppearsResolved) {
213
                       tracing::event!(
214
                           // Using err_level here is in some respects confusing:
215
                           // if the _presence_ of the problem is (say) a WARN,
216
                           // why should its newfound absence also be a WARN?
217
                           //
218
                           // We have had to decide which is worse:
219
                           // that a user only watching WARNs
220
                           // might not see a problem has gone away,
221
                           // or that a non-problem would be reported
222
                           // at an excessive severity.
223
                           // We went with the latter.
224
                           tracing::Level::$err_level,
225
                           "{}",
226
                           self.state.display_recovery(summarizing)
227
                       );
228
                   }
229
               }
230
               // There have been no new successes or failures, so there's no update to report.
231
               Activity::Dormant => {}
232
            }
233
            self.state.reset();
234
            self.last_activity = Some(activity);
235
            activity
236
        }
237
    }
238

            
239
    /// A lazy map from activity keys to weak RateLim handles.
240
    //
241
    // The strong reference for each RateLim is held by a task that flushes
242
    // the logger as appropriate, and drops the strong reference once it's
243
    // quiescent.
244
    static LOGGERS: LazyLock<Mutex<WeakValueHashMap<String, Weak<RateLim<Lg>>>>> =
245
    LazyLock::new(|| Mutex::new(WeakValueHashMap::new()));
246

            
247
    // We assign a separate rate limit for each activity.
248
    // For now, this is string-ly typed.
249
    let activity = format!($act_fmt $(, $act_arg)*);
250
    let key = activity.clone();
251

            
252
    match &$result {
253
      #[allow(clippy::redundant_pattern)]
254
      Err(the_error @ $err_pat) => {
255
        // The operation failed.
256
        //
257
        // 1) Create a rate-limited logger for this activity if one  did not
258
        //    already exist.
259
        let logger = LOGGERS
260
          .lock()
261
          .expect("poisoned lock")
262
          .entry(key)
263
          .or_insert_with(|| RateLim::new(Lg {
264
              state: LogState::new(activity),
265
              last_activity: None,
266
          }));
267
        // 2) Note failure in the activity with note_fail().
268
        logger.event(runtime, |lg| lg.state.note_fail(||
269
          // 2b) If this is the first time that this activity failed since the
270
          //     last flush, record the formatted err_msg, and a Clone of the error.
271
          (
272
            $crate::log_ratelim!{@first_nonempty
273
              { $( Some(format!($err_fmt $(, $err_arg)* )) )? }
274
              { None }
275
            },
276
            Some(Box::new(the_error.clone()))
277
          )
278
        ));
279
      }
280
      Ok($crate::log_ratelim!{@first_nonempty { $($ok_pat)? } {_} }) => {
281
        // The operation succeeded.
282
        //
283
        // 1) If this activity is tracked, call note_ok() on it.
284
        if let Some(logger) = LOGGERS
285
          .lock()
286
          .expect("poisoned lock")
287
          .get(&key) {
288
            logger.nonevent(|lg| lg.state.note_ok());
289
          }
290
        // 2) If we have a per-success item to log, log it.
291
        $(
292
        tracing::event!(tracing::Level::$ok_level, $ok_fmt $(, $ok_arg )* );
293
        )?
294
      }
295
    }
296
  })()
297
  };
298

            
299
  // ======
300
  // Exposed, documented syntax.
301
  // ======
302

            
303
  // Regular invocation with an Err(_) case.
304
  {
305
    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
306
    $result:expr ;
307
    Err($err_pat:pat) => $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? $(,)?
308
    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)?  )?
309
    $(;)?
310
  } => {
311
    $crate::log_ratelim!{
312
      @impl
313
        activity_format: ( $act_fmt $(, $act_arg)* );
314
        result: ($result);
315
        on_error: (Err($err_pat), $err_level $(, $err_fmt $(, $err_arg)* )? );
316
        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
317
    }
318
  };
319
  // Regular invocation with no Err(_) case.
320
  {
321
    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
322
    $result:expr
323
    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)? )?
324
    $(;)?
325
  } => {
326
    $crate::log_ratelim!{
327
      @impl
328
        activity_format: ( $act_fmt $(, $act_arg)* );
329
        result: ($result);
330
        on_error: (Err(_), WARN);
331
        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
332
    }
333
  };
334

            
335
  // Expand to the first of two bodies that has at least one token in it.
336
  { @first_nonempty { $($a:tt)+ } { $($b:tt)* }} => { $($a)+ };
337
  { @first_nonempty { } { $($b:tt)* } } => { $($b)+ };
338
}
339

            
340
#[cfg(test)]
341
mod test_syntax {
342
    #![allow(dead_code)]
343

            
344
    #[derive(Clone, Debug, thiserror::Error)]
345
    enum MyErr {
346
        #[error("it didn't work")]
347
        DidntWork,
348
    }
349
    impl MyErr {
350
        fn badness(&self) -> u8 {
351
            3
352
        }
353
    }
354

            
355
    /// This doesn't actually run or test anything; it just makes sure that all
356
    /// the different syntaxes work.
357
    fn various_syntaxes(friend: &str, r: &Result<u32, MyErr>) {
358
        log_ratelim!(
359
          "saying hi to {}", friend;
360
          r;
361
        );
362

            
363
        log_ratelim!(
364
          "saying hi to {}", friend;
365
          r;
366
          Err(_) => WARN;
367
        );
368

            
369
        log_ratelim!(
370
          "saying hi to {}", friend;
371
          r;
372
          Err(e) => WARN, "badness={}", e.badness();
373
        );
374

            
375
        log_ratelim!(
376
          "saying hi to {}", friend;
377
          r;
378
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
379
        );
380

            
381
        log_ratelim!(
382
          "saying hi to {}", friend;
383
          r;
384
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
385
        );
386

            
387
        log_ratelim!(
388
          "saying hi to {}", friend;
389
          r;
390
          Err(e) => WARN, "badness={}", e.badness();
391
          Ok(v) => TRACE, "nothing bad happened; v={}", v;
392
        );
393
    }
394
}