improvement: provide reason for abort in prometheus metric

2025-05-05 00:59:56 -04:00 · 2023-01-30 18:40:47 -06:00 · 2023-01-30 18:40:47 -06:00 · 214f152c5d
commit 214f152c5d
parent 3fcaf97a15
3 changed files with 10 additions and 9 deletions
--- a/src/repo/postgres.rs
+++ b/src/repo/postgres.rs
@@ -358,7 +358,7 @@ ON CONFLICT (id) DO NOTHING"#,
                        if last_successful_send + abort_cutoff < Instant::now() {
                            // the queue has been full for too long, abort
                            info!("aborting database query due to slow client");
-                            metrics.query_aborts.inc();
+                            metrics.query_aborts.with_label_values(&["slowclient"]).inc();
                            return Ok(());
                        }
                        // give the queue a chance to clear before trying again
--- a/src/repo/sqlite.rs
+++ b/src/repo/sqlite.rs
@@ -323,6 +323,7 @@ impl NostrRepo for SqliteRepo {
                    "shedding DB query load queued for {:?} (cid: {}, sub: {:?})",
                    db_queue_time, client_id, sub.id
                );
+                metrics.query_aborts.with_label_values(&["loadshed"]).inc();
                return Ok(());
            }
            // otherwise, report queuing time if it is slow
@@ -383,7 +384,7 @@ impl NostrRepo for SqliteRepo {
                                if self.checkpoint_in_progress.try_lock().is_err() {
                                    // lock was held, abort this query
                                    debug!("query aborted due to checkpoint (cid: {}, sub: {:?})", client_id, sub.id);
-                                    metrics.query_aborts.inc();
+                                    metrics.query_aborts.with_label_values(&["checkpoint"]).inc();
                                    return Ok(());
                                }
                            }
@@ -407,7 +408,7 @@ impl NostrRepo for SqliteRepo {
                                // the queue has been full for too long, abort
                                info!("aborting database query due to slow client (cid: {}, sub: {:?})",
                                      client_id, sub.id);
-                                metrics.query_aborts.inc();
+                                metrics.query_aborts.with_label_values(&["slowclient"]).inc();
                                let ok: Result<()> = Ok(());
                                return ok;
                            }
@@ -415,7 +416,7 @@ impl NostrRepo for SqliteRepo {
                            if self.checkpoint_in_progress.try_lock().is_err() {
                                // lock was held, abort this query
                                debug!("query aborted due to checkpoint (cid: {}, sub: {:?})", client_id, sub.id);
-                                metrics.query_aborts.inc();
+                                metrics.query_aborts.with_label_values(&["checkpoint"]).inc();
                                return Ok(());
                            }
                            // give the queue a chance to clear before trying again
--- a/src/server.rs
+++ b/src/server.rs
@@ -244,10 +244,10 @@ fn create_metrics() -> (Registry, NostrMetrics) {
        "nostr_connections_total",
        "New connections",
    )).unwrap();
-    let query_aborts = IntCounter::with_opts(Opts::new(
-        "nostr_query_abort_total",
-        "Aborted queries",
-    )).unwrap();
+    let query_aborts = IntCounterVec::new(
+        Opts::new("nostr_query_abort_total", "Aborted queries"),
+        vec!["reason"].as_slice(),
+    ).unwrap();
    let cmd_req = IntCounter::with_opts(Opts::new(
        "nostr_cmd_req_total",
        "REQ commands",
@@ -834,7 +834,7 @@ pub struct NostrMetrics {
    pub sent_events: IntCounterVec, // count of events sent to clients
    pub connections: IntCounter, // count of websocket connections
    pub disconnects: IntCounterVec, // client disconnects
-    pub query_aborts: IntCounter, // count of queries aborted by server
+    pub query_aborts: IntCounterVec, // count of queries aborted by server
    pub cmd_req: IntCounter, // count of REQ commands received
    pub cmd_event: IntCounter, // count of EVENT commands received
    pub cmd_close: IntCounter, // count of CLOSE commands received