From 214f152c5dd794a04616454058d9145467452ced Mon Sep 17 00:00:00 2001 From: Greg Heartsfield Date: Mon, 30 Jan 2023 18:40:47 -0600 Subject: [PATCH] improvement: provide reason for abort in prometheus metric --- src/repo/postgres.rs | 2 +- src/repo/sqlite.rs | 7 ++++--- src/server.rs | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/repo/postgres.rs b/src/repo/postgres.rs index 46417ad..e14d493 100644 --- a/src/repo/postgres.rs +++ b/src/repo/postgres.rs @@ -358,7 +358,7 @@ ON CONFLICT (id) DO NOTHING"#, if last_successful_send + abort_cutoff < Instant::now() { // the queue has been full for too long, abort info!("aborting database query due to slow client"); - metrics.query_aborts.inc(); + metrics.query_aborts.with_label_values(&["slowclient"]).inc(); return Ok(()); } // give the queue a chance to clear before trying again diff --git a/src/repo/sqlite.rs b/src/repo/sqlite.rs index 0d7fd09..5964af7 100644 --- a/src/repo/sqlite.rs +++ b/src/repo/sqlite.rs @@ -323,6 +323,7 @@ impl NostrRepo for SqliteRepo { "shedding DB query load queued for {:?} (cid: {}, sub: {:?})", db_queue_time, client_id, sub.id ); + metrics.query_aborts.with_label_values(&["loadshed"]).inc(); return Ok(()); } // otherwise, report queuing time if it is slow @@ -383,7 +384,7 @@ impl NostrRepo for SqliteRepo { if self.checkpoint_in_progress.try_lock().is_err() { // lock was held, abort this query debug!("query aborted due to checkpoint (cid: {}, sub: {:?})", client_id, sub.id); - metrics.query_aborts.inc(); + metrics.query_aborts.with_label_values(&["checkpoint"]).inc(); return Ok(()); } } @@ -407,7 +408,7 @@ impl NostrRepo for SqliteRepo { // the queue has been full for too long, abort info!("aborting database query due to slow client (cid: {}, sub: {:?})", client_id, sub.id); - metrics.query_aborts.inc(); + metrics.query_aborts.with_label_values(&["slowclient"]).inc(); let ok: Result<()> = Ok(()); return ok; } @@ -415,7 +416,7 @@ impl NostrRepo for SqliteRepo { if self.checkpoint_in_progress.try_lock().is_err() { // lock was held, abort this query debug!("query aborted due to checkpoint (cid: {}, sub: {:?})", client_id, sub.id); - metrics.query_aborts.inc(); + metrics.query_aborts.with_label_values(&["checkpoint"]).inc(); return Ok(()); } // give the queue a chance to clear before trying again diff --git a/src/server.rs b/src/server.rs index 56a18d8..84dce2b 100644 --- a/src/server.rs +++ b/src/server.rs @@ -244,10 +244,10 @@ fn create_metrics() -> (Registry, NostrMetrics) { "nostr_connections_total", "New connections", )).unwrap(); - let query_aborts = IntCounter::with_opts(Opts::new( - "nostr_query_abort_total", - "Aborted queries", - )).unwrap(); + let query_aborts = IntCounterVec::new( + Opts::new("nostr_query_abort_total", "Aborted queries"), + vec!["reason"].as_slice(), + ).unwrap(); let cmd_req = IntCounter::with_opts(Opts::new( "nostr_cmd_req_total", "REQ commands", @@ -834,7 +834,7 @@ pub struct NostrMetrics { pub sent_events: IntCounterVec, // count of events sent to clients pub connections: IntCounter, // count of websocket connections pub disconnects: IntCounterVec, // client disconnects - pub query_aborts: IntCounter, // count of queries aborted by server + pub query_aborts: IntCounterVec, // count of queries aborted by server pub cmd_req: IntCounter, // count of REQ commands received pub cmd_event: IntCounter, // count of EVENT commands received pub cmd_close: IntCounter, // count of CLOSE commands received