FEATURE: Differentiate between restarted, failed and dead jobs (#57)

  • restarted jobs metric should be its own metric

  • wait for empty send queue before killing worker thread

  • queue wait timeout should be optionally passed to client.stop

  • add dead jobs counter

  • fix collector_test

  • change default wait_timeout_seconds to 0

  • use Sidekiq's default max retry constant

  • use Sidekiq's death_handlers for dead_jobs_total

  • only configure sidekiq if sidekiq is loaded

  • revert now unnecessary changes to test

  • simplify death_handler helper to only return handler

  • update README to include Sidekiq death_handler

  • Update CHANGELOG

  • Update README to include explicit client shutdown
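
Taken together, the diff below wires these pieces up roughly as follows. This is a minimal sketch assembled from the README additions in this commit; the 10-second timeout is only an illustrative value.

```ruby
require 'prometheus_exporter/client'
require 'prometheus_exporter/instrumentation'

Sidekiq.configure_server do |config|
  # Existing middleware: reports duration and success/failure and, with this
  # change, whether the job was interrupted by a Sidekiq shutdown.
  config.server_middleware do |chain|
    chain.add PrometheusExporter::Instrumentation::Sidekiq
  end

  # New: register a death handler so jobs that exhaust their retries are
  # counted in sidekiq_dead_jobs_total.
  config.death_handlers << PrometheusExporter::Instrumentation::Sidekiq.death_handler

  # New: stop the client explicitly on exit so metrics queued right before
  # shutdown (e.g. sidekiq_restarted_jobs_total) still reach the collector.
  at_exit do
    PrometheusExporter::Client.default.stop(wait_timeout_seconds: 10)
  end
end
```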

diff --git a/CHANGELOG b/CHANGELOG
index b5886f7..e59f5a0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,8 @@
+0.4.5 - 22-02-2019
+
+- Feature: Add sidekiq metrics: restarted, dead jobs counters
+- Fix: Client shutting down before sending metrics to collector
+
 0.4.4 - 14-02-2019
 
 - Feature: Allow process collector to ship custom labels for all process metrics
diff --git a/README.md b/README.md
index c025ba7..4855e03 100644
--- a/README.md
+++ b/README.md
@@ -196,7 +196,7 @@ end
 
 #### Sidekiq metrics
 
-Including Sidekiq metrics (how many jobs ran? how many failed? how long did they take?)
+Including Sidekiq metrics (how many jobs ran? how many failed? how long did they take? how many are dead? how many were restarted?)
 
 ```ruby
 Sidekiq.configure_server do |config|
@@ -204,6 +204,7 @@ Sidekiq.configure_server do |config|
       require 'prometheus_exporter/instrumentation'
       chain.add PrometheusExporter::Instrumentation::Sidekiq
    end
+   config.death_handlers << PrometheusExporter::Instrumentation::Sidekiq.death_handler
 end
 ```
 
@@ -218,6 +219,16 @@ Sidekiq.configure_server do |config|
 end
 ```
 
+Sometimes the Sidekiq server shuts down before it can send metrics, that were generated right before the shutdown, to the collector. Especially if you care about the `sidekiq_restarted_jobs_total` metric, it is a good idea to explicitly stop the client:
+
+`‍``ruby
+  Sidekiq.configure_server do |config|
+    at_exit do
+      PrometheusExporter::Client.default.stop(wait_timeout_seconds: 10)
+    end
+  end
+`‍``
+
 #### Delayed Job plugin
 
 In an initializer:
diff --git a/lib/prometheus_exporter/client.rb b/lib/prometheus_exporter/client.rb
index f7d80fa..21b6835 100644
--- a/lib/prometheus_exporter/client.rb
+++ b/lib/prometheus_exporter/client.rb
@@ -120,16 +120,16 @@ module PrometheusExporter
       end
     end
 
-    def stop
+    def stop(wait_timeout_seconds: 0)
       @mutex.synchronize do
+        wait_for_empty_queue_with_timeout(wait_timeout_seconds)
         @worker_thread&.kill
         while @worker_thread.alive?
           sleep 0.001
         end
         @worker_thread = nil
       end
-
-      close_socket!
+        close_socket!
     end
 
     private
@@ -212,4 +212,11 @@ module PrometheusExporter
     end
   end
 
+  def wait_for_empty_queue_with_timeout(timeout_seconds)
+    start_time = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
+    while @queue.length > 0
+      break if start_time + timeout_seconds < ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
+      sleep(0.05)
+    end
+  end
 end
diff --git a/lib/prometheus_exporter/instrumentation/sidekiq.rb b/lib/prometheus_exporter/instrumentation/sidekiq.rb
index c6e3add..b25a2e8 100644
--- a/lib/prometheus_exporter/instrumentation/sidekiq.rb
+++ b/lib/prometheus_exporter/instrumentation/sidekiq.rb
@@ -2,6 +2,15 @@
 
 module PrometheusExporter::Instrumentation
   class Sidekiq
+    def self.death_handler
+      -> (job, ex) do
+        PrometheusExporter::Client.default.send_json(
+          type: "sidekiq",
+          name: job["class"],
+          dead: true,
+        )
+      end
+    end
 
     def initialize(client: nil)
       @client = client || PrometheusExporter::Client.default
@@ -9,19 +18,23 @@ module PrometheusExporter::Instrumentation
 
     def call(worker, msg, queue)
       success = false
+      shutdown = false
       start = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
       result = yield
       success = true
       result
+    rescue ::Sidekiq::Shutdown => e
+      shutdown = true
+      raise e
     ensure
       duration = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - start
       class_name = worker.class.to_s == 'ActiveJob::QueueAdapters::SidekiqAdapter::JobWrapper' ?
                      msg['wrapped'] : worker.class.to_s
-
       @client.send_json(
         type: "sidekiq",
         name: class_name,
         success: success,
+        shutdown: shutdown,
         duration: duration
       )
     end
diff --git a/lib/prometheus_exporter/server/sidekiq_collector.rb b/lib/prometheus_exporter/server/sidekiq_collector.rb
index 9c5a6c6..506b245 100644
--- a/lib/prometheus_exporter/server/sidekiq_collector.rb
+++ b/lib/prometheus_exporter/server/sidekiq_collector.rb
@@ -11,14 +11,25 @@ module PrometheusExporter::Server
       labels = custom_labels.nil? ? default_labels : default_labels.merge(custom_labels)
 
       ensure_sidekiq_metrics
-      @sidekiq_job_duration_seconds.observe(obj["duration"], labels)
-      @sidekiq_jobs_total.observe(1, labels)
-      @sidekiq_failed_jobs_total.observe(1, labels) if !obj["success"]
+      if obj["dead"]
+        @sidekiq_dead_jobs_total.observe(1, labels)
+      else
+        @sidekiq_job_duration_seconds.observe(obj["duration"], labels)
+        @sidekiq_jobs_total.observe(1, labels)
+        @sidekiq_restarted_jobs_total.observe(1, labels) if obj["shutdown"]
+        @sidekiq_failed_jobs_total.observe(1, labels) if !obj["success"] && !obj["shutdown"]
+      end
     end
 
     def metrics
       if @sidekiq_jobs_total
-        [@sidekiq_job_duration_seconds, @sidekiq_jobs_total, @sidekiq_failed_jobs_total]
+        [
+          @sidekiq_job_duration_seconds,
+          @sidekiq_jobs_total,
+          @sidekiq_restarted_jobs_total,
+          @sidekiq_failed_jobs_total,
+          @sidekiq_dead_jobs_total,
+        ]
       else
         []
       end
@@ -37,9 +48,17 @@ module PrometheusExporter::Server
         PrometheusExporter::Metric::Counter.new(
           "sidekiq_jobs_total", "Total number of sidekiq jobs executed.")
 
+        @sidekiq_restarted_jobs_total =
+        PrometheusExporter::Metric::Counter.new(
+          "sidekiq_restarted_jobs_total", "Total number of sidekiq jobs that we restarted because of a sidekiq shutdown.")
+
         @sidekiq_failed_jobs_total =
         PrometheusExporter::Metric::Counter.new(
-          "sidekiq_failed_jobs_total", "Total number failed sidekiq jobs executed.")
+          "sidekiq_failed_jobs_total", "Total number of failed sidekiq jobs.")
+
+        @sidekiq_dead_jobs_total =
+        PrometheusExporter::Metric::Counter.new(
+          "sidekiq_dead_jobs_total", "Total number of dead sidekiq jobs.")
       end
     end
   end
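
For reference, the collector side of the change reads like a small decision table over the payloads the instrumentation sends. A sketch of the three new cases, assuming hypothetical worker name and duration values; `collect` is fed the same string-keyed hashes the exporter receives as JSON:

```ruby
require 'prometheus_exporter'
require 'prometheus_exporter/server'

collector = PrometheusExporter::Server::SidekiqCollector.new

# Interrupted by Sidekiq::Shutdown: counted in sidekiq_jobs_total and
# sidekiq_restarted_jobs_total, but not in sidekiq_failed_jobs_total.
collector.collect(
  "type" => "sidekiq", "name" => "HardWorker",
  "success" => false, "shutdown" => true, "duration" => 0.2
)

# Reported by the death handler after retries are exhausted: only
# sidekiq_dead_jobs_total is incremented (no duration is recorded).
collector.collect("type" => "sidekiq", "name" => "HardWorker", "dead" => true)

# A genuine failure: neither shutdown nor dead, so it counts toward
# sidekiq_jobs_total and sidekiq_failed_jobs_total.
collector.collect(
  "type" => "sidekiq", "name" => "HardWorker",
  "success" => false, "shutdown" => false, "duration" => 0.5
)
```

Successful jobs behave as before: they record a duration and increment only `sidekiq_jobs_total`.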

GitHub sha: 92fa9ebb
