REFACTOR: Use a dedicated table for storing alert data (#8)

REFACTOR: Use a dedicated table for storing alert data (#8)

This provides a few important benefits over JSON blobs:

  • Alert data has a fixed schema
  • We can query it more efficiently
  • We can update it using upsert queries

A migration is included for currently open alert topics. Closed topics will only be migrated if they are using client-side rendering.

diff --git a/app/controllers/discourse_prometheus_alert_receiver/receiver_controller.rb b/app/controllers/discourse_prometheus_alert_receiver/receiver_controller.rb
index 4ac0a56..1447e28 100644
--- a/app/controllers/discourse_prometheus_alert_receiver/receiver_controller.rb
+++ b/app/controllers/discourse_prometheus_alert_receiver/receiver_controller.rb
@@ -74,7 +74,8 @@ module DiscoursePrometheusAlertReceiver
         data: params[:data].to_json,
         graph_url: params[:graphURL],
         logs_url: params[:logsURL],
-        grafana_url: params[:grafanaURL]
+        grafana_url: params[:grafanaURL],
+        external_url: params[:externalURL]
       )
 
       render json: success_json
diff --git a/app/jobs/concerns/alert_post_mixin.rb b/app/jobs/concerns/alert_post_mixin.rb
index 650fd6e..0eb12ce 100644
--- a/app/jobs/concerns/alert_post_mixin.rb
+++ b/app/jobs/concerns/alert_post_mixin.rb
@@ -7,44 +7,60 @@ module AlertPostMixin
   HIGH_PRIORITY_TAG = "high-priority".freeze
   NEXT_BUSINESS_DAY_SLA = "nbd".freeze
 
+  MAX_BUMP_RATE = 5.minutes
+
+  PREVIOUS_TOPIC = DiscoursePrometheusAlertReceiver::PREVIOUS_TOPIC_CUSTOM_FIELD
+  TOPIC_BODY = DiscoursePrometheusAlertReceiver::TOPIC_BODY_CUSTOM_FIELD
+  BASE_TITLE = DiscoursePrometheusAlertReceiver::TOPIC_BASE_TITLE_CUSTOM_FIELD
+
   private
 
-  def local_date(time, starts_at = nil)
-    parsed = Time.zone.parse(time)
-    format = "L HH:mm"
+  def parse_alerts(raw_alerts, external_url:, logs_url: nil, grafana_url: nil)
+    raw_alerts.map do |raw_alert|
+      alert = {
+        external_url: external_url,
+        alertname: raw_alert['labels']['alertname'],
+        datacenter: raw_alert["labels"]["datacenter"],
+        identifier: raw_alert['labels']['id'],
+        status: normalize_status(raw_alert['status']),
+        starts_at: raw_alert['startsAt'],
+        ends_at: raw_alert['status'] == 'firing' ? nil : raw_alert['endsAt'],
+        graph_url: raw_alert['generatorURL'],
+        description: raw_alert.dig('annotations', 'description'),
+        logs_url: logs_url
+      }
 
-    if starts_at.present?
-      from = Time.zone.parse(starts_at)
-      format = "HH:mm" if from.at_beginning_of_day == parsed.at_beginning_of_day
-    end
+      if dashboard_path = raw_alert.dig('annotations', 'grafana_dashboard_path')
+        alert[:grafana_url] = "#{grafana_url}#{dashboard_path}"
+      end
 
-    date = +<<~DATE
-    [date=#{parsed.strftime("%Y-%m-%d")} time=#{parsed.strftime("%H:%M:%S")} format="#{format}" displayedTimezone="UTC" timezones="Europe/Paris\\|America/Los_Angeles\\|Asia/Singapore\\|Australia/Sydney"]
-    DATE
+      alert[:ends_at] = nil if alert['status'] != 'resolved'
 
-    date.chomp!
-    date
+      alert
+    end
   end
 
-  def get_grafana_dashboard_url(alert, grafana_url)
-    return if alert.blank? || grafana_url.blank?
+  def normalize_status(status)
+    status = status['state'] if status.is_a?(Hash)
+    return "firing" if status == "active"
+    status
+  end
 
-    dashboard_path = alert.dig('annotations', 'grafana_dashboard_path')
-    return if dashboard_path.blank?
+  def local_date(time)
+    parsed = Time.zone.parse(time)
 
-    "#{grafana_url}#{dashboard_path}"
+    <<~DATE.chomp
+      [date=#{parsed.strftime("%Y-%m-%d")} time=#{parsed.strftime("%H:%M:%S")} format="YYYY-MM-DD HH:mm" displayedTimezone="UTC"]
+    DATE
   end
 
   def prev_topic_link(topic_id)
     return "" if topic_id.nil?
-    created_at = Topic.where(id: topic_id).pluck(:created_at).first
-    return "" unless created_at
-
+    return "" unless created_at = Topic.where(id: topic_id).pluck_first(:created_at)
     "[Previous alert](#{Discourse.base_url}/t/#{topic_id}) #{local_date(created_at.to_s)}\n\n"
   end
 
-  def generate_title(base_title, alert_history)
-    firing_count = alert_history&.count { |alert| is_firing?(alert["status"]) }
+  def generate_title(base_title, firing_count)
     if firing_count > 0
       I18n.t("prom_alert_receiver.topic_title.firing", base_title: base_title, count: firing_count)
     else
@@ -52,68 +68,73 @@ module AlertPostMixin
     end
   end
 
-  def first_post_body(receiver:,
-                      topic_body: "",
-                      alert_history:,
-                      prev_topic_id:)
-
-    output = ""
-    output += "#{topic_body}\n\n"
-    output += "#{prev_topic_link(prev_topic_id)}\n\n" if prev_topic_id
-
+  def first_post_body(topic_body: "", prev_topic_id:)
+    output = "#{topic_body}"
+    output += "\n\n#{prev_topic_link(prev_topic_id)}" if prev_topic_id
     output
   end
 
-  def revise_topic(topic:, title:, raw:, datacenters:, firing: nil, high_priority: false)
-    post = topic.first_post
-    title_changed = topic.title != title
-    skip_revision = true
+  def revise_topic(topic:, high_priority: false)
+    firing_count = topic.alert_receiver_alerts.firing.count
+    firing = firing_count > 0
 
-    if post.raw.strip != raw.strip || title_changed || !firing.nil?
-      post = topic.first_post
+    title = generate_title(
+      topic.custom_fields[BASE_TITLE],
+      firing_count
+    )
 
-      fields = {
-        title: title,
-        raw: raw
-      }
+    raw = first_post_body(
+      topic_body: topic.custom_fields[TOPIC_BODY],
+      prev_topic_id: topic.custom_fields[PREVIOUS_TOPIC]
+    )
 
-      fields[:tags] ||= []
+    datacenters = topic.alert_receiver_alerts.distinct.pluck(:datacenter).compact
 
-      if datacenters.present?
-        fields[:tags] = topic.tags.pluck(:name)
-        fields[:tags].concat(datacenters)
-        fields[:tags].uniq!
-      end
+    existing_tags = topic.tags.pluck(:name)
+    new_tags = existing_tags
 
-      fields[:tags] << HIGH_PRIORITY_TAG.dup if high_priority
+    new_tags.concat(datacenters)
+    new_tags << HIGH_PRIORITY_TAG.dup if high_priority
 
-      if firing
-        fields[:tags] << FIRING_TAG.dup
-      else
-        fields[:tags].delete(FIRING_TAG)
-      end
+    if firing
+      new_tags << FIRING_TAG.dup
+    else
+      new_tags.delete(FIRING_TAG)
+    end
 
-      PostRevisor.new(post, topic).revise!(
+    tags_changed = new_tags.uniq != existing_tags
+    title_changed = topic.title != title
+    raw_changed = topic.first_post.raw != raw
+
+    if raw_changed || title_changed || tags_changed
+      PostRevisor.new(topic.first_post, topic).revise!(
         Discourse.system_user,
-        fields,
-        skip_revision: skip_revision,
+        {
+          title: title,
+          raw: raw,
+          tags: new_tags
+        },
+        skip_revision: true,
         skip_validations: true,
         validate_topic: true # This is a very weird API
       )
-      if firing && title_changed
+
+      if firing && title_changed && topic.bumped_at < MAX_BUMP_RATE.ago
+        # Articifically bump the topic
         topic.update_column(:bumped_at, Time.now)
         TopicTrackingState.publish_latest(topic)
       end
+    else
+      # The topic hasn't changed
+      # The alert data has changed, so notify clients to reload
+      topic.first_post.publish_change_to_clients(:revised, reload_topic: true)
     end
   end
 
-  def is_firing?(status)
-    status == "firing".freeze
-  end
-
-  def datacenters(alerts)
-    alerts.each_with_object(Set.new) do |alert, set|
-      set << alert['datacenter'] if alert['datacenter']
-    end.to_a
+  def publish_alert_counts
+    MessageBus.publish("/alert-receiver",
+      firing_alerts_count: Topic.firing_alerts.count,
+      open_alerts_count: Topic.open_alerts.count
+    )
   end
 end
diff --git a/app/jobs/regular/process_alert.rb b/app/jobs/regular/process_alert.rb
index 66d3e82..d64a009 100644
--- a/app/jobs/regular/process_alert.rb
+++ b/app/jobs/regular/process_alert.rb
@@ -2,6 +2,8 @@
 
 module Jobs
   class ProcessAlert < ::Jobs::Base
+    sidekiq_options retry: false
+
     include AlertPostMixin
 
     def execute(args)
@@ -9,13 +11,12 @@ module Jobs
       params = args[:params]
 

[... diff too long, it was truncated ...]

GitHub sha: 45ade1d2

1 Like

This commit appears in #8 which was merged by davidtaylorhq.