PERF: optimise backfilling of topic_id (#13545)

PERF: optimise backfilling of topic_id (#13545)

Relying on large offsets can have uneven performance on huge table, new implementation recovers more cleanly and avoids double updates

diff --git a/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb b/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb
index 5e2770e..232f9e2 100644
--- a/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb
+++ b/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb
@@ -5,27 +5,22 @@ class BackfillEmailLogTopicId < ActiveRecord::Migration[6.1]
   BATCH_SIZE = 30_000
 
   def up
-    offset = 0
-    email_log_count = DB.query_single("SELECT COUNT(*) FROM email_logs").first
-
     loop do
-      DB.exec(<<~SQL, offset: offset, batch_size: BATCH_SIZE)
+      count = DB.exec(<<~SQL, batch_size: BATCH_SIZE)
           WITH cte AS (
-            SELECT post_id
-            FROM email_logs
-            ORDER BY id
+            SELECT l1.id, p1.topic_id
+            FROM email_logs l1
+            INNER JOIN posts p1 ON p1.id = l1.post_id
+            WHERE l1.topic_id IS NULL AND p1.topic_id IS NOT NULL
             LIMIT :batch_size
-            OFFSET :offset
           )
           UPDATE email_logs
-          SET topic_id = posts.topic_id
+          SET topic_id = cte.topic_id
           FROM cte
-          INNER JOIN posts ON posts.id = cte.post_id
-          WHERE email_logs.post_id = cte.post_id
+          WHERE email_logs.id = cte.id
       SQL
 
-      offset += BATCH_SIZE
-      break if offset > (email_log_count + BATCH_SIZE * 2)
+      break if count == 0
     end
   end
 

GitHub sha: 14a0247301510a5703d1d4b7f62141c13935cc97

This commit appears in #13545 which was approved by tgxworld. It was merged by SamSaffron.