FIX: Remove 'backfill_etags' keyword argument from 'uploads:missing' rake task

FIX: Remove ‘backfill_etags’ keyword argument from ‘uploads:missing’ rake task

And etags backfilling code is optimized

diff --git a/lib/file_store/s3_store.rb b/lib/file_store/s3_store.rb
index e51ea23..d4bb664 100644
--- a/lib/file_store/s3_store.rb
+++ b/lib/file_store/s3_store.rb
@@ -123,11 +123,11 @@ module FileStore
       SiteSetting.Upload.s3_upload_bucket.downcase
     end
 
-    def list_missing_uploads(skip_optimized: false, backfill_etags: false)
+    def list_missing_uploads(skip_optimized: false)
       if SiteSetting.enable_s3_inventory
         require 's3_inventory'
-        S3Inventory.new(s3_helper, :upload).list_missing(backfill_etags: backfill_etags)
-        S3Inventory.new(s3_helper, :optimized).list_missing(backfill_etags: backfill_etags) unless skip_optimized
+        S3Inventory.new(s3_helper, :upload).list_missing
+        S3Inventory.new(s3_helper, :optimized).list_missing unless skip_optimized
       else
         list_missing(Upload, "original/")
         list_missing(OptimizedImage, "optimized/") unless skip_optimized
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 8acb9b2..1625e43 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -24,7 +24,7 @@ class S3Inventory
     end
   end
 
-  def list_missing(backfill_etags: false)
+  def list_missing
     if files.blank?
       error("Failed to list inventory from S3")
       return
@@ -46,12 +46,12 @@ class S3Inventory
           end
         end
 
-        if backfill_etags
-          uploads = model.where(etag: nil).joins("INNER JOIN #{table_name} ON #{model.table_name}.url ILIKE '%' || #{table_name}.key")
-          uploads.select(:id, :"#{table_name}.etag").find_each do |upload|
-            model.where(id: upload.id).update_all(etag: upload.etag)
-          end
-        end
+        # backfilling etags
+        connection.async_exec("UPDATE #{model.table_name}
+          SET etag = #{table_name}.etag
+          FROM #{table_name}
+          WHERE #{model.table_name}.etag IS NULL
+            AND url ILIKE '%' || #{table_name}.key")
 
         uploads = (model == Upload) ? model.where("created_at < ?", inventory_date) : model
         missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
diff --git a/lib/tasks/uploads.rake b/lib/tasks/uploads.rake
index ed11824..40b33e8 100644
--- a/lib/tasks/uploads.rake
+++ b/lib/tasks/uploads.rake
@@ -482,16 +482,16 @@ end
 # list all missing uploads and optimized images
 task "uploads:missing" => :environment do
   if ENV["RAILS_DB"]
-    list_missing_uploads(skip_optimized: ENV['SKIP_OPTIMIZED'], backfill_etags: ENV['BACKFILL_ETAGS'])
+    list_missing_uploads(skip_optimized: ENV['SKIP_OPTIMIZED'])
   else
     RailsMultisite::ConnectionManagement.each_connection do |db|
-      list_missing_uploads(skip_optimized: ENV['SKIP_OPTIMIZED'], backfill_etags: ENV['BACKFILL_ETAGS'])
+      list_missing_uploads(skip_optimized: ENV['SKIP_OPTIMIZED'])
     end
   end
 end
 
-def list_missing_uploads(skip_optimized: false, backfill_etags: false)
-  Discourse.store.list_missing_uploads(skip_optimized: skip_optimized, backfill_etags: backfill_etags)
+def list_missing_uploads(skip_optimized: false)
+  Discourse.store.list_missing_uploads(skip_optimized: skip_optimized)
 end
 
 ################################################################################
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index 26e7772..03d70ae 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -77,14 +77,20 @@ describe "S3Inventory" do
   end
 
   it "should backfill etags to uploads table correctly" do
-    Fabricate(:upload, url: "//bucket.amazonaws.com/original/0184537a4f419224404d013414e913a4f56018f2.jpg", created_at: 2.days.ago)
+    files = [
+      ["//bucket.amazonaws.com/original/0184537a4f419224404d013414e913a4f56018f2.jpg", "defcaac0b4aca535c284e95f30d608d0"],
+      ["//bucket.amazonaws.com/original/0789fbf5490babc68326b9cec90eeb0d6590db05.png", "25c02eaceef4cb779fc17030d33f7f06"]
+    ]
+    files.each { |file| Fabricate(:upload, url: file[0]) }
 
     inventory.expects(:download_inventory_files_to_tmp_directory)
     inventory.expects(:decompress_inventory_files)
     inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(2)
 
     output = capture_stdout do
-      expect { inventory.list_missing(backfill_etags: true) }.to change { Upload.where(etag: nil).count }.by(-1)
+      expect { inventory.list_missing }.to change { Upload.where(etag: nil).count }.by(-2)
     end
+
+    expect(Upload.order(:url).pluck(:url, :etag)).to eq(files)
   end
 end

GitHub sha: 0472bd4a

1 Like
  • We need to make sure we have a transaction around the place that creates the temp table and consumes, otherwise pgbouncer which runs in transaction pooling may end up not having access to the temp table

once the client gets a connection from the pool, it keeps it to run a single transaction only. After that, the connection is returned to the pool. If the client wants to run other transactions it has to wait until it gets another connection assigned to it.

  • We should be careful with naming here to ensure that if a method loads up etags it communicates that in the method name: list_missing -> backfill_etags and list_missing
2 Likes
1 Like