DEV: Detect when s3 inventory failure is caused by etag difference (#10427)

DEV: Detect when s3 inventory failure is caused by etag difference (#10427)

diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 5b2fbff..e561d5c 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -67,6 +67,10 @@ class S3Inventory
               .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
               .where("#{table_name}.etag IS NULL")
 
+            exists_with_different_etag = missing_uploads
+              .joins("LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url")
+              .where("inventory2.etag IS NOT NULL").pluck(:id)
+
             # marking as verified/not verified
             id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : ""
             DB.exec(<<~SQL, inventory_date
@@ -86,10 +90,18 @@ class S3Inventory
 
             if (missing_count = missing_uploads.count) > 0
               missing_uploads.select(:id, :url).find_each do |upload|
-                log upload.url
+                if exists_with_different_etag.include?(upload.id)
+                  log "#{upload.url} has different etag"
+                else
+                  log upload.url
+                end
               end
 
               log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
+              if exists_with_different_etag.present?
+                log "#{exists_with_different_etag.count} of these are caused by differing etags"
+                log "Null the etag column and re-run for automatic backfill"
+              end
             end
 
             Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index ea04213..2f4122d 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -64,7 +64,10 @@ describe "S3Inventory" do
 
       CSV.foreach(csv_filename, headers: false) do |row|
         next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
-        Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
+        Fabricate(:upload,
+          etag: row[S3Inventory::CSV_ETAG_INDEX],
+          url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
+          updated_at: 2.days.ago)
       end
 
       @upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
@@ -84,6 +87,25 @@ describe "S3Inventory" do
       expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
     end
 
+    it "should detect when a url match exists with a different etag" do
+      differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
+      differing_etag.update_columns(etag: "somethingelse")
+
+      output = capture_stdout do
+        inventory.backfill_etags_and_list_missing
+      end
+
+      expect(output).to eq(<<~TEXT)
+        #{differing_etag.url} has different etag
+        #{@upload1.url}
+        #{@no_etag.url}
+        3 of 5 uploads are missing
+        1 of these are caused by differing etags
+        Null the etag column and re-run for automatic backfill
+      TEXT
+      expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
+    end
+
     it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
       expect(Upload.where(verified: nil).count).to eq(12)
       output = capture_stdout do

GitHub sha: bd0a7553

1 Like

This commit appears in #10427 which was merged by SamSaffron.