DEV: Add verified to uploads and fill in S3 inventory (#10406)

DEV: Add verified to uploads and fill in S3 inventory (#10406)

When we run the S3 inventory, mark uploads that exist as verified true, those that don’t as verified false, and uploads not included in the check / not yet checked as verified nil.

diff --git a/db/migrate/20200811004537_add_verified_column_to_uploads.rb b/db/migrate/20200811004537_add_verified_column_to_uploads.rb
new file mode 100644
index 0000000..3d4eb41
--- /dev/null
+++ b/db/migrate/20200811004537_add_verified_column_to_uploads.rb
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+
+class AddVerifiedColumnToUploads < ActiveRecord::Migration[6.0]
+  def change
+    add_column :uploads, :verified, :boolean, null: true
+  end
+end
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 1869f0e..5b2fbff 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -67,6 +67,23 @@ class S3Inventory
               .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
               .where("#{table_name}.etag IS NULL")
 
+            # marking as verified/not verified
+            id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : ""
+            DB.exec(<<~SQL, inventory_date
+              UPDATE #{model.table_name}
+              SET verified = CASE when table_name_alias.etag IS NULL THEN false ELSE true END
+              FROM #{model.table_name} AS model_table
+              LEFT JOIN #{table_name} AS table_name_alias ON model_table.etag = table_name_alias.etag
+              WHERE model_table.id = #{model.table_name}.id
+              AND model_table.updated_at < ?
+              AND (
+                model_table.verified IS NULL OR
+                model_table.verified <> CASE when table_name_alias.etag IS NULL THEN false ELSE true END
+              )
+               #{id_threshold_clause}
+              SQL
+            )
+
             if (missing_count = missing_uploads.count) > 0
               missing_uploads.select(:id, :url).find_each do |upload|
                 log upload.url
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index eb1f2c9..ea04213 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -58,27 +58,57 @@ describe "S3Inventory" do
     expect(output).to eq("Failed to list inventory from S3\n")
   end
 
-  it "should display missing uploads correctly" do
-    freeze_time
+  describe "verifying uploads" do
+    before do
+      freeze_time
 
-    CSV.foreach(csv_filename, headers: false) do |row|
-      next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
-      Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
+      CSV.foreach(csv_filename, headers: false) do |row|
+        next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
+        Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
+      end
+
+      @upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
+      @upload2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
+      @no_etag = Fabricate(:upload, updated_at: 2.days.ago)
+
+      inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
+      inventory.expects(:inventory_date).times(2).returns(Time.now)
     end
 
-    upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
-    Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
-    no_etag = Fabricate(:upload, updated_at: 2.days.ago)
+    it "should display missing uploads correctly" do
+      output = capture_stdout do
+        inventory.backfill_etags_and_list_missing
+      end
 
-    inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
-    inventory.expects(:inventory_date).returns(Time.now)
+      expect(output).to eq("#{@upload1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
+      expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
+    end
 
-    output = capture_stdout do
-      inventory.backfill_etags_and_list_missing
+    it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
+      expect(Upload.where(verified: nil).count).to eq(12)
+      output = capture_stdout do
+        inventory.backfill_etags_and_list_missing
+      end
+
+      verified = Upload.pluck(:verified)
+      expect(Upload.where(verified: true).count).to eq(3)
+      expect(Upload.where(verified: false).count).to eq(2)
+      expect(Upload.where(verified: nil).count).to eq(7)
     end
 
-    expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n")
-    expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
+    it "does not affect the updated_at date of uploads" do
+      upload_1_updated = @upload1.updated_at
+      upload_2_updated = @upload2.updated_at
+      no_etag_updated = @no_etag.updated_at
+
+      output = capture_stdout do
+        inventory.backfill_etags_and_list_missing
+      end
+
+      expect(@upload1.reload.updated_at).to eq_time(upload_1_updated)
+      expect(@upload2.reload.updated_at).to eq_time(upload_2_updated)
+      expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
+    end
   end
 
   it "should backfill etags to uploads table correctly" do

GitHub sha: b950b3fb

This commit appears in #10406 which was merged by martin.

I feel like a boolean should never be nullable, as that implies a third state - true / false / null.

Can it be made to work with only true/false and a default?

2 Likes

@SamSaffron what do you think of the above? I made this null based on internal discussion. Would an integer column work instead? (e.g. 1 = not_checked, 2 = verified, 3 = not_verified)

1 Like

I am open to changing this to an int now that I used it for a bit

Then I guess we can have another state, upload in inventory but etag does not match

1 Like

@SamSaffron change made here DEV: Change upload verified column to be integer by martin-brennan · Pull Request #10643 · discourse/discourse · GitHub ; could you please review because this is your baby? :slight_smile:

2 Likes