FEATURE: Use amazon s3 inventory to manage upload stats (#6867)

FEATURE: Use amazon s3 inventory to manage upload stats (#6867)

diff --git a/app/jobs/regular/update_s3_inventory.rb b/app/jobs/regular/update_s3_inventory.rb
new file mode 100644
index 0000000..946ec47
--- /dev/null
+++ b/app/jobs/regular/update_s3_inventory.rb
@@ -0,0 +1,17 @@
+require "s3_inventory"
+
+module Jobs
+  # If the upload bucket or the inventory bucket changes, we need to update the
+  # S3 bucket policy and the inventory configuration to match.
+  class UpdateS3Inventory < Jobs::Base
+
+    # Reconfigures S3 inventory for both original uploads and optimized images.
+    # No-op unless both the enable_s3_inventory and enable_s3_uploads site
+    # settings are turned on.
+    def execute(args)
+      return unless SiteSetting.enable_s3_inventory? && SiteSetting.enable_s3_uploads?
+
+      [:upload, :optimized].each do |type|
+        s3_inventory = S3Inventory.new(Discourse.store.s3_helper, type)
+        # the bucket policy only needs to be set once per bucket, so it is
+        # applied on the first (:upload) pass only
+        s3_inventory.update_bucket_policy if type == :upload
+        s3_inventory.update_bucket_inventory_configuration
+      end
+    end
+  end
+end
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index ddea713..2af15b9 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -191,6 +191,7 @@ en:
         other: "You specified the invalid choices %{name}"
       default_categories_already_selected: "You cannot select a category used in another list."
       s3_upload_bucket_is_required: "You cannot enable uploads to S3 unless you've provided the 's3_upload_bucket'."
+      enable_s3_uploads_is_required: "You cannot enable S3 inventory unless you've enabled S3 uploads."
       s3_backup_requires_s3_settings: "You cannot use S3 as backup location unless you've provided the '%{setting_name}'."
       s3_bucket_reused: "You cannot use the same bucket for 's3_upload_bucket' and 's3_backup_bucket'. Choose a different bucket or use a different path for each bucket."
     conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to <br><a href="https://meta.discourse.org/t/76575">https://meta.discourse.org/t/76575</a>'
@@ -1488,6 +1489,7 @@ en:
     s3_force_path_style: "Enforce path-style addressing for your custom endpoint. IMPORTANT: Required for using Minio uploads and backups."
     s3_configure_tombstone_policy: "Enable automatic deletion policy for tombstone uploads. IMPORTANT: If disabled, no space will be reclaimed after uploads are deleted."
     s3_disable_cleanup: "Disable the removal of backups from S3 when removed locally."
+    enable_s3_inventory: "Generate reports and verify uploads using Amazon S3 inventory. IMPORTANT: requires valid S3 credentials (both access key id & secret access key)."
     backup_time_of_day: "Time of day UTC when the backup should occur."
     backup_with_uploads: "Include uploads in scheduled backups. Disabling this will only backup the database."
     backup_location: "Location where backups are stored. IMPORTANT: S3 requires valid S3 credentials entered in Files settings."
diff --git a/config/site_settings.yml b/config/site_settings.yml
index a897c96..17ae4a2 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1050,6 +1050,8 @@ files:
   s3_configure_tombstone_policy:
     default: true
     shadowed_by_global: true
+  enable_s3_inventory:
+    default: false
   allow_profile_backgrounds:
     client: true
     default: true
diff --git a/lib/backup_restore/s3_backup_store.rb b/lib/backup_restore/s3_backup_store.rb
index 03edc01..fbbb18f 100644
--- a/lib/backup_restore/s3_backup_store.rb
+++ b/lib/backup_restore/s3_backup_store.rb
@@ -32,9 +32,7 @@ module BackupRestore
     end
 
     def download_file(filename, destination_path, failure_message = nil)
-      unless @s3_helper.object(filename).download_file(destination_path)
-        raise failure_message&.to_s || "Failed to download file"
-      end
+      @s3_helper.download_file(filename, destination_path, failure_message)
     end
 
     def upload_file(filename, source_path, content_type)
diff --git a/lib/discourse.rb b/lib/discourse.rb
index c8d76f3..8f621b8 100644
--- a/lib/discourse.rb
+++ b/lib/discourse.rb
@@ -463,6 +463,11 @@ module Discourse
     end
   end
 
+  DiscourseEvent.on(:site_setting_saved) do |site_setting|
+    name = site_setting.name.to_s
+    # Changing any *s3_inventory* setting or the upload bucket invalidates the
+    # current bucket policy / inventory configuration, so refresh it in a
+    # background job (see Jobs::UpdateS3Inventory).
+    Jobs.enqueue(:update_s3_inventory) if name.include?("s3_inventory") || name == "s3_upload_bucket"
+  end
+
   def self.current_user_provider
     @current_user_provider || Auth::DefaultCurrentUserProvider
   end
diff --git a/lib/file_store/s3_store.rb b/lib/file_store/s3_store.rb
index 119dd9c..d4bb664 100644
--- a/lib/file_store/s3_store.rb
+++ b/lib/file_store/s3_store.rb
@@ -124,8 +124,14 @@ module FileStore
     end
 
     def list_missing_uploads(skip_optimized: false)
-      list_missing(Upload, "original/")
-      list_missing(OptimizedImage, "optimized/") unless skip_optimized
+      # When S3 inventory is enabled, use the pre-generated inventory report
+      # instead of paginated LIST requests against the live bucket.
+      if SiteSetting.enable_s3_inventory
+        require 's3_inventory' # deliberate lazy-load: only needed on this path
+        S3Inventory.new(s3_helper, :upload).list_missing
+        S3Inventory.new(s3_helper, :optimized).list_missing unless skip_optimized
+      else
+        list_missing(Upload, "original/")
+        list_missing(OptimizedImage, "optimized/") unless skip_optimized
+      end
     end
 
     private
@@ -140,7 +146,7 @@ module FileStore
         verified_ids = []
 
         files.each do |f|
-          id = model.where("url LIKE '%#{f.key}'").pluck(:id).first if f.size > 0
+          id = model.where("url LIKE '%#{f.key}' AND etag = '#{f.etag}'").pluck(:id).first
           verified_ids << id if id.present?
           marker = f.key
         end
@@ -150,7 +156,7 @@ module FileStore
         files = @s3_helper.list(prefix, marker)
       end
 
-      missing_uploads = model.where("id NOT IN (SELECT val FROM verified_ids)")
+      missing_uploads = model.joins('LEFT JOIN verified_ids ON verified_ids.val = id').where("verified_ids.val IS NULL")
       missing_count = missing_uploads.count
 
       if missing_count > 0
diff --git a/lib/s3_helper.rb b/lib/s3_helper.rb
index fbdde96..380ba86 100644
--- a/lib/s3_helper.rb
+++ b/lib/s3_helper.rb
@@ -205,6 +205,20 @@ class S3Helper
     opts
   end
 
+  # Downloads +filename+ from the bucket to +destination_path+, raising
+  # +failure_message+ (or a generic RuntimeError) when the SDK reports failure.
+  # Extracted here from BackupRestore::S3BackupStore so it can be shared.
+  def download_file(filename, destination_path, failure_message = nil)
+    unless object(filename).download_file(destination_path)
+      raise failure_message&.to_s || "Failed to download file"
+    end
+  end
+
+  # Promoted from the private section below so S3Inventory can issue raw
+  # client calls through this helper.
+  def s3_client
+    @s3_client ||= Aws::S3::Client.new(@s3_options)
+  end
+
+  # Bucket path under which inventory files live, resolved through the
+  # multisite-aware upload prefix (defaults to "inventory").
+  def s3_inventory_path(path = 'inventory')
+    get_path_for_s3_upload(path)
+  end
+
   private
 
   def default_s3_options
@@ -228,10 +242,6 @@ class S3Helper
     File.join("uploads", RailsMultisite::ConnectionManagement.current_db, "/")
   end
 
-  def s3_client
-    @s3_client ||= Aws::S3::Client.new(@s3_options)
-  end
-
   def s3_resource
     Aws::S3::Resource.new(client: s3_client)
   end
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
new file mode 100644
index 0000000..321182f
--- /dev/null
+++ b/lib/s3_inventory.rb
@@ -0,0 +1,193 @@
+# frozen_string_literal: true
+
+require "aws-sdk-s3"
+require "csv"
+
+class S3Inventory
+
+  attr_reader :inventory_id, :csv_filename, :model
+
+  CSV_KEY_INDEX ||= 1
+  CSV_ETAG_INDEX ||= 2
+  INVENTORY_PREFIX ||= "inventory"
+
+  def initialize(s3_helper, type)
+    @s3_helper = s3_helper
+
+    if type == :upload
+      @inventory_id = "original"
+      @model = Upload
+    elsif type == :optimized
+      @inventory_id = "optimized"
+      @model = OptimizedImage
+    end
+  end
+
+  def file
+    @file ||= unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
+  end
+
+  def list_missing
+    if file.blank?
+      error("Failed to list inventory from S3")
+      return
+    end
+
+    DistributedMutex.synchronize("s3_inventory_list_missing_#{inventory_id}") do
+      current_db = RailsMultisite::ConnectionManagement.current_db
+      timestamp = Time.now.strftime("%Y-%m-%d-%H%M%S")
+      @tmp_directory = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db, timestamp)
+      @archive_filename = File.join(@tmp_directory, File.basename(file.key))
+      @csv_filename = @archive_filename[0...-3]
+

[... diff too long, it was truncated ...]

GitHub sha: b4f713ca