FEATURE: Make uploads:missing task compatible with s3 uploads

FEATURE: Make uploads:missing task compatible with s3 uploads

From fd272eee440d8c9c35bf1cb4b0dd20b22f2dad2d Mon Sep 17 00:00:00 2001
From: Vinoth Kannan <vinothkannan@vinkas.com>
Date: Tue, 27 Nov 2018 00:54:51 +0530
Subject: [PATCH] FEATURE: Make uploads:missing task compatible with s3 uploads


diff --git a/lib/file_store/base_store.rb b/lib/file_store/base_store.rb
index 5523c4a..2cd9dad 100644
--- a/lib/file_store/base_store.rb
+++ b/lib/file_store/base_store.rb
@@ -60,6 +60,10 @@ module FileStore
       not_implemented
     end
 
+    def list_missing_uploads(skip_optimized: false)
+      not_implemented
+    end
+
     def download(upload)
       DistributedMutex.synchronize("download_#{upload.sha1}") do
         filename = "#{upload.sha1}#{File.extname(upload.original_filename)}"
diff --git a/lib/file_store/local_store.rb b/lib/file_store/local_store.rb
index 6d4e733..63ae443 100644
--- a/lib/file_store/local_store.rb
+++ b/lib/file_store/local_store.rb
@@ -97,6 +97,36 @@ module FileStore
       "#{public_dir}#{relative_base_url.sub("/uploads/", "/uploads/tombstone/")}"
     end
 
-  end
+    def list_missing_uploads(skip_optimized: false)
+      list_missing(Upload)
+      list_missing(OptimizedImage) unless skip_optimized
+    end
+
+    private
+
+    def list_missing(model)
+      public_directory = "#{Rails.root}/public"
+
+      count = 0
+      model.find_each do |upload|
 
+        # could be a remote image
+        next unless upload.url =~ /^\/[^\/]/
+
+        path = "#{public_directory}#{upload.url}"
+        bad = true
+        begin
+          bad = false if File.size(path) != 0
+        rescue
+          # something is messed up
+        end
+        if bad
+          count += 1
+          puts path
+        end
+      end
+      puts "#{count} of #{model.count} #{model.name.underscore.pluralize} are missing" if count > 0
+    end
+
+  end
 end
diff --git a/lib/file_store/s3_store.rb b/lib/file_store/s3_store.rb
index 6aca50f..f5b12d5 100644
--- a/lib/file_store/s3_store.rb
+++ b/lib/file_store/s3_store.rb
@@ -110,5 +110,46 @@ module FileStore
       raise Discourse::SiteSettingMissing.new("s3_upload_bucket") if SiteSetting.Upload.s3_upload_bucket.blank?
       SiteSetting.Upload.s3_upload_bucket.downcase
     end
+
+    def list_missing_uploads(skip_optimized: false)
+      list_missing(Upload, "original/")
+      list_missing(OptimizedImage, "optimized/") unless skip_optimized
+    end
+
+    private
+
+    def list_missing(model, prefix)
+      connection = ActiveRecord::Base.connection.raw_connection
+      connection.exec('CREATE TEMP TABLE verified_ids(val integer PRIMARY KEY)')
+      marker = nil
+      files = @s3_helper.list(prefix, marker)
+
+      while files.count > 0 do
+        verified_ids = []
+
+        files.each do |f|
+          id = model.where("url LIKE '%#{f.key}' AND filesize = #{f.size}").pluck(:id).first
+          verified_ids << id if id.present?
+          marker = f.key
+        end
+
+        verified_id_clause = verified_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
+        connection.exec("INSERT INTO verified_ids VALUES #{verified_id_clause}")
+        files = @s3_helper.list(prefix, marker)
+      end
+
+      missing_uploads = model.joins('LEFT JOIN verified_ids ON val = id').where(val: nil)
+      missing_count = missing_uploads.count
+
+      if missing_count > 0
+        missing_uploads.find_each do |upload|
+          puts upload.url
+        end
+
+        puts "#{missing_count} of #{model.count} #{model.name.underscore.pluralize} are missing"
+      end
+    ensure
+      connection.exec('DROP TABLE verified_ids') unless connection.nil?  
+    end
   end
 end
diff --git a/lib/s3_helper.rb b/lib/s3_helper.rb
index b4bcb44..f12497b 100644
--- a/lib/s3_helper.rb
+++ b/lib/s3_helper.rb
@@ -136,9 +136,10 @@ class S3Helper
     update_lifecycle("purge_tombstone", grace_period, prefix: @tombstone_prefix)
   end
 
-  def list(prefix = "")
-    prefix = get_path_for_s3_upload(prefix)
-    s3_bucket.objects(prefix: prefix)
+  def list(prefix = "", marker = nil)
+    options = { prefix: get_path_for_s3_upload(prefix) }
+    options[:marker] = marker if marker.present?
+    s3_bucket.objects(options)
   end
 
   def tag_file(key, tags)
diff --git a/lib/tasks/uploads.rake b/lib/tasks/uploads.rake
index fbf162d..cf17515 100644
--- a/lib/tasks/uploads.rake
+++ b/lib/tasks/uploads.rake
@@ -404,54 +404,7 @@ task "uploads:missing" => :environment do
 end
 
 def list_missing_uploads(skip_optimized: false)
-  if Discourse.store.external?
-    puts "This task only works for internal storages."
-    return
-  end
-
-  public_directory = "#{Rails.root}/public"
-
-  count = 0
-  Upload.find_each do |upload|
-
-    # could be a remote image
-    next unless upload.url =~ /^\/[^\/]/
-
-    path = "#{public_directory}#{upload.url}"
-    bad = true
-    begin
-      bad = false if File.size(path) != 0
-    rescue
-      # something is messed up
-    end
-    if bad
-      count += 1
-      puts path
-    end
-  end
-  puts "#{count} of #{Upload.count} uploads are missing" if count > 0
-
-  unless skip_optimized
-    count = 0
-    OptimizedImage.find_each do |optimized_image|
-      # remote?
-      next unless optimized_image.url =~ /^\/[^\/]/
-
-      path = "#{public_directory}#{optimized_image.url}"
-
-      bad = true
-      begin
-        bad = false if File.size(path) != 0
-      rescue
-        # something is messed up
-      end
-      if bad
-        count += 1
-        puts path
-      end
-    end
-    puts "#{count} of #{OptimizedImage.count} optimized images are missing" if count > 0
-  end
+  Discourse.store.list_missing_uploads(skip_optimized: skip_optimized)
 end
 
 ################################################################################

GitHub

1 Like

What is the exact error that we’re testing for? rescueing every Exception is going to rescue mistakes like typos as well.

lol guess who wrote this in 2016

2 Likes

I think we need to rethink how this is done. The intention was to be able to verify the uploads in an efficient manner but what we’re doing here is to iterate through every single object and executing one DB query each time to pull out its id.

I think it is fine for rake tasks to be printing to STDOUT but once we move it into the store, we shouldn’t be doing so and instead have the rake task be responsible for printing to STDOUT instead. Someone else might call this method from somewhere else in the app and we end up printing to STDOUT which might no be ideal.