PERF: Verify post upload's existence by preloaded upload sha1s array

PERF: Very post upload’s existence by preloaded upload sha1s array

diff --git a/app/models/concerns/has_url.rb b/app/models/concerns/has_url.rb
index 8b92f6a..5fdc4b2 100644
--- a/app/models/concerns/has_url.rb
+++ b/app/models/concerns/has_url.rb
@@ -6,6 +6,16 @@ module HasUrl
       url.match(self::URL_REGEX)
     end
 
+    def extract_sha1(path)
+      data = extract_url(path)
+      return if data.blank?
+
+      sha1 = data[2]
+      return if sha1&.length != Upload::SHA1_LENGTH
+
+      sha1
+    end
+
     def get_from_url(url)
       return if url.blank?
 
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index d01a409..f8cf44e 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -388,31 +388,58 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
   puts "", "Done.", ""
 end
 
+UPLOAD_PATTERNS ||= [
+  /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+  /\/original\//,
+  /\/optimized\//
+].freeze
+
 desc 'Finds missing post upload records from cooked HTML content'
 task 'posts:missing_uploads' => :environment do
   PostCustomField.where(name: Post::MISSING_UPLOADS).destroy_all
-  posts = Post.have_uploads.select(:id, :cooked)
   count = 0
 
-  posts.find_each do |post|
-    missing = []
+  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+    ids = posts.pluck(:id)
+    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
 
-    Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").each do |media|
-      src = media.value
-      next if src.blank? || (src =~ /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//).blank?
+    posts.each do |post|
+      missing = []
 
-      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-      next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+      Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").each do |media|
+        src = media.value
+        next if src.blank? || UPLOAD_PATTERNS.none? { |pattern| src =~ pattern }
 
-      missing << src unless Upload.get_from_url(src) || OptimizedImage.get_from_url(src)
-    end
+        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+        next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
 
-    if missing.present?
-      PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing.to_json)
-      count += missing.count
-    end
+        path = begin
+          URI(URI.unescape(src))&.path
+        rescue URI::Error
+        end
+
+        next if path.blank?
+
+        sha1 =
+          if path.include? "optimized"
+            OptimizedImage.extract_sha1(path)
+          else
+            Upload.extract_sha1(path)
+          end
 
-    putc "."
+        if sha1.blank? || sha1s.exclude?(sha1)
+          missing << src
+        end
+      end
+
+      if missing.present?
+        PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing.to_json)
+        count += missing.count
+        putc "x"
+      else
+        putc "."
+      end
+    end
   end
 
   puts "", "#{count} post uploads are missing.", ""

GitHub sha: 5de483a1

Will this add an additional DB query? The posts object should have already loaded the ActiveRecord objects into memory.

No, it won’t add an additional query because posts variable is an array. Not an activerecord relationship.

2 Likes