List and restore missing post uploads from S3 inventory.

follow-up
#1

List and restore missing post uploads from S3 inventory.

diff --git a/app/models/post.rb b/app/models/post.rb
index 5ab7320..903400c 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -890,12 +890,12 @@ class Post < ActiveRecord::Base
 
   def link_post_uploads(fragments: nil)
     upload_ids = []
-    fragments ||= Nokogiri::HTML::fragment(self.cooked)
 
-    fragments.css("a/@href", "img/@src").each do |media|
-      if upload = Upload.get_from_url(media.value)
-        upload_ids << upload.id
-      end
+    each_upload_url(fragments: fragments) do |src, _, sha1|
+      upload = nil
+      upload = Upload.find_by(sha1: sha1) if sha1.present?
+      upload ||= Upload.get_from_url(src)
+      upload_ids << upload.id if upload.present?
     end
 
     upload_ids |= Upload.where(id: downloaded_images.values).pluck(:id)
@@ -916,6 +916,84 @@ class Post < ActiveRecord::Base
     {}
   end
 
+  def each_upload_url(fragments: nil, include_local_upload: true)
+    upload_patterns = [
+      /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+      /\/original\//,
+      /\/optimized\//
+    ]
+    fragments ||= Nokogiri::HTML::fragment(self.cooked)
+    links = fragments.css("a/@href", "img/@src").map { |media| media.value }.uniq
+
+    links.each do |src|
+      next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+
+      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+      next unless Discourse.store.has_been_uploaded?(src) || (include_local_upload && src =~ /\A\/[^\/]/i)
+
+      path = begin
+        URI(URI.unescape(src))&.path
+      rescue URI::Error
+      end
+
+      next if path.blank?
+
+      sha1 =
+        if path.include? "optimized"
+          OptimizedImage.extract_sha1(path)
+        else
+          Upload.extract_sha1(path)
+        end
+
+      yield(src, path, sha1)
+    end
+  end
+
+  def self.find_missing_uploads(include_local_upload: true)
+    PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
+    missing_uploads = []
+    missing_post_uploads = {}
+
+    Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+      ids = posts.pluck(:id)
+      sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+
+      posts.each do |post|
+        post.each_upload_url do |src, path, sha1|
+          next if sha1.present? && sha1s.include?(sha1)
+
+          missing_post_uploads[post.id] ||= []
+
+          if missing_uploads.include?(src)
+            missing_post_uploads[post.id] << src
+            next
+          end
+
+          upload_id = nil
+          upload_id = Upload.where(sha1: sha1).pluck(:id).first if sha1.present?
+          upload_id ||= yield(post, src, path, sha1)
+
+          if upload_id.present?
+            attributes = { post_id: post.id, upload_id: upload_id }
+            PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
+          else
+            missing_uploads << src
+            missing_post_uploads[post.id] << src
+          end
+        end
+      end
+    end
+
+    count = 0
+    missing_post_uploads = missing_post_uploads.reject { |_, uploads| uploads.empty? }
+    missing_post_uploads.reject do |post_id, uploads|
+      PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
+      count += uploads.count
+    end
+
+    return { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count }
+  end
+
   private
 
   def parse_quote_into_arguments(quote)
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 486d492..1603136 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -36,8 +36,6 @@ class S3Inventory
 
       ActiveRecord::Base.transaction do
         begin
-          table_name = "#{type}_inventory"
-          connection = ActiveRecord::Base.connection.raw_connection
           connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text, PRIMARY KEY(etag, key))")
           connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
             files.each do |file|
@@ -54,6 +52,8 @@ class S3Inventory
             WHERE #{model.table_name}.etag IS NULL
               AND url ILIKE '%' || #{table_name}.key")
 
+          list_missing_post_uploads if type == "original"
+
           uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
           missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
 
@@ -73,6 +73,35 @@ class S3Inventory
     end
   end
 
+  def list_missing_post_uploads
+    log "Listing missing post uploads..."
+
+    missing = Post.find_missing_uploads(include_local_upload: false) do |_, _, _, sha1|
+      next if sha1.blank?
+
+      upload_id = nil
+      result = connection.exec("SELECT * FROM #{table_name} WHERE key LIKE '%original/%/#{sha1}%'")
+
+      if result.count >= 0
+        key = result[0]["key"]
+        data = s3_helper.object(key).data
+        upload_id = Upload.create!(
+          user_id: Discourse.system_user.id,
+          original_filename: "",
+          filesize: data.content_length,
+          url: File.join(Discourse.store.absolute_base_url, key),
+          sha1: sha1,
+          etag: result[0]["etag"]
+        ).id
+      end
+
+      upload_id
+    end
+
+    Discourse.stats.set("missing_post_uploads", missing[:count])
+    log "#{missing[:count]} post uploads are missing."
+  end
+
   def download_inventory_files_to_tmp_directory
     files.each do |file|
       log "Downloading inventory file '#{file[:key]}' to tmp directory..."
@@ -128,6 +157,14 @@ class S3Inventory
 
   private
 
+  def connection
+    @connection ||= ActiveRecord::Base.connection.raw_connection
+  end
+
+  def table_name
+    "#{type}_inventory"
+  end
+
   def files
     @files ||= begin
       symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index 3d1509c..241dbbf 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -390,123 +390,60 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
   puts "", "Done.", ""
 end
 
-def get_missing_uploads
-  PostCustomField.where(name: Post::MISSING_UPLOADS)
-end
-
 desc 'Finds missing post upload records from cooked HTML content'
 task 'posts:missing_uploads' => :environment do
-  get_missing_uploads.delete_all
-
-  upload_patterns = [
-    /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
-    /\/original\//,
-    /\/optimized\//
-  ]
-  missing_uploads = []
   old_scheme_upload_count = 0
-  count = 0
 
-  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
-    ids = posts.pluck(:id)
-    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+  missing = Post.find_missing_uploads do |post, src, path, sha1|
+    next if sha1.present?
 
-    posts.each do |post|
-      missing_post_uploads = []
-      links = Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").map { |media| media.value }.uniq
+    upload_id = nil
 
-      links.each do |src|
-        next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+    # recovering old scheme upload.
+    local_store = FileStore::LocalStore.new
+    public_path = "#{local_store.public_dir}#{path}"
+    file_path = nil
 
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-        next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+    if File.exists?(public_path)
+      file_path = public_path
+    else
+      tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
+      file_path = tombstone_path if File.exists?(tombstone_path)
+    end
 
-        path = begin
-          URI(URI.unescape(src))&.path
-        rescue URI::Error
-        end

[... diff too long, it was truncated ...]

GitHub sha: e8fafbc1

DEV: wrap find_missing_uploads method in distributed mutex
REFACTOR: remove duplicate reject loop and implicit return
DEV: Fix undefined variable.
#5

These patterns here are too general. Is is possible to resuse the patterns that we’ve defined in HasUrl?

1 Like
#6

Why do we need this when Upload.get_from_url already does this check?

1 Like
#7

There is a lack of tests here so I’m not quite sure why we need this method instead of just passing all the links through Upload.get_from_url like before.

#8

return is implicit in Ruby for the last line.

1 Like
#9

This method needs to be wrapped in a distributed mutex. Otherwise, the results of what this method returns is not guaranteed if two different processes run this method at the same time.

1 Like
#10

We probably don’t need to do two loops of reject here.

1 Like
Follow Up #11
Followed Up #13

DEV: Fix undefined variable.

#14

Since find_by sha1 is faster (with dedicated index) I’m trying to use it when possible.

Followed Up #15

REFACTOR: remove duplicate reject loop and implicit return

#16

Upload.get_from_url already does that. It finds by sha1 first and then falls back to url.

2 Likes