List and restore missing post uploads from S3 inventory.

diff --git a/app/models/post.rb b/app/models/post.rb
index 5ab7320..903400c 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -890,12 +890,12 @@ class Post < ActiveRecord::Base
 
   def link_post_uploads(fragments: nil)
     upload_ids = []
-    fragments ||= Nokogiri::HTML::fragment(self.cooked)
 
-    fragments.css("a/@href", "img/@src").each do |media|
-      if upload = Upload.get_from_url(media.value)
-        upload_ids << upload.id
-      end
+    each_upload_url(fragments: fragments) do |src, _, sha1|
+      upload = nil
+      upload = Upload.find_by(sha1: sha1) if sha1.present?
+      upload ||= Upload.get_from_url(src)
+      upload_ids << upload.id if upload.present?
     end
 
     upload_ids |= Upload.where(id: downloaded_images.values).pluck(:id)
@@ -916,6 +916,84 @@ class Post < ActiveRecord::Base
     {}
   end
 
+  def each_upload_url(fragments: nil, include_local_upload: true)
+    upload_patterns = [
+      /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+      /\/original\//,
+      /\/optimized\//
+    ]
+    fragments ||= Nokogiri::HTML::fragment(self.cooked)
+    links = fragments.css("a/@href", "img/@src").map { |media| media.value }.uniq
+
+    links.each do |src|
+      next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+
+      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+      next unless Discourse.store.has_been_uploaded?(src) || (include_local_upload && src =~ /\A\/[^\/]/i)
+
+      path = begin
+        URI(URI.unescape(src))&.path
+      rescue URI::Error
+      end
+
+      next if path.blank?
+
+      sha1 =
+        if path.include? "optimized"
+          OptimizedImage.extract_sha1(path)
+        else
+          Upload.extract_sha1(path)
+        end
+
+      yield(src, path, sha1)
+    end
+  end
+
+  def self.find_missing_uploads(include_local_upload: true)
+    PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
+    missing_uploads = []
+    missing_post_uploads = {}
+
+    Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+      ids = posts.pluck(:id)
+      sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+
+      posts.each do |post|
+        post.each_upload_url do |src, path, sha1|
+          next if sha1.present? && sha1s.include?(sha1)
+
+          missing_post_uploads[post.id] ||= []
+
+          if missing_uploads.include?(src)
+            missing_post_uploads[post.id] << src
+            next
+          end
+
+          upload_id = nil
+          upload_id = Upload.where(sha1: sha1).pluck(:id).first if sha1.present?
+          upload_id ||= yield(post, src, path, sha1)
+
+          if upload_id.present?
+            attributes = { post_id: post.id, upload_id: upload_id }
+            PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
+          else
+            missing_uploads << src
+            missing_post_uploads[post.id] << src
+          end
+        end
+      end
+    end
+
+    count = 0
+    missing_post_uploads = missing_post_uploads.reject { |_, uploads| uploads.empty? }
+    missing_post_uploads.reject do |post_id, uploads|
+      PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
+      count += uploads.count
+    end
+
+    return { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count }
+  end
+
   private
 
   def parse_quote_into_arguments(quote)
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 486d492..1603136 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -36,8 +36,6 @@ class S3Inventory
 
       ActiveRecord::Base.transaction do
         begin
-          table_name = "#{type}_inventory"
-          connection = ActiveRecord::Base.connection.raw_connection
           connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text, PRIMARY KEY(etag, key))")
           connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
             files.each do |file|
@@ -54,6 +52,8 @@ class S3Inventory
             WHERE #{model.table_name}.etag IS NULL
               AND url ILIKE '%' || #{table_name}.key")
 
+          list_missing_post_uploads if type == "original"
+
           uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
           missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
 
@@ -73,6 +73,35 @@ class S3Inventory
     end
   end
 
+  def list_missing_post_uploads
+    log "Listing missing post uploads..."
+
+    missing = Post.find_missing_uploads(include_local_upload: false) do |_, _, _, sha1|
+      next if sha1.blank?
+
+      upload_id = nil
+      result = connection.exec("SELECT * FROM #{table_name} WHERE key LIKE '%original/%/#{sha1}%'")
+
+      if result.count >= 0
+        key = result[0]["key"]
+        data = s3_helper.object(key).data
+        upload_id = Upload.create!(
+          user_id: Discourse.system_user.id,
+          original_filename: "",
+          filesize: data.content_length,
+          url: File.join(Discourse.store.absolute_base_url, key),
+          sha1: sha1,
+          etag: result[0]["etag"]
+        ).id
+      end
+
+      upload_id
+    end
+
+    Discourse.stats.set("missing_post_uploads", missing[:count])
+    log "#{missing[:count]} post uploads are missing."
+  end
+
   def download_inventory_files_to_tmp_directory
     files.each do |file|
       log "Downloading inventory file '#{file[:key]}' to tmp directory..."
@@ -128,6 +157,14 @@ class S3Inventory
 
   private
 
+  def connection
+    @connection ||= ActiveRecord::Base.connection.raw_connection
+  end
+
+  def table_name
+    "#{type}_inventory"
+  end
+
   def files
     @files ||= begin
       symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index 3d1509c..241dbbf 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -390,123 +390,60 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
   puts "", "Done.", ""
 end
 
-def get_missing_uploads
-  PostCustomField.where(name: Post::MISSING_UPLOADS)
-end
-
 desc 'Finds missing post upload records from cooked HTML content'
 task 'posts:missing_uploads' => :environment do
-  get_missing_uploads.delete_all
-
-  upload_patterns = [
-    /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
-    /\/original\//,
-    /\/optimized\//
-  ]
-  missing_uploads = []
   old_scheme_upload_count = 0
-  count = 0
 
-  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
-    ids = posts.pluck(:id)
-    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+  missing = Post.find_missing_uploads do |post, src, path, sha1|
+    next if sha1.present?
 
-    posts.each do |post|
-      missing_post_uploads = []
-      links = Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").map { |media| media.value }.uniq
+    upload_id = nil
 
-      links.each do |src|
-        next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+    # recovering old scheme upload.
+    local_store = FileStore::LocalStore.new
+    public_path = "#{local_store.public_dir}#{path}"
+    file_path = nil
 
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-        next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+    if File.exists?(public_path)
+      file_path = public_path
+    else
+      tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
+      file_path = tombstone_path if File.exists?(tombstone_path)
+    end
 
-        path = begin
-          URI(URI.unescape(src))&.path
-        rescue URI::Error
-        end

[... diff too long, it was truncated ...]

GitHub sha: e8fafbc1

These patterns are too general. Is it possible to reuse the patterns that we’ve defined in HasUrl?

Why do we need this when Upload.get_from_url already does this check?

There are no tests here, so I’m not quite sure why we need this method instead of just passing all the links through Upload.get_from_url like before.

return is implicit in Ruby for the last line, so the explicit return here is not needed.

This method needs to be wrapped in a distributed mutex. Otherwise, what it returns is not guaranteed if two different processes run it at the same time.
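
A minimal sketch of that wrapping, reusing Discourse’s DistributedMutex with a hypothetical lock key (an assumption, not part of the commit):

    def self.find_missing_uploads(include_local_upload: true)
      # "find_missing_uploads" is a made-up lock key; the mutex serializes
      # concurrent runs so the MISSING_UPLOADS custom fields and PostUpload
      # records are not rewritten by two processes at the same time.
      DistributedMutex.synchronize("find_missing_uploads") do
        PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
        # ... rest of the method body unchanged ...
      end
    end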

We probably don’t need two reject loops here.

DEV: Fix undefined variable.

Since find_by on sha1 is faster (it has a dedicated index), I’m trying to use it when possible.

REFACTOR: remove duplicate reject loop and implicit return
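
A sketch of what that refactor could look like, folding the empty-check, the custom field creation, and the count into a single pass (names taken from the diff above):

    count = 0
    missing_post_uploads = missing_post_uploads.reject do |post_id, uploads|
      if uploads.present?
        PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
        count += uploads.count
      end
      uploads.empty? # drop posts with nothing missing, keep the rest
    end

    { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count } # implicit return, no `return` keyword needed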

Upload.get_from_url already does that. It finds by sha1 first and then falls back to the url.
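
Roughly, the lookup order described above (a simplified sketch, not the actual implementation):

    url = "/uploads/default/original/1X/#{"a" * 40}.png"   # example upload URL
    sha1 = Upload.extract_sha1(url)
    upload = Upload.find_by(sha1: sha1) if sha1.present?   # indexed sha1 lookup first
    upload ||= Upload.find_by(url: url)                    # fall back to matching the stored url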

FIX: skip external urls which has upload url in query string.

If you pass an optimized image URL, the Upload.get_from_url method won’t find the corresponding upload, but the each_upload_url method will extract the upload sha1 from both upload and optimized image URLs.
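
For example (made-up paths, only to illustrate the two extraction branches in each_upload_url):

    sha1 = "a" * 40  # placeholder sha1, just for illustration
    Upload.extract_sha1("/uploads/default/original/1X/#{sha1}.png")                     # original upload path
    OptimizedImage.extract_sha1("/uploads/default/optimized/1X/#{sha1}_2_690x388.png")  # optimized image path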

Tests are added in FIX: skip external urls which has upload url in query string. · discourse/discourse@788f995 · GitHub. If you pass http://example.com/path to Upload.get_from_url, it will cost one DB query, but this method will ignore it.
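
In other words, using the patterns from the diff (with the multisite name assumed to be "default" here):

    upload_patterns = [/\/uploads\/default\//, /\/original\//, /\/optimized\//]
    src = "http://example.com/path"
    upload_patterns.none? { |pattern| src =~ pattern }  # => true, so the link is skipped before any DB lookup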

We don’t have patterns in HasUrl.

We don’t need optimized images here, do we? Optimized images are not required to be linked to the post records.

Hmm, that doesn’t sound right based on what I’m seeing in List and restore missing post uploads from S3 inventory. · discourse/discourse@e8fafbc · GitHub. We end up doing a lookup with Upload.find_by(url: url).

I meant Upload::URL_REGEX
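
Hypothetically, if Upload::URL_REGEX covers the canonical /original/ upload paths (an assumption, not verified here), the hard-coded list in each_upload_url could shrink to something like:

    upload_patterns = [Upload::URL_REGEX, /\/optimized\//]  # sketch; assumes URL_REGEX matches original upload paths
    links = ["/uploads/default/original/1X/#{"a" * 40}.png", "http://example.com/path"]
    links.select { |src| upload_patterns.any? { |pattern| src =~ pattern } }  # keeps only the upload-looking link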

No. That line itself won’t be executed in the case of http://example.com/path. We’re already filtering it in List and restore missing post uploads from S3 inventory. · discourse/discourse@e8fafbc · GitHub.