REFACTOR: Refactor pull_hotlinked_images job

REFACTOR: Refactor pull_hotlinked_images job

This commit should cause no functional change

  • Split into functions to avoid deep nesting
  • Register custom field type, and remove manual json parse/serialize
  • Recover from deleted upload records

Also adds a test to ensure pull_hotlinked_images redownloads secure images only once

diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb
index 5e3266d..5a6ab64 100644
--- a/app/jobs/regular/pull_hotlinked_images.rb
+++ b/app/jobs/regular/pull_hotlinked_images.rb
@@ -9,6 +9,89 @@ module Jobs
       @max_size = SiteSetting.max_image_size_kb.kilobytes
     end
 
+    def execute(args)
+      @post_id = args[:post_id]
+      raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
+
+      post = Post.find_by(id: @post_id)
+      return if post.blank?
+
+      raw = post.raw.dup
+      start_raw = raw.dup
+
+      large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || []
+      broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || []
+      downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {}
+
+      upload_records = Upload.where(id: downloaded_image_ids.values)
+      upload_records = Hash[upload_records.map { |u| [u.id, u] }]
+
+      downloaded_images = {}
+      downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] }
+
+      extract_images_from(post.cooked).each do |node|
+        download_src = original_src = node['src'] || node['href']
+        download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?("//")
+        normalized_src = normalize_src(download_src)
+
+        next if !should_download_image?(download_src, post)
+
+        begin
+          already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src)
+          if !already_attempted_download
+            downloaded_images[normalized_src] = attempt_download(download_src, post.user_id)
+          end
+        rescue ImageTooLargeError
+          large_image_urls << normalized_src
+        rescue ImageBrokenError
+          broken_image_urls << normalized_src
+        end
+
+        # have we successfully downloaded that file?
+        if upload = downloaded_images[normalized_src]
+          raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
+        end
+      rescue => e
+        raise e if Rails.env.test?
+        log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
+      end
+
+      large_image_urls.uniq!
+      broken_image_urls.uniq!
+      downloaded_images.compact!
+
+      post.custom_fields[Post::LARGE_IMAGES] = large_image_urls
+      post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls
+
+      downloaded_image_ids = {}
+      downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id }
+      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids
+
+      [Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key|
+        post.custom_fields.delete(key) if !post.custom_fields[key].present?
+      end
+
+      custom_fields_updated = !post.custom_fields_clean?
+
+      # only save custom fields if they changed
+      post.save_custom_fields if custom_fields_updated
+
+      # If post changed while we were downloading images, never apply edits
+      post.reload
+      post_changed_elsewhere = (start_raw != post.raw)
+      raw_changed_here = (raw != post.raw)
+
+      if !post_changed_elsewhere && raw_changed_here
+        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
+        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
+      elsif custom_fields_updated
+        post.trigger_post_process(
+          bypass_bump: true,
+          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
+        )
+      end
+    end
+
     def download(src)
       downloaded = nil
 
@@ -32,131 +115,67 @@ module Jobs
       downloaded
     end
 
-    def execute(args)
-      post_id = args[:post_id]
-      raise Discourse::InvalidParameters.new(:post_id) unless post_id.present?
+    class ImageTooLargeError < StandardError; end
+    class ImageBrokenError < StandardError; end
 
-      post = Post.find_by(id: post_id)
-      return unless post.present?
+    def attempt_download(src, user_id)
+      # secure-media-uploads endpoint prevents anonymous downloads, so we
+      # need the presigned S3 URL here
+      src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
 
-      raw = post.raw.dup
-      start_raw = raw.dup
+      hotlinked = download(src)
+      raise ImageBrokenError if !hotlinked
+      raise ImageTooLargeError if File.size(hotlinked.path) > @max_size
 
-      downloaded_urls = {}
+      filename = File.basename(URI.parse(src).path)
+      filename << File.extname(hotlinked.path) unless filename["."]
+      upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
 
-      large_images = JSON.parse(post.custom_fields[Post::LARGE_IMAGES].presence || "[]")
-      broken_images = JSON.parse(post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
-      downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
-
-      has_new_large_image  = false
-      has_new_broken_image = false
-      has_downloaded_image = false
+      if upload.persisted?
+        upload
+      else
+        log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
+        nil
+      end
+    end
 
-      extract_images_from(post.cooked).each do |node|
-        src = original_src = node['src'] || node['href']
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-
-        if should_download_image?(src, post)
-          begin
-            # have we already downloaded that file?
-            schemeless_src = normalize_src(original_src)
-
-            unless downloaded_images.include?(schemeless_src) || large_images.include?(schemeless_src) || broken_images.include?(schemeless_src)
-
-              # secure-media-uploads endpoint prevents anonymous downloads, so we
-              # need the presigned S3 URL here
-              src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
-
-              if hotlinked = download(src)
-                if File.size(hotlinked.path) <= @max_size
-                  filename = File.basename(URI.parse(src).path)
-                  filename << File.extname(hotlinked.path) unless filename["."]
-                  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
-
-                  if upload.persisted?
-                    downloaded_urls[src] = upload.url
-                    downloaded_images[normalize_src(src)] = upload.id
-                    has_downloaded_image = true
-                  else
-                    log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
-                  end
-                else
-                  large_images << normalize_src(original_src)
-                  has_new_large_image = true
-                end
-              else
-                broken_images << normalize_src(original_src)
-                has_new_broken_image = true
-              end
+    def replace_in_raw(original_src:, raw:, upload:)
+      raw = raw.dup
+      escaped_src = Regexp.escape(original_src)
+
+      replace_raw = ->(match, match_src, replacement, _index) {
+        if normalize_src(original_src) == normalize_src(match_src)
+          replacement =
+            if replacement.include?(InlineUploads::PLACEHOLDER)
+              replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
+            elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
+              replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
             end
 
-            # have we successfully downloaded that file?

[... diff too long, it was truncated ...]

GitHub sha: cb12a721

1 Like

This commit appears in #10342 which was approved by eviltrout. It was merged by davidtaylorhq.

This commit has been mentioned on Discourse Meta. There might be relevant details there:

https://meta.discourse.org/t/pullhotlinkedimages-does-not-replace-images-again-after-post-is-edited/154184/8