FIX: Improve image downsizing script (#9549)

FIX: Improve image downsizing script (#9549)

Correctly handles more upload formats in posts, updates post custom fields, fixes more edge cases, and adds debugging capabilities (the VERBOSE=1 and INTERACTIVE=1 flags).

Includes these commits and some more:

  • DEV: Show the fixed image dimensions
  • FIX: Support more upload url formats
  • DEV: Remove the old upload after updating posts
  • FIX: Use the process_post_#{id} mutex
  • FIX: Avoid rebaking twice
  • DEV: Print out the link to the post
  • DEV: Process posts chronologically
  • DEV: Do a dry-run before saving, pause on any issue
  • FIX: Also process deleted posts
  • DEV: Make matchers case-insensitive
  • DEV: Pause on “detached” uploads, add more debug info
  • DEV: Print out time when finished
  • DEV: Add support for WORKER_ID/WORKER_COUNT
  • DEV: Fix the onebox in cooked text heuristic
  • DEV: Don’t report already processed posts
  • DEV: Beep when done!
  • DEV: Ignore issues with deleted posts
  • DEV: Ignore issues with deleted topics
  • DEV: Multiline SQL
  • DEV: Use the bulk attribute assignment
  • DEV: Add ENV["INTERACTIVE"] mode
  • DEV: Handle post custom fields
  • DEV: Bail on non-S3 sites
  • DEV: Allow sizes smaller than 1 mpix
diff --git a/script/downsize_uploads.rb b/script/downsize_uploads.rb
index 95d65c2..dc2e7b2 100644
--- a/script/downsize_uploads.rb
+++ b/script/downsize_uploads.rb
@@ -2,80 +2,181 @@
 
 require File.expand_path("../../config/environment", __FILE__)
 
-# no less than 1 megapixel
-max_image_pixels = [ARGV[0].to_i, 1_000_000].max
+# Supported ENV arguments:
+#
+# VERBOSE=1
+# Shows debug information.
+#
+# INTERACTIVE=1
+# Shows debug information and pauses for input on issues.
+#
+# WORKER_ID/WORKER_COUNT
+# When running the script on a single forum in multiple terminals.
+# For example, if you want 4 concurrent scripts use WORKER_COUNT=4
+# and WORKER_ID from 0 to 3
+
+MIN_IMAGE_PIXELS = 500_000 # 0.5 megapixels
+DEFAULT_IMAGE_PIXELS = 1_000_000 # 1 megapixel
+
+MAX_IMAGE_PIXELS = [
+  ARGV[0]&.to_i || DEFAULT_IMAGE_PIXELS,
+  MIN_IMAGE_PIXELS
+].max
+
+ENV["VERBOSE"] = "1" if ENV["INTERACTIVE"]
+
+def log(*args)
+  puts(*args) if ENV["VERBOSE"]
+end
 
-puts "", "Downsizing images to no more than #{max_image_pixels} pixels"
+def transform_post(post, upload_before, upload_after)
+  post.raw.gsub!(/upload:\/\/#{upload_before.base62_sha1}(\.#{upload_before.extension})?/i, upload_after.short_url)
+  post.raw.gsub!(Discourse.store.cdn_url(upload_before.url), Discourse.store.cdn_url(upload_after.url))
+  post.raw.gsub!(Discourse.store.url_for(upload_before), Discourse.store.url_for(upload_after))
+  post.raw.gsub!("#{Discourse.base_url}#{upload_before.short_path}", "#{Discourse.base_url}#{upload_after.short_path}")
 
-dimensions_count = 0
-downsized_count = 0
+  path = SiteSetting.Upload.s3_upload_bucket.split("/", 2)[1]
+  post.raw.gsub!(/<img src=\"https:\/\/.+?\/#{path}\/uploads\/default\/optimized\/.+?\/#{upload_before.sha1}_\d_(?<width>\d+)x(?<height>\d+).*?\" alt=\"(?<alt>.*?)\"\/?>/i) do
+    "![#{$~[:alt]}|#{$~[:width]}x#{$~[:height]}](#{upload_after.short_url})"
+  end
 
-def downsize_upload(upload, path, max_image_pixels)
+  post.raw.gsub!(/!\[(.*?)\]\(\/uploads\/.+?\/#{upload_before.sha1}(\.#{upload_before.extension})?\)/i, "![\\1](#{upload_after.short_url})")
+end
+
+def downsize_upload(upload, path)
   # Make sure the filesize is up to date
   upload.filesize = File.size(path)
 
-  OptimizedImage.downsize(path, path, "#{max_image_pixels}@", filename: upload.original_filename)
+  OptimizedImage.downsize(path, path, "#{MAX_IMAGE_PIXELS}@", filename: upload.original_filename)
   sha1 = Upload.generate_digest(path)
 
   if sha1 == upload.sha1
-    puts "no sha1 change" if ENV["VERBOSE"]
+    log "No sha1 change"
     return
   end
 
-  w, h = FastImage.size(path, timeout: 10, raise_on_failure: true)
+  w, h = FastImage.size(path, timeout: 15, raise_on_failure: true)
 
   if !w || !h
-    puts "invalid image dimensions after resizing" if ENV["VERBOSE"]
+    log "Invalid image dimensions after resizing"
     return
   end
 
   # Neither #dup or #clone provide a complete copy
   original_upload = Upload.find(upload.id)
   ww, hh = ImageSizer.resize(w, h)
-  new_file = true
 
-  if existing_upload = Upload.find_by(sha1: sha1)
-    upload = existing_upload
-    new_file = false
-  end
-
-  before = upload.filesize
-  upload.filesize = File.size(path)
-
-  if upload.filesize > before
-    puts "no filesize reduction" if ENV["VERBOSE"]
+  # A different upload record that matches the sha1 of the downsized image
+  existing_upload = Upload.find_by(sha1: sha1)
+  upload = existing_upload if existing_upload
+
+  upload.attributes = {
+    sha1: sha1,
+    width: w,
+    height: h,
+    thumbnail_width: ww,
+    thumbnail_height: hh,
+    filesize: File.size(path)
+  }
+
+  if upload.filesize > upload.filesize_was
+    log "No filesize reduction"
     return
   end
 
-  upload.sha1 = sha1
-  upload.width = w
-  upload.height = h
-  upload.thumbnail_width = ww
-  upload.thumbnail_height = hh
-
-  if new_file
+  unless existing_upload
     url = Discourse.store.store_upload(File.new(path), upload)
 
     unless url
-      puts "couldn't store the upload" if ENV["VERBOSE"]
+      log "Couldn't store the upload"
       return
     end
 
     upload.url = url
   end
 
-  if ENV["VERBOSE"]
-    puts "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
-    puts "sha1: #{original_upload.sha1} -> #{sha1}"
-    puts "is a new file: #{new_file}"
+  log "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
+  log "sha: #{original_upload.sha1} -> #{sha1}"
+  log "(an exisiting upload)" if existing_upload
+
+  success = true
+  posts = Post.unscoped.joins(:post_uploads).where(post_uploads: { upload_id: original_upload.id }).uniq.sort_by(&:created_at)
+
+  posts.each do |post|
+    transform_post(post, original_upload, upload)
+
+    if post.custom_fields[Post::DOWNLOADED_IMAGES].present?
+      downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
+    end
+
+    if post.raw_changed?
+      log "Updating post"
+    elsif downloaded_images&.has_value?(original_upload.id)
+      log "A hotlinked, unreferenced image"
+    elsif post.raw.include?(upload.short_url)
+      log "Already processed"
+    elsif post.trashed?
+      log "A deleted post"
+    elsif !post.topic || post.topic.trashed?
+      log "A deleted topic"
+    elsif post.cooked.include?(original_upload.sha1)
+      if post.raw.include?("#{Discourse.base_url.sub(/^https?:\/\//i, "")}/t/")
+        log "Updating a topic onebox"
+      else
+        log "Updating an external onebox"
+      end
+    else
+      log "Could not find the upload URL"
+      success = false
+    end
+
+    log "#{Discourse.base_url}/p/#{post.id}"
+  end
+
+  if posts.empty?
+    log "Upload not used in any posts"
+
+    if User.where(uploaded_avatar_id: original_upload.id).count
+      log "Used as a User avatar"
+    elsif UserAvatar.where(gravatar_upload_id: original_upload.id).count
+      log "Used as a UserAvatar gravatar"
+    elsif UserAvatar.where(custom_upload_id: original_upload.id).count
+      log "Used as a UserAvatar custom upload"
+    elsif UserProfile.where(profile_background_upload_id: original_upload.id).count
+      log "Used as a UserProfile profile background"
+    elsif UserProfile.where(card_background_upload_id: original_upload.id).count
+      log "Used as a UserProfile card background"
+    elsif Category.where(uploaded_logo_id: original_upload.id).count
+      log "Used as a Category logo"
+    elsif Category.where(uploaded_background_id: original_upload.id).count
+      log "Used as a Category background"
+    elsif CustomEmoji.where(upload_id: original_upload.id).count
+      log "Used as a CustomEmoji"
+    elsif ThemeField.where(upload_id: original_upload.id).count
+      log "Used as a ThemeField"
+    else
+      success = false
+    end
+  end
+
+  unless success
+    if ENV["INTERACTIVE"]
+      print "Press any key to continue with the upload"
+      STDIN.beep
+      STDIN.getch
+      puts " k"
+    elsif !existing_upload && !Upload.where(url: upload.url).exist?
+      # We're bailing, so clean up the just uploaded file
+      Discourse.store.remove_upload(upload)
+
+      log "⏩ Skipping"
+      return
+    end
   end
 
   upload.save!
 
-  if new_file
-    upload.optimized_images.each(&:destroy!)
-    Discourse.store.remove_upload(original_upload)
-  else
+  if existing_upload
     begin
       PostUpload.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
     rescue ActiveRecord::RecordNotUnique, PG::UniqueViolation
@@ -90,85 +191,162 @@ def downsize_upload(upload, path, max_image_pixels)
     Category.where(uploaded_background_id: original_upload.id).update_all(uploaded_background_id: upload.id)
     CustomEmoji.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
     ThemeField.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
+  else
+    upload.optimized_images.each(&:destroy!)
   end
 
-  original_upload.posts.each do |post|

[... diff too long, it was truncated ...]

GitHub sha: eb462bfb

1 Like

This commit appears in #9549, which was approved by eviltrout. It was merged by CvX.