FEATURE: ensure consistency of post uploads in cooked content

approved
#1

FEATURE: ensure consistency of post uploads in cooked content

diff --git a/app/jobs/scheduled/ensure_post_uploads_existence.rb b/app/jobs/scheduled/ensure_post_uploads_existence.rb
new file mode 100644
index 0000000..0e2a8be
--- /dev/null
+++ b/app/jobs/scheduled/ensure_post_uploads_existence.rb
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+module Jobs
+  # Hourly job: flags posts whose cooked HTML references uploads that have no matching
+  # Upload/OptimizedImage record by storing those URLs in a post custom field.
+  class EnsurePostUploadsExistence < Jobs::Scheduled
+    every 1.hour
+
+    MISSING_UPLOADS ||= "missing_uploads"
+
+    def execute(args)
+      # Drop markers older than a month so those posts are scanned again below.
+      PostCustomField
+        .where(name: MISSING_UPLOADS)
+        .where("created_at < ?", 1.month.ago)
+        .destroy_all
+
+      Post
+        .joins("LEFT JOIN post_custom_fields cf ON posts.id = cf.post_id AND cf.name = '#{MISSING_UPLOADS}'")
+        .where("(posts.cooked LIKE '%<a %' OR posts.cooked LIKE '%<img %') AND cf.id IS NULL")
+        .find_in_batches(batch_size: 100) do |posts|
+          Post.preload_custom_fields(posts, [MISSING_UPLOADS])
+          posts.each do |post|
+            # Plain `=`: a block-local is fresh on each iteration, so `||=` never memoized anything.
+            fragments = Nokogiri::HTML::fragment(post.cooked)
+            missing = []
+
+            fragments.css("a/@href", "img/@src").each do |media|
+              src = media.value
+              next if src.blank? || (src =~ /\/uploads\//).blank?
+              src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+              next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+              missing << src unless Upload.get_from_url(src) || OptimizedImage.get_from_url(src)
+            end
+
+            if missing.present?
+              post.preloaded_custom_fields = nil
+              post.custom_fields[MISSING_UPLOADS] = missing
+              post.save_custom_fields
+            elsif post.custom_fields[MISSING_UPLOADS].present?
+              # find_by returns nil when no row matches; `&.` avoids a NoMethodError on nil.
+              PostCustomField.find_by(post_id: post.id, name: MISSING_UPLOADS)&.destroy!
+            end
+          end
+      end
+    end
+  end
+end
diff --git a/app/jobs/scheduled/migrate_upload_scheme.rb b/app/jobs/scheduled/migrate_upload_scheme.rb
index d72108a..639c662 100644
--- a/app/jobs/scheduled/migrate_upload_scheme.rb
+++ b/app/jobs/scheduled/migrate_upload_scheme.rb
@@ -36,6 +36,30 @@ module Jobs
         optimized_image.destroy!
         upload.rebake_posts_on_old_scheme
       end
+
+      # One-off audit of posts containing images: each entry returned by
+      # post.find_missing_uploads is tagged with a pu_* custom field saying where its file is.
+      Post.where("cooked LIKE '%<img %'").find_each do |post|
+        missing = post.find_missing_uploads
+        next if missing.blank?
+
+        missing.each do |src|
+          # Non-mutating sub: leave the strings returned by find_missing_uploads untouched.
+          path = src.sub("https://discourse-cdn-sjc1.com/mcneel", "")
+          next unless path.split("/").length == 5
+
+          field_name =
+            if File.exist?("#{Discourse.store.public_dir}#{path}")
+              "pu_found"
+            elsif File.exist?("#{Discourse.store.tombstone_dir}#{path}")
+              "pu_tombstone"
+            else
+              "pu_missing"
+            end
+          # The label goes in `name`, the attribute the job and spec in this change use.
+          PostCustomField.create!(post_id: post.id, value: path, name: field_name)
+        end
+      end
     end
 
   end
diff --git a/app/models/optimized_image.rb b/app/models/optimized_image.rb
index 3860d34..ab0f9db 100644
--- a/app/models/optimized_image.rb
+++ b/app/models/optimized_image.rb
@@ -350,6 +350,25 @@ class OptimizedImage < ActiveRecord::Base
       false
     end
   end
+
+  # Returns the MatchData for the "/optimized/<n>X/..." portion of +url+, or nil when absent.
+  def self.extract_optimized_url(url)
+    url.match(/(\/optimized\/\dX[\/\.\w]*\/([a-zA-Z0-9]+)[\.\w]*)/)
+  end
+
+  # Finds the OptimizedImage whose url ends with the optimized path in +url+; nil if none.
+  def self.get_from_url(url)
+    return if url.blank?
+
+    uri = begin
+      # URI.unescape is obsolete (removed in Ruby 3.0); it merely delegated to this parser call.
+      URI(URI::DEFAULT_PARSER.unescape(url))
+    rescue URI::Error
+    end
+    return if uri&.path.blank?
+    data = extract_optimized_url(uri.path)
+    OptimizedImage.find_by("url LIKE ?", "%#{data[1]}") if data
+  end
 end
 
 # == Schema Information
diff --git a/spec/jobs/ensure_post_uploads_existence_spec.rb b/spec/jobs/ensure_post_uploads_existence_spec.rb
new file mode 100644
index 0000000..e057c0b
--- /dev/null
+++ b/spec/jobs/ensure_post_uploads_existence_spec.rb
@@ -0,0 +1,28 @@
+require 'rails_helper'
+
+describe Jobs::EnsurePostUploadsExistence do
+  # Runs the job against fabricated posts and checks which custom fields it writes.
+  context '.execute' do
+    let(:upload) { Fabricate(:upload) }
+    let(:optimized) { Fabricate(:optimized_image, url: '/uploads/default/optimized/1X/d1c2d40ab994e8410c_100x200.png') }
+
+    it 'should create post custom field for missing upload' do
+      Fabricate(:post, cooked: "A sample post <img src='#{upload.url}'>")
+      upload.destroy!
+      described_class.new.execute({})
+      field = PostCustomField.last
+      expect(field.name).to eq(Jobs::EnsurePostUploadsExistence::MISSING_UPLOADS)
+      expect(field.value).to eq(upload.url)
+    end
+
+    it 'should not create post custom fields' do
+      # Both referenced records still exist here, so nothing should be flagged.
+      Fabricate(:post, cooked: "A sample post <a href='#{upload.url}'> <img src='#{optimized.url}'>")
+      expect {
+        described_class.new.execute({})
+      }.not_to change {
+        PostCustomField.count
+      }
+    end
+  end
+end

GitHub sha: b3fb0a70

#2

Removed in the commit “remove unrelated code” (discourse/discourse@3b581de on GitHub).

Follow Up #4
#8

The following two methods have very similar logic to Upload#extract_upload_url and Upload.get_from_url. Is there a way to reduce the duplication here?

2 Likes
#9

Created a “concern” here to reduce the duplication.

Approved #11