DEV: convert scheduled job EnsurePostUploadsExistence into a rake task

DEV: convert scheduled job EnsurePostUploadsExistence into a rake task

diff --git a/app/jobs/scheduled/ensure_post_uploads_existence.rb b/app/jobs/scheduled/ensure_post_uploads_existence.rb
deleted file mode 100644
index ea1103d..0000000
--- a/app/jobs/scheduled/ensure_post_uploads_existence.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-# frozen_string_literal: true
-
-module Jobs
-
-  class EnsurePostUploadsExistence < Jobs::Scheduled
-    every 1.hour
-
-    MISSING_UPLOADS ||= "missing_uploads"
-
-    def execute(args)
-      return unless SiteSetting.enable_missing_post_uploads_check?
-
-      PostCustomField
-        .where(name: MISSING_UPLOADS)
-        .where("created_at < ?", 1.month.ago)
-        .destroy_all
-
-      Post
-        .joins("LEFT JOIN post_custom_fields cf ON posts.id = cf.post_id AND cf.name = '#{MISSING_UPLOADS}'")
-        .where("(posts.cooked LIKE '%<a %' OR posts.cooked LIKE '%<img %') AND cf.id IS NULL")
-        .find_in_batches(batch_size: 100) do |posts|
-
-        Post.preload_custom_fields(posts, [MISSING_UPLOADS])
-        posts.each do |post|
-          fragments ||= Nokogiri::HTML::fragment(post.cooked)
-          missing = []
-
-          fragments.css("a/@href", "img/@src").each do |media|
-            src = media.value
-            next if src.blank? || (src =~ /\/uploads\//).blank?
-
-            src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-            next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
-
-            missing << src unless Upload.get_from_url(src) || OptimizedImage.get_from_url(src)
-          end
-
-          if missing.present?
-            missing.each { |src| PostCustomField.create!(post_id: post.id, name: MISSING_UPLOADS, value: src) }
-          else
-            PostCustomField.create!(post_id: post.id, name: MISSING_UPLOADS, value: nil)
-          end
-        end
-      end
-    end
-  end
-end
diff --git a/app/models/concerns/has_url.rb b/app/models/concerns/has_url.rb
new file mode 100644
index 0000000..eca6cf9
--- /dev/null
+++ b/app/models/concerns/has_url.rb
@@ -0,0 +1,24 @@
+module HasUrl
+  extend ActiveSupport::Concern
+
+  class_methods do
+    def extract_url(url)
+      url.match(self::URL_REGEX)
+    end
+
+    def get_from_url(url)
+      return if url.blank?
+  
+      uri = begin
+        URI(URI.unescape(url))
+      rescue URI::Error
+      end
+  
+      return if uri&.path.blank?
+      data = extract_url(uri.path)
+      return if data.blank?
+  
+      self.find_by("url LIKE ?", "%#{data[1]}")
+    end
+  end
+end
diff --git a/app/models/optimized_image.rb b/app/models/optimized_image.rb
index ab0f9db..9dc037e 100644
--- a/app/models/optimized_image.rb
+++ b/app/models/optimized_image.rb
@@ -4,10 +4,12 @@ require_dependency "db_helper"
 require_dependency "file_store/local_store"
 
 class OptimizedImage < ActiveRecord::Base
+  include HasUrl
   belongs_to :upload
 
   # BUMP UP if optimized image algorithm changes
   VERSION = 2
+  URL_REGEX ||= /(\/optimized\/\dX[\/\.\w]*\/([a-zA-Z0-9]+)[\.\w]*)/
 
   def self.lock(upload_id, width, height)
     @hostname ||= `hostname`.strip rescue "unknown"
@@ -350,25 +352,6 @@ class OptimizedImage < ActiveRecord::Base
       false
     end
   end
-
-  def self.extract_optimized_url(url)
-    url.match(/(\/optimized\/\dX[\/\.\w]*\/([a-zA-Z0-9]+)[\.\w]*)/)
-  end
-
-  def self.get_from_url(url)
-    return if url.blank?
-
-    uri = begin
-      URI(URI.unescape(url))
-    rescue URI::Error
-    end
-
-    return if uri&.path.blank?
-    data = extract_optimized_url(uri.path)
-    return if data.blank?
-
-    OptimizedImage.find_by("url LIKE ?", "%#{data[1]}")
-  end
 end
 
 # == Schema Information
diff --git a/app/models/upload.rb b/app/models/upload.rb
index 4149582..8b3088c 100644
--- a/app/models/upload.rb
+++ b/app/models/upload.rb
@@ -8,9 +8,11 @@ require_dependency "base62"
 
 class Upload < ActiveRecord::Base
   include ActionView::Helpers::NumberHelper
+  include HasUrl
 
   SHA1_LENGTH = 40
   SEEDED_ID_THRESHOLD = 0
+  URL_REGEX ||= /(\/original\/\dX[\/\.\w]*\/([a-zA-Z0-9]+)[\.\w]*)/
 
   belongs_to :user
 
@@ -192,27 +194,6 @@ class Upload < ActiveRecord::Base
     Digest::SHA1.file(path).hexdigest
   end
 
-  def self.extract_upload_url(url)
-    url.match(/(\/original\/\dX[\/\.\w]*\/([a-zA-Z0-9]+)[\.\w]*)/)
-  end
-
-  def self.get_from_url(url)
-    return if url.blank?
-
-    uri = begin
-      URI(URI.unescape(url))
-    rescue URI::Error
-    end
-
-    return if uri&.path.blank?
-    data = extract_upload_url(uri.path)
-    return if data.blank?
-    sha1 = data[2]
-    upload = nil
-    upload = Upload.find_by(sha1: sha1) if sha1&.length == SHA1_LENGTH
-    upload || Upload.find_by("url LIKE ?", "%#{data[1]}")
-  end
-
   def human_filesize
     number_to_human_size(self.filesize)
   end
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 55a1ff1..7e31adf 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1481,9 +1481,6 @@ developer:
   enable_safe_mode:
     default: true
     client: true
-  enable_missing_post_uploads_check:
-    default: false
-    hidden: true
 
 embedding:
   feed_polling_enabled:
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index 0c78987..09f2242 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -387,3 +387,28 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
 
   puts "", "Done.", ""
 end
+
+desc 'Finds missing post upload records from cooked HTML content'
+task 'posts:missing_uploads' => :environment do
+  name = "missing_uploads"
+  PostCustomField.where(name: name).destroy_all
+  posts = Post.where("posts.cooked LIKE '%<a %' OR posts.cooked LIKE '%<img %'").select(:id, :cooked)
+  missing = []
+
+  posts.find_each do |post|
+    Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").each do |media|
+      src = media.value
+      next if src.blank? || (src =~ /\/uploads\//).blank?
+
+      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+      next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
+
+      missing << src unless Upload.get_from_url(src) || OptimizedImage.get_from_url(src)
+    end
+
+    missing.each { |src| PostCustomField.create!(post_id: post.id, name: name, value: src) }
+    putc "."
+  end
+
+  puts "", "#{missing.count} post uploads are missing.", ""
+end
diff --git a/spec/jobs/ensure_post_uploads_existence_spec.rb b/spec/jobs/ensure_post_uploads_existence_spec.rb
deleted file mode 100644
index 99d187a..0000000
--- a/spec/jobs/ensure_post_uploads_existence_spec.rb
+++ /dev/null
@@ -1,47 +0,0 @@
-require 'rails_helper'
-
-describe Jobs::EnsurePostUploadsExistence do
-
-  context '.execute' do
-    let(:upload) { Fabricate(:upload) }
-    let(:optimized) { Fabricate(:optimized_image, url: '/uploads/default/optimized/1X/d1c2d40ab994e8410c_100x200.png') }
-
-    context "when enabled" do
-      before do
-        SiteSetting.enable_missing_post_uploads_check = true
-      end
-
-      it 'should create post custom field for missing upload' do
-        Fabricate(:post, cooked: "A sample post <img src='#{upload.url}'>")
-        upload.destroy!
-        described_class.new.execute({})
-        field = PostCustomField.find_by(name: Jobs::EnsurePostUploadsExistence::MISSING_UPLOADS)
-        expect(field).to be_present
-        expect(field.value).to eq(upload.url)
-      end
-
-      it 'should create post custom field with nil value' do
-        Fabricate(:post, cooked: "A sample post <a href='#{upload.url}'> <img src='#{optimized.url}'>")
-        described_class.new.execute({})
-        field = PostCustomField.find_by(name: Jobs::EnsurePostUploadsExistence::MISSING_UPLOADS)
-        expect(field).to be_present
-        expect(field.value).to eq(nil)
-      end
-    end
-
-    context "when disabled" do
-      before do
-        SiteSetting.enable_missing_post_uploads_check = false
-      end
-
-      it "does not execute" do

[... diff too long, it was truncated ...]

GitHub sha: 914ada1c

@vinothkannans Ah, the sha1 lookup in `Upload.get_from_url` shouldn’t have been removed. Searching by sha1 is much faster than trying to match on the URL with `LIKE`, because of the index we have on sha1.

1 Like

Fix regression due to 914ada1c749425c4eb53b2d8338afa6b4ab3f3d4.