FIX: Don't attempt to reindex posts that have an empty raw.

FIX: Don’t attempt to reindex posts that have an empty raw.

If the ids of such posts keep being loaded, we might end up in a situation where we’re always loading the same post ids over and over again without indexing anything new.

Follow up to daeda80ada1d914d52dcc53a2cda087e5f62330c.

diff --git a/app/jobs/scheduled/reindex_search.rb b/app/jobs/scheduled/reindex_search.rb
index 6e919eb..ef341ab 100644
--- a/app/jobs/scheduled/reindex_search.rb
+++ b/app/jobs/scheduled/reindex_search.rb
@@ -12,7 +12,7 @@ module Jobs
       clean_post_search_data
     end
 
-    def rebuild_problem_categories(limit = 500)
+    def rebuild_problem_categories(limit: 500)
       category_ids = load_problem_category_ids(limit)
 
       category_ids.each do |id|
@@ -21,7 +21,7 @@ module Jobs
       end
     end
 
-    def rebuild_problem_users(limit = 10000)
+    def rebuild_problem_users(limit: 10000)
       user_ids = load_problem_user_ids(limit)
 
       user_ids.each do |id|
@@ -30,7 +30,7 @@ module Jobs
       end
     end
 
-    def rebuild_problem_topics(limit = 10000)
+    def rebuild_problem_topics(limit: 10000)
       topic_ids = load_problem_topic_ids(limit)
 
       topic_ids.each do |id|
@@ -39,18 +39,18 @@ module Jobs
       end
     end
 
-    def rebuild_problem_posts(limit = 20000)
+    def rebuild_problem_posts(limit: 20000, indexer: SearchIndexer)
       post_ids = load_problem_post_ids(limit)
 
       post_ids.each do |id|
         # could be deleted while iterating through batch
         if post = Post.find_by(id: id)
-          SearchIndexer.index(post, force: true)
+          indexer.index(post, force: true)
         end
       end
     end
 
-    def rebuild_problem_tags(limit = 10000)
+    def rebuild_problem_tags(limit: 10000)
       tag_ids = load_problem_tag_ids(limit)
 
       tag_ids.each do |id|
@@ -75,6 +75,7 @@ module Jobs
                 LEFT JOIN post_search_data pd ON pd.locale = ? AND pd.version = ? AND p2.id = pd.post_id
                 WHERE pd.post_id IS NULL
                 )', SiteSetting.default_locale, Search::INDEX_VERSION)
+        .where("posts.raw != ''")
         .limit(limit)
         .order('posts.id DESC')
         .pluck(:id)
diff --git a/spec/jobs/reindex_search_spec.rb b/spec/jobs/reindex_search_spec.rb
index 01879bd..8eef192 100644
--- a/spec/jobs/reindex_search_spec.rb
+++ b/spec/jobs/reindex_search_spec.rb
@@ -29,14 +29,53 @@ describe Jobs::ReindexSearch do
     end
   end
 
-  it "should clean up post_search_data of posts with empty raw" do
-    post = Fabricate(:post)
-    post2 = Fabricate(:post, post_type: Post.types[:small_action])
-    post2.raw = ""
-    post2.save!(validate: false)
-
-    expect { subject.execute({}) }.to change { PostSearchData.count }.by(-1)
-    expect(Post.all).to contain_exactly(post, post2)
-    expect(PostSearchData.all).to contain_exactly(post.post_search_data)
+  describe 'rebuild_problem_posts' do
+    class FakeIndexer
+      def self.index(post, force:)
+        @posts ||= []
+        @posts.push(post)
+      end
+
+      def self.posts
+        @posts
+      end
+
+      def self.reset
+        @posts.clear
+      end
+    end
+
+    after do
+      FakeIndexer.reset
+    end
+
+    it 'should not reindex posts with empty raw' do
+      post = Fabricate(:post)
+      post.post_search_data.destroy!
+
+      post2 = Fabricate.build(:post,
+        raw: "",
+        post_type: Post.types[:small_action]
+      )
+
+      post2.save!(validate: false)
+
+      subject.rebuild_problem_posts(indexer: FakeIndexer)
+
+      expect(FakeIndexer.posts).to contain_exactly(post)
+    end
+  end
+
+  describe '#execute' do
+    it "should clean up post_search_data of posts with empty raw" do
+      post = Fabricate(:post)
+      post2 = Fabricate(:post, post_type: Post.types[:small_action])
+      post2.raw = ""
+      post2.save!(validate: false)
+
+      expect { subject.execute({}) }.to change { PostSearchData.count }.by(-1)
+      expect(Post.all).to contain_exactly(post, post2)
+      expect(PostSearchData.all).to contain_exactly(post.post_search_data)
+    end
   end
 end

GitHub sha: 3fc5dbb0