FIX: Don't index posts with empty `Post#raw` for search. (#7263)

FIX: Don’t index posts with empty Post#raw for search. (#7263)

  • DEV: Remove unnecessary join in Jobs::ReindexSearch.

  • FIX: Don’t index posts with empty Post#raw for search.

diff --git a/app/jobs/scheduled/reindex_search.rb b/app/jobs/scheduled/reindex_search.rb
index 0e9768d..6e919eb 100644
--- a/app/jobs/scheduled/reindex_search.rb
+++ b/app/jobs/scheduled/reindex_search.rb
@@ -9,6 +9,7 @@ module Jobs
       rebuild_problem_categories
       rebuild_problem_users
       rebuild_problem_tags
+      clean_post_search_data
     end
 
     def rebuild_problem_categories(limit = 500)
@@ -60,8 +61,15 @@ module Jobs
 
     private
 
+    def clean_post_search_data
+      PostSearchData
+        .joins("LEFT JOIN posts p ON p.id = post_search_data.post_id")
+        .where("p.raw = ''")
+        .delete_all
+    end
+
     def load_problem_post_ids(limit)
-      Post.joins(:topic)
+      Post
         .where('posts.id IN (
                 SELECT p2.id FROM posts p2
                 LEFT JOIN post_search_data pd ON pd.locale = ? AND pd.version = ? AND p2.id = pd.post_id
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 2f496e1..c7b07c5 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -134,7 +134,7 @@ class SearchIndexer
     category_name = topic.category&.name if topic
     tag_names = topic.tags.pluck(:name).join(' ') if topic
 
-    if Post === obj &&
+    if Post === obj && obj.raw.present? &&
        (
          obj.saved_change_to_cooked? ||
          obj.saved_change_to_topic_id? ||
diff --git a/spec/jobs/reindex_search_spec.rb b/spec/jobs/reindex_search_spec.rb
index 8b8c870..01879bd 100644
--- a/spec/jobs/reindex_search_spec.rb
+++ b/spec/jobs/reindex_search_spec.rb
@@ -28,4 +28,15 @@ describe Jobs::ReindexSearch do
       expect(model.send("#{m}_search_data").version).to eq Search::INDEX_VERSION
     end
   end
+
+  it "should clean up post_search_data of posts with empty raw" do
+    post = Fabricate(:post)
+    post2 = Fabricate(:post, post_type: Post.types[:small_action])
+    post2.raw = ""
+    post2.save!(validate: false)
+
+    expect { subject.execute({}) }.to change { PostSearchData.count }.by(-1)
+    expect(Post.all).to contain_exactly(post, post2)
+    expect(PostSearchData.all).to contain_exactly(post.post_search_data)
+  end
 end
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index 1922237..d748ec6 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -103,5 +103,12 @@ describe SearchIndexer do
       expect { post.update!(topic_id: Fabricate(:topic).id) }
         .to change { post.reload.post_search_data.raw_data }
     end
+
+    it 'should not index posts with empty raw' do
+      expect do
+        post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
+        post.save!(validate: false)
+      end.to_not change { PostSearchData.count }
+    end
   end
 end

GitHub sha: daeda80a

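There is a slight mismatch here, because `' '.present?` is false while `' ' != ''` is true: the indexer (`obj.raw.present?`) skips posts whose raw is only whitespace, but the cleanup query (`p.raw = ''`) removes search data only for posts whose raw is exactly empty.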

It probably does not matter, though.

FIX: Don't attempt to reindex posts that have an empty raw.

This commit has been mentioned on Discourse Meta. There might be relevant details there:

1 Like