FIX: Relevance search will now consider document length in ranking.

FIX: Relevance search will now consider document length in ranking.

The default ranking option ranks by the number of matches, which is highly problematic when posts are stuffed with a keyword. The rank will now be divided by the document length, which is a much fairer way to rank.

diff --git a/lib/search.rb b/lib/search.rb
index d89e231..d0e296a 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -838,13 +838,14 @@ class Search
         posts = posts.order("posts.like_count DESC")
       end
     else
-      # 0|32 default normalization scaled into the range zero to one
+      # 2|32 divides the rank by the document length and scales the range from
+      # zero to one
       data_ranking = <<~SQL
       (
         TS_RANK_CD(
           post_search_data.search_data,
           #{ts_query(weight_filter: weights)},
-          0|32
+          2|32
         ) *
         (
           CASE categories.search_priority
diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb
index 18dd5d9..b759b81 100644
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@@ -334,6 +334,27 @@ describe Search do
         expect(result.posts).to contain_exactly(reply)
         expect(result.blurb(reply)).to eq(expected_blurb)
       end
+
+      it 'does not allow a post with repeated words to dominate the ranking' do
+        category = Fabricate(:category, name: "winter is coming")
+
+        post = Fabricate(:post,
+          raw: "I think winter will end soon",
+          topic: Fabricate(:topic,
+            title: "dragon john snow winter",
+            category: category
+          )
+        )
+
+        post2 = Fabricate(:post,
+          raw: "I think winter winter winter winter winter will end soon",
+          topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
+        )
+
+        result = Search.execute('winter')
+
+        expect(result.posts).to eq([post, post2, category.topic.first_post])
+      end
     end
 
     context 'searching for quoted title' do
@@ -940,22 +961,45 @@ describe Search do
       today        = Date.today
       yesterday    = 1.day.ago
       two_days_ago = 2.days.ago
+      category = Fabricate(:category)
+
+      old_topic = Fabricate(:topic,
+        title: 'First Topic, testing the created_at sort',
+        created_at: two_days_ago,
+        category: category
+      )
 
-      old_topic    = Fabricate(:topic,
-          title: 'First Topic, testing the created_at sort',
-          created_at: two_days_ago)
       latest_topic = Fabricate(:topic,
-          title: 'Second Topic, testing the created_at sort',
-          created_at: yesterday)
+        title: 'Second Topic, testing the created_at sort',
+        created_at: yesterday,
+        category: category
+      )
+
+      old_relevant_topic_post = Fabricate(:post,
+        topic: old_topic,
+        created_at: yesterday,
+        raw: 'Relevant Relevant Topic'
+      )
 
-      old_relevant_topic_post     = Fabricate(:post, topic: old_topic, created_at: yesterday, raw: 'Relevant Topic')
-      latest_irelevant_topic_post = Fabricate(:post, topic: latest_topic, created_at: today, raw: 'Not Relevant')
+      latest_irelevant_topic_post = Fabricate(:post,
+        topic: latest_topic,
+        created_at: today,
+        raw: 'Not Relevant'
+      )
 
       # Expecting the default results
-      expect(Search.execute('Topic').posts.map(&:id)).to eq([old_relevant_topic_post.id, latest_irelevant_topic_post.id])
+      expect(Search.execute('Topic').posts).to contain_exactly(
+        old_relevant_topic_post,
+        latest_irelevant_topic_post,
+        category.topic.first_post
+      )
 
       # Expecting the ordered by topic creation results
-      expect(Search.execute('Topic order:latest_topic').posts.map(&:id)).to eq([latest_irelevant_topic_post.id, old_relevant_topic_post.id])
+      expect(Search.execute('Topic order:latest_topic').posts).to contain_exactly(
+        latest_irelevant_topic_post,
+        old_relevant_topic_post,
+        category.topic.first_post
+      )
     end
 
     it 'can tokenize dots' do

GitHub sha: e87ca594

2 Likes

FIX: Avoid penalizing long documents too much in search.

This commit has been mentioned on Discourse Meta. There might be relevant details there:

1 Like