FIX: Ensure that aggregating search shows the post with the higest rank.

FIX: Ensure that aggregating search shows the post with the higest rank.

Previously, we would only take either the MIN or MAX for post_number during aggregation meaning that the ranking is not considered.

require 'benchmark/ips'

Benchmark.ips do |x|
  x.config(time: 10, warmup: 2)

  x.report("current aggregate search query") do
    DB.exec <<~SQL
    SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT topics.id, min(posts.post_number) post_number FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN (
      SELECT categories.id WHERE categories.search_priority = 1
    )
    ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted)) GROUP BY topics.id ORDER BY MAX((
      TS_RANK_CD(
        post_search_data.search_data,
        TO_TSQUERY('english', '''postgres'':*ABCD'),
        1|32
      ) *
      (
        CASE categories.search_priority
        WHEN 2
        THEN 0.6
        WHEN 3
        THEN 0.8
        WHEN 4
        THEN 1.2
        WHEN 5
        THEN 1.4
        ELSE
          CASE WHEN topics.closed
          THEN 0.9
          ELSE 1
          END
        END
      )
    )
    ) DESC, topics.bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number;
    SQL
  end

  x.report("current aggregate search query with proper ranking") do
    DB.exec <<~SQL
    SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT subquery.topic_id id, (ARRAY_AGG(subquery.post_number))[1] post_number, MAX(subquery.rank) rank, MAX(subquery.bumped_at) bumped_at FROM (SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id", (
      TS_RANK_CD(
        post_search_data.search_data,
        TO_TSQUERY('english', '''postgres'':*ABCD'),
        1|32
      ) *
      (
        CASE categories.search_priority
        WHEN 2
        THEN 0.6
        WHEN 3
        THEN 0.8
        WHEN 4
        THEN 1.2
        WHEN 5
        THEN 1.4
        ELSE
          CASE WHEN topics.closed
          THEN 0.9
          ELSE 1
          END
        END
      )
    )
     rank, topics.bumped_at bumped_at FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN (
      SELECT categories.id WHERE categories.search_priority = 1
    )
    ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted))) subquery GROUP BY subquery.topic_id ORDER BY rank DESC, bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number;
    SQL
  end

  x.compare!
end
Warming up --------------------------------------
current aggregate search query
                         1.000  i/100ms
current aggregate search query with proper ranking
                         1.000  i/100ms
Calculating -------------------------------------
current aggregate search query
                         17.726  (± 0.0%) i/s -    178.000  in  10.045107s
current aggregate search query with proper ranking
                         17.802  (± 0.0%) i/s -    178.000  in  10.002230s

Comparison:
current aggregate search query with proper ranking:       17.8 i/s
current aggregate search query:       17.7 i/s - 1.00x  (± 0.00) slower
diff --git a/lib/search.rb b/lib/search.rb
index 564c134..d17b35a 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -794,9 +794,7 @@ class Search
 
   PHRASE_MATCH_REGEXP_PATTERN = '"([^"]+)"'
 
-  def posts_query(limit, opts = nil)
-    opts ||= {}
-
+  def posts_query(limit, type_filter: nil, aggregate_search: false)
     posts = Post.where(post_type: Topic.visible_post_types(@guardian.user))
       .joins(:post_search_data, :topic)
       .joins("LEFT JOIN categories ON categories.id = topics.category_id")
@@ -805,13 +803,13 @@ class Search
 
     posts = posts.where("topics.visible") unless is_topic_search
 
-    if opts[:type_filter] === "private_messages" || (is_topic_search && @search_context.private_message?)
+    if type_filter === "private_messages" || (is_topic_search && @search_context.private_message?)
       posts = posts.where("topics.archetype =  ?", Archetype.private_message)
 
        unless @guardian.is_admin?
          posts = posts.private_posts_for_user(@guardian.user)
        end
-    elsif opts[:type_filter] === "all_topics"
+    elsif type_filter === "all_topics"
       private_posts = posts.where("topics.archetype = ?", Archetype.private_message)
       private_posts = private_posts.private_posts_for_user(@guardian.user)
 
@@ -861,7 +859,7 @@ class Search
     posts =
       if @search_context.present?
         if @search_context.is_a?(User)
-          if opts[:type_filter] === "private_messages"
+          if type_filter === "private_messages"
             @direct_pms_only ? posts : posts.private_posts_for_user(@search_context)
           else
             posts.where("posts.user_id = #{@search_context.id}")
@@ -886,14 +884,58 @@ class Search
         posts = categories_ignored(posts) unless @category_filter_matched
         posts
       end
+
+    if aggregate_search
+      aggregate_relation = Post.unscoped
+        .select(
+          "subquery.topic_id id",
+          "(ARRAY_AGG(subquery.post_number))[1] post_number",
+        )
+        .group("subquery.topic_id")
+
+      posts = posts.select(posts.arel.projections)
+    end
+
     if @order == :latest
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(posts.created_at) DESC")
-      else
-        posts = posts.reorder("posts.created_at DESC")
+      posts = posts.reorder("posts.created_at DESC")
+
+      if aggregate_search
+        aggregate_relation = aggregate_relation
+          .select("MAX(subquery.created_at) created_at")
+          .order("created_at DESC")
       end
-    elsif @term.blank? && !@order
-      data_ranking = <<~SQL
+    elsif @order == :latest_topic
+      posts = posts.order("topic_created_at DESC")
+
+      if aggregate_search
+        posts = posts.select("topics.created_at topic_created_at")
+
+        aggregate_relation = aggregate_relation
+          .select("MAX(subquery.topic_created_at) topic_created_at")
+          .order("topic_created_at DESC")
+      end
+    elsif @order == :views
+      posts = posts.order("topic_views DESC")
+
+      if aggregate_search
+        posts = posts.select("topics.views topic_views")
+
+        aggregate_relation = aggregate_relation
+          .select("MAX(subquery.topic_views) topic_views")
+          .order("topic_views DESC")
+      end
+    elsif @order == :likes
+      posts = posts.order("posts.like_count DESC")
+    else
+      rank = <<~SQL
+      TS_RANK_CD(
+        post_search_data.search_data,
+        #{@term.blank? ? '' : ts_query(weight_filter: weights)},
+        #{SiteSetting.search_ranking_normalization}|32
+      )
+      SQL
+
+      category_priority_weights = <<~SQL
       (
         CASE categories.search_priority
         WHEN #{Searchable::PRIORITIES[:very_low]}
@@ -912,64 +954,24 @@ class Search
         END
       )
       SQL
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(#{data_ranking}) DESC").order("MAX(posts.created_at) DESC")
-      else
-        posts = posts.order("#{data_ranking} DESC").order("posts.created_at DESC")
-      end
-    elsif @order == :latest_topic
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(topics.created_at) DESC")
-      else
-        posts = posts.order("topics.created_at DESC")
-      end
-    elsif @order == :views
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(topics.views) DESC")
-      else
-        posts = posts.order("topics.views DESC")
-      end
-    elsif @order == :likes
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(posts.like_count) DESC")
-      else
-        posts = posts.order("posts.like_count DESC")
-      end
-    else
-      data_ranking = <<~SQL
-      (
-        TS_RANK_CD(
-          post_search_data.search_data,
-          #{ts_query(weight_filter: weights)},
-          #{SiteSetting.search_ranking_normalization}|32
-        ) *
-        (
-          CASE categories.search_priority
-          WHEN #{Searchable::PRIORITIES[:very_low]}
-          THEN #{SiteSetting.category_search_priority_very_low_weight}
-          WHEN #{Searchable::PRIORITIES[:low]}
-          THEN #{SiteSetting.category_search_priority_low_weight}
-          WHEN #{Searchable::PRIORITIES[:high]}
-          THEN #{SiteSetting.category_search_priority_high_weight}
-          WHEN #{Searchable::PRIORITIES[:very_high]}
-          THEN #{SiteSetting.category_search_priority_very_high_weight}
-          ELSE
-            CASE WHEN topics.closed
-            THEN 0.9
-            ELSE 1
-            END
-          END
-        )
-      )
-      SQL
 
-      if opts[:aggregate_search]
-        posts = posts.order("MAX(#{data_ranking}) DESC")
+      data_ranking =
+        if @term.blank?
+          "(#{category_priority_weights})"
+        else
+          "(#{rank} * #{category_priority_weights})"
+        end
+
+      if aggregate_search
+        posts = posts.select("#{data_ranking} rank", "topics.bumped_at topic_bumped_at")
+          .order("rank DESC", "topic_bumped_at DESC")
+
+        aggregate_relation = aggregate_relation
+          .select("MAX(subquery.rank) rank", "MAX(subquery.topic_bumped_at) topic_bumped_at")
+          .order("rank DESC", "topic_bumped_at DESC")
       else
-        posts = posts.order("#{data_ranking} DESC")
+        posts = posts.order("#{data_ranking} DESC", "topics.bumped_at DESC")
       end
-
-      posts = posts.order("topics.bumped_at DESC")
     end
 
     if secure_category_ids.present?
@@ -978,6 +980,11 @@ class Search
       posts = posts.where("(categories.id IS NULL) OR (NOT categories.read_restricted)").references(:categories)
     end
 
+    if aggregate_search
+      posts = yield(posts) if block_given?
+      posts = aggregate_relation.from(posts)
+    end
+
     posts = posts.offset(offset)
     posts.limit(limit)
   end
@@ -1032,29 +1039,42 @@ class Search
   end
 
   def aggregate_post_sql(opts)
-    min_or_max = @order == :latest ? "max" : "min"
+    default_opts = {
+      type_filter: opts[:type_filter]
+    }
+
+    min_id = Search.min_post_id
 
-    query =
-      if @order == :likes
-        # likes are a pain to aggregate so skip
-        posts_query(limit, type_filter: opts[:type_filter])
-          .select('topics.id', "posts.post_number")
+    if @order == :likes
+      # likes are a pain to aggregate so skip
+      query = posts_query(limit, **default_opts).select('topics.id', 'posts.post_number')
+
+      if min_id > 0
+        low_set = query.dup.where("post_search_data.post_id < #{min_id}")
+        high_set = query.where("post_search_data.post_id >= #{min_id}")
+
+        { default: wrap_rows(high_set), remaining: wrap_rows(low_set) }
       else
-        posts_query(limit, aggregate_search: true, type_filter: opts[:type_filter])
-          .select('topics.id', "#{min_or_max}(posts.post_number) post_number")
-          .group('topics.id')
+        { default: wrap_rows(query) }
+      end
+    else
+      query = posts_query(limit, **default_opts, aggregate_search: true) do |posts|

[... diff too long, it was truncated ...]

GitHub sha: d8c796bc