PERF: Avoid parsing `Post#cooked` with Nokogiri for every search.

PERF: Avoid parsing `Post#cooked` with Nokogiri for every search.

diff --git a/app/controllers/similar_topics_controller.rb b/app/controllers/similar_topics_controller.rb
index a426778..e2f3314 100644
--- a/app/controllers/similar_topics_controller.rb
+++ b/app/controllers/similar_topics_controller.rb
@@ -10,7 +10,7 @@ class SimilarTopicsController < ApplicationController
     attr_reader :topic
 
     def blurb
-      Search::GroupedSearchResults.blurb_for(@topic.try(:blurb))
+      Search::GroupedSearchResults.blurb_for(cooked: @topic.try(:blurb))
     end
   end
 
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 27a18f7..a73585e 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 class SearchIndexer
-  POST_INDEX_VERSION = 3
+  POST_INDEX_VERSION = 4
   MIN_POST_REINDEX_VERSION = 3
   TOPIC_INDEX_VERSION = 3
   CATEGORY_INDEX_VERSION = 3
@@ -39,8 +39,6 @@ class SearchIndexer
       setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
     SQL
 
-    indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
-
     ranked_params = {
       a: search_data[0],
       b: search_data[1],
@@ -48,6 +46,13 @@ class SearchIndexer
       d: search_data[3],
     }
 
+    indexed_data =
+      if table.to_s == "post"
+        ranked_params[:d]
+      else
+        search_data.select { |d| d.length > 0 }.join(' ')
+      end
+
     tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
     additional_lexemes = []
 
@@ -105,7 +110,7 @@ class SearchIndexer
     scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
 
     # a bit inconsitent that we use title as A and body as B when in
-    # the post index body is C
+    # the post index body is D
     update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
   end
 
@@ -165,9 +170,11 @@ class SearchIndexer
     end
 
     category_name = topic.category&.name if topic
+
     if topic
-      tags = topic.tags.select(:id, :name)
-      unless tags.empty?
+      tags = topic.tags.select(:id, :name).to_a
+
+      if tags.present?
         tag_names = (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(' ')
       end
     end
diff --git a/lib/search.rb b/lib/search.rb
index 0caf6e3..02462cd 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -1128,7 +1128,7 @@ class Search
   end
 
   def posts_eager_loads(query)
-    query = query.includes(:user)
+    query = query.includes(:user, :post_search_data)
     topic_eager_loads = [:category]
 
     if SiteSetting.tagging_enabled
diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb
index 51413dc..b6059f0 100644
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@@ -58,7 +58,19 @@ class Search
     end
 
     def blurb(post)
-      GroupedSearchResults.blurb_for(post.cooked, @blurb_term, @blurb_length)
+      opts = {
+        term: @blurb_term,
+        blurb_length: @blurb_length
+      }
+
+      if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
+        opts[:cooked] = post.post_search_data.raw_data
+        opts[:scrub] = false
+      else
+        opts[:cooked] = post.cooked
+      end
+
+      GroupedSearchResults.blurb_for(**opts)
     end
 
     def add(object)
@@ -73,9 +85,9 @@ class Search
       end
     end
 
-    def self.blurb_for(cooked, term = nil, blurb_length = BLURB_LENGTH)
+    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
       blurb = nil
-      cooked = SearchIndexer.scrub_html_for_search(cooked)
+      cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
 
       urls = Set.new
       cooked.scan(URI.regexp(%w{http https})) { urls << $& }
diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb
index f8c6fae..e37367b 100644
--- a/spec/lib/search_spec.rb
+++ b/spec/lib/search_spec.rb
@@ -38,7 +38,7 @@ describe Search do
 
         link to a video file: https://somesite.com/content/somethingelse.MOV
       RAW
-      result = Search::GroupedSearchResults.blurb_for(cooked)
+      result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
       expect(result).to eq("link to an external page: https://google.com/?u=bar link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")}")
     end
 
@@ -51,7 +51,7 @@ describe Search do
         http://localhost/uploads/default/original/1X/90adc0092b30c04b761541bc0322d0dce3d896e7.m4a
       RAW
 
-      result = Search::GroupedSearchResults.blurb_for(cooked)
+      result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
       expect(result).to eq("Here goes a test cooked with enough characters to hit the blurb limit. Something is very interesting about this audio file. #{I18n.t("search.audio")}")
     end
 
@@ -59,7 +59,7 @@ describe Search do
       cooked = <<~RAW
         invalid URL: http:error] should not trip up blurb generation.
       RAW
-      result = Search::GroupedSearchResults.blurb_for(cooked)
+      result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
       expect(result).to eq("invalid URL: http:error] should not trip up blurb generation.")
     end
   end
diff --git a/spec/requests/search_controller_spec.rb b/spec/requests/search_controller_spec.rb
index c947528..8259268 100644
--- a/spec/requests/search_controller_spec.rb
+++ b/spec/requests/search_controller_spec.rb
@@ -3,10 +3,22 @@
 require 'rails_helper'
 
 describe SearchController do
+  fab!(:awesome_topic) do
+    topic = Fabricate(:topic)
+    tag = Fabricate(:tag)
+    topic.tags << tag
+    Fabricate(:tag, target_tag_id: tag.id)
+    topic
+  end
 
   fab!(:awesome_post) do
     SearchIndexer.enable
-    Fabricate(:post, raw: 'this is my really awesome post')
+    Fabricate(:post, topic: awesome_topic, raw: 'this is my really awesome post')
+  end
+
+  fab!(:awesome_post_2) do
+    SearchIndexer.enable
+    Fabricate(:post, raw: 'this is my really awesome post 2')
   end
 
   fab!(:user) do
@@ -95,10 +107,14 @@ describe SearchController do
 
       data = response.parsed_body
 
-      expect(data['posts'].length).to eq(1)
-      expect(data['posts'][0]['id']).to eq(awesome_post.id)
-      expect(data['posts'][0]['blurb']).to eq(awesome_post.raw)
-      expect(data['topics'][0]['id']).to eq(awesome_post.topic_id)
+      expect(data['posts'].length).to eq(2)
+      expect(data['posts'][0]['id']).to eq(awesome_post_2.id)
+      expect(data['posts'][0]['blurb']).to eq(awesome_post_2.raw)
+      expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id)
+
+      expect(data['posts'][1]['id']).to eq(awesome_post.id)
+      expect(data['posts'][1]['blurb']).to eq(awesome_post.raw)
+      expect(data['topics'][1]['id']).to eq(awesome_post.topic_id)
     end
 
     it "can search correctly with advanced search filters" do
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index d23c171..b814949 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -20,12 +20,13 @@ describe SearchIndexer do
   it 'correctly indexes chinese' do
     SiteSetting.default_locale = 'zh_CN'
     data = "你好世界"
-    expect(data.split(" ").length).to eq(1)
 
-    SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil)
+    SearchIndexer.update_posts_index(post_id, "", "", "", data)
 
-    raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
-    expect(raw_data.split(' ').length).to eq(2)
+    post_search_data = PostSearchData.find_by(post_id: post_id)
+
+    expect(post_search_data.raw_data).to eq("你好 世界")
+    expect(post_search_data.search_data).to eq("'世界':2 '你好':1")
   end
 
   it 'extract youtube title' do
@@ -104,11 +105,6 @@ describe SearchIndexer do
     expect(raw_data).to eq("This is a test")
     expect(locale).to eq(SiteSetting.default_locale)

[... diff too long, it was truncated ...]

GitHub sha: 181c4eb7

1 Like

This commit appears in #10249 which was merged by tgxworld.