PERF: Limit characters used to generate headline for search blurb.

PERF: Limit characters used to generate headline for search blurb.

We determined, using the following benchmark script, that limiting the input text to 2500 characters would mean a maximum of 25ms spent generating headlines.

require 'benchmark/ips'

# Sample document to excerpt; it contains the search term 'mountains'.
# NOTE(review): the text appears abbreviated here ("..."); the benchmark below
# slices up to 50000 characters, so the original was presumably much longer.
string = <<~STRING
Far far away, behind the word mountains...
STRING

# Generates a headline for `string` with PostgreSQL's TS_HEADLINE, searching
# for 'mountains' within the first `l` characters of the document.
#
# `string` is interpolated directly into the SQL text, so it must not contain
# single quotes; that is only acceptable because this is a throwaway benchmark.
#
# Returns whatever DB.query_single returns for the query.
def sql_excerpt(string, l = 1_000_000)
  headline_sql = <<~SQL
    SELECT TS_HEADLINE('english', left('#{string}', #{l}), PLAINTO_TSQUERY('mountains'))
  SQL

  DB.query_single(headline_sql)
end

# Generates an excerpt for `string` in Ruby instead of in the database.
#
# The text is still round-tripped through the database with a trivial SELECT
# (first value of the result) before being handed to
# Search::GroupedSearchResults::TextHelper.excerpt, which looks for
# 'mountains' with a radius of 100.
def ruby_excerpt(string)
  document = DB.query_single("SELECT '#{string}'").first

  Search::GroupedSearchResults::TextHelper.excerpt(
    document,
    'mountains',
    radius: 100
  )
end

# Print each excerpt once before benchmarking so the two outputs can be
# compared by eye, followed by a blank separator line.
puts "Ruby Excerpt: #{ruby_excerpt(string)}"
puts "SQL Excerpt: #{sql_excerpt(string)}"
puts

# Benchmark the PostgreSQL TS_HEADLINE excerpt against the Ruby TextHelper
# excerpt for increasing document lengths.
Benchmark.ips do |x|
  x.time = 10

  [1000, 2500, 5000, 10000, 20000, 50000].each do |l|
    # Take exactly `l` characters. An inclusive range (`string[0..l]`) would
    # hand `l + 1` characters to the Ruby excerpt, while the SQL side is capped
    # at `l` by LEFT(), making the two reports slightly unequal.
    short_string = string[0, l]

    x.report("ts_headline excerpt #{l}") do
      sql_excerpt(short_string, l)
    end

    x.report("actionview excerpt #{l}") do
      ruby_excerpt(short_string)
    end
  end

  # Print the reports ranked against the fastest one.
  x.compare!
end
actionview excerpt 1000:    20570.7 i/s
actionview excerpt 2500:    17863.1 i/s - 1.15x  (± 0.00) slower
actionview excerpt 5000:    14228.9 i/s - 1.45x  (± 0.00) slower
actionview excerpt 10000:    10906.2 i/s - 1.89x  (± 0.00) slower
actionview excerpt 20000:     6255.0 i/s - 3.29x  (± 0.00) slower
ts_headline excerpt 1000:     4337.5 i/s - 4.74x  (± 0.00) slower
actionview excerpt 50000:     3222.7 i/s - 6.38x  (± 0.00) slower
ts_headline excerpt 2500:     2240.4 i/s - 9.18x  (± 0.00) slower
ts_headline excerpt 5000:     1258.7 i/s - 16.34x  (± 0.00) slower
ts_headline excerpt 10000:      667.2 i/s - 30.83x  (± 0.00) slower
ts_headline excerpt 20000:      348.7 i/s - 58.98x  (± 0.00) slower
ts_headline excerpt 50000:      131.9 i/s - 155.91x  (± 0.00) slower
diff --git a/lib/search.rb b/lib/search.rb
index fe19f17..16b813d 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -1164,6 +1164,10 @@ class Search
     query.includes(topic: topic_eager_loads)
   end
 
+  # Maximum number of characters of `pd.raw_data` passed to `TS_HEADLINE`.
+  # Capped for performance: `TS_HEADLINE` is slow when the text document is long.
+  MAX_LENGTH_FOR_HEADLINE = 2500
+
   def posts_scope(default_scope = Post.all)
     if SiteSetting.use_pg_headlines_for_excerpt
       search_term = @term.present? ? PG::Connection.escape_string(@term) : nil
@@ -1174,7 +1178,7 @@ class Search
         .joins("INNER JOIN topics t1 ON t1.id = posts.topic_id")
         .select(
           "TS_HEADLINE(#{ts_config}, t1.fancy_title, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS topic_title_headline",
-          "TS_HEADLINE(#{ts_config}, pd.raw_data, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
+          "TS_HEADLINE(#{ts_config}, LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}), PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
           default_scope.arel.projections
         )
     else
diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb
index 85c0183..2588bc2 100644
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@@ -429,6 +429,20 @@ describe Search do
       expect(post.topic_title_headline).to eq(topic.fancy_title)
     end
 
+    it "limits the text used to generate the headline to #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do
+      SiteSetting.use_pg_headlines_for_excerpt = true
+
+      # Prefix the post with a filler word as long as the limit so the post's
+      # original text (presumably containing 'elephant') falls past the cutoff.
+      reply.update!(raw: "#{'a' * Search::MAX_LENGTH_FOR_HEADLINE} #{reply.raw}")
+
+      result = Search.execute('elephant')
+
+      # The post is still found, but the term cannot appear in the truncated headline.
+      expect(result.posts.map(&:id)).to contain_exactly(reply.id)
+      expect(result.posts.first.headline).not_to include('elephant')
+    end
+
     it 'returns the right post and blurb for searches with phrase' do
       SiteSetting.use_pg_headlines_for_excerpt = true
 

GitHub sha: 053cbe31