PERF: Replace video and audio links in search blurb while indexing.

PERF: Replace video and audio links in search blurb while indexing.

In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb.

diff --git a/app/mailers/group_smtp_mailer.rb b/app/mailers/group_smtp_mailer.rb
index 01ef9b5..15c3d0c 100644
--- a/app/mailers/group_smtp_mailer.rb
+++ b/app/mailers/group_smtp_mailer.rb
@@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
 
   def strip_secure_urls(raw)
     urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
 
     urls.each do |url|
       if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
diff --git a/app/mailers/user_notifications.rb b/app/mailers/user_notifications.rb
index 62525ee..aa700f5 100644
--- a/app/mailers/user_notifications.rb
+++ b/app/mailers/user_notifications.rb
@@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
 
   def strip_secure_urls(raw)
     urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
 
     urls.each do |url|
       if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index cb60237..bc9c7ec 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -46,13 +46,6 @@ class SearchIndexer
       d: search_data[3],
     }
 
-    indexed_data =
-      if table.to_s == "post"
-        ranked_params[:d]
-      else
-        search_data.select { |d| d.length > 0 }.join(' ')
-      end
-
     tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
     additional_lexemes = []
 
@@ -75,6 +68,13 @@ class SearchIndexer
 
     tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
 
+    indexed_data =
+      if table.to_s == "post"
+        clean_post_raw_data!(ranked_params[:d])
+      else
+        search_data.select { |d| d.length > 0 }.join(' ')
+      end
+
     params = {
       raw_data: indexed_data,
       id: id,
@@ -216,6 +216,26 @@ class SearchIndexer
     end
   end
 
+  def self.clean_post_raw_data!(raw_data)
+    urls = Set.new
+    raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+
+    urls.each do |url|
+      begin
+        case File.extname(URI(url).path || "")
+        when Oneboxer::VIDEO_REGEX
+          raw_data.gsub!(url, I18n.t("search.video"))
+        when Oneboxer::AUDIO_REGEX
+          raw_data.gsub!(url, I18n.t("search.audio"))
+        end
+      rescue URI::InvalidURIError
+      end
+    end
+
+    raw_data
+  end
+  private_class_method :clean_post_raw_data!
+
   class HtmlScrubber < Nokogiri::XML::SAX::Document
 
     attr_reader :scrubbed
diff --git a/lib/discourse.rb b/lib/discourse.rb
index 5d32882..2c2d8a8 100644
--- a/lib/discourse.rb
+++ b/lib/discourse.rb
@@ -24,6 +24,8 @@ module Discourse
   end
 
   class Utils
+    URI_REGEXP = URI.regexp(%w{http https})
+
     # Usage:
     #   Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
     # or with a block
diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb
index 1626066..ba112b5 100644
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@@ -106,23 +106,24 @@ class Search
       end
     end
 
-    URI_REGEXP = URI.regexp(%w{http https})
-
     def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
       blurb = nil
-      cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
-
-      urls = Set.new
-      cooked.scan(URI_REGEXP) { urls << $& }
-      urls.each do |url|
-        begin
-          case File.extname(URI(url).path || "")
-          when Oneboxer::VIDEO_REGEX
-            cooked.gsub!(url, I18n.t("search.video"))
-          when Oneboxer::AUDIO_REGEX
-            cooked.gsub!(url, I18n.t("search.audio"))
+
+      if scrub
+        cooked = SearchIndexer.scrub_html_for_search(cooked)
+
+        urls = Set.new
+        cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+        urls.each do |url|
+          begin
+            case File.extname(URI(url).path || "")
+            when Oneboxer::VIDEO_REGEX
+              cooked.gsub!(url, I18n.t("search.video"))
+            when Oneboxer::AUDIO_REGEX
+              cooked.gsub!(url, I18n.t("search.audio"))
+            end
+          rescue URI::InvalidURIError
           end
-        rescue URI::InvalidURIError
         end
       end
 
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index f55b183..c359c32 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -209,6 +209,29 @@ describe SearchIndexer do
         "Let me see how I can fix this image white walkers GOT"
       )
     end
+
+    it 'should strips audio and videos URLs from raw data' do
+      SiteSetting.authorized_extensions = 'mp4'
+      upload = Fabricate(:video_upload)
+
+      post.update!(raw: <<~RAW)
+      link to an external page: https://google.com/?u=bar
+
+      link to an audio file: https://somesite.com/audio.m4a
+
+      link to a video file: https://somesite.com/content/somethingelse.MOV
+
+      link to an invalid URL: http:error]
+      RAW
+
+      expect(post.post_search_data.raw_data).to eq(
+        "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
+      )
+
+      expect(post.post_search_data.search_data).to eq(
+        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
+      )
+    end
   end
 
   describe '.queue_post_reindex' do

GitHub sha: 255b0e9f