FIX: search indexer had various cases where it could fail

FIX: search indexer had various cases where it could fail

Prior to this fix, if a post had the text www.test.com/abc it would fail to index.

This also simplifies the rules to avoid full URL parsing, which can be expensive.

diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 08df177..4ea9b7b 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -21,16 +21,13 @@ class SearchIndexer
     # insert some extra words for I.am.a.word so "word" is tokenized
     # I.am.a.word becomes I.am.a.word am a word
     raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
-      if with_dot.match?(PlainTextToMarkdown::URL_REGEX)
-        "#{with_dot} #{URI.parse(with_dot).hostname.gsub('.', ' ')}"
-      else
-        split = with_dot.split(".")
 
-        if split.length > 1
-          with_dot + ((+" ") << split[1..-1].join(" "))
-        else
-          with_dot
-        end
+      split = with_dot.split(/https?:\/\/|[?:;,.\/]/)
+
+      if split.length > 1
+        with_dot + ((+" ") << split[1..-1].reject { |x| x.blank? }.join(" "))
+      else
+        with_dot
       end
     end
   end
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index b92f342..f8823a9 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -17,6 +17,16 @@ describe SearchIndexer do
     SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
   end
 
+  it 'can correctly inject if http or https links exist' do
+
+    val = "a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1"
+    result = SearchIndexer.inject_extra_terms(val)
+
+    expected = "a https://cnn.com?bob=1, cnn com bob=1 http://stuff.com.au?bill=1 stuff com au bill=1 b abc.net/xyz=1 net xyz=1"
+
+    expect(result).to eq(expected)
+  end
+
   it 'correctly indexes chinese' do
     SiteSetting.default_locale = 'zh_CN'
     data = "你好世界"
@@ -141,7 +151,7 @@ describe SearchIndexer do
       topic = post.topic
 
       expect(post.post_search_data.raw_data).to eq(
-        "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org"
+        "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org some png"
       )
     end

GitHub sha: 6428aa5b

1 Like