FIX: Inject extra lexemes for host lexeme.

discourse_development=# SELECT alias, lexemes FROM TS_DEBUG('www.discourse.org');
 alias |       lexemes
-------+---------------------
 host  | {www.discourse.org}

discourse_development=# SELECT TO_TSVECTOR('www.discourse.org');
      to_tsvector
-----------------------
 'www.discourse.org':1
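
A quick way to see the problem: because the whole host is a single lexeme, a query for just one of its parts matches nothing. A minimal check (a sketch, not part of the patch; it reuses the same DB.query_single helper the patch uses and assumes a Rails console on a Discourse install):

# 'discourse' is stemmed to 'discours' by PLAINTO_TSQUERY, and the vector
# only contains the unsplit host token, so the match fails.
DB.query_single(
  "SELECT TO_TSVECTOR('www.discourse.org') @@ PLAINTO_TSQUERY('discourse')"
)
# => [false]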

Given the single host lexeme above, we inject additional lexemes by splitting the host on ".". The tsvector actually stored will look something like this:

                               tsvector
-----------------------------------------------------------------------
 'discourse':1 'discourse.org':1 'org':1 'www':1 'www.discourse.org':1
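
A minimal sketch of the expansion, mirroring the scan-and-partition loop in the patch below (plain Ruby: .empty? stands in for Rails' blank?, and the ten-iteration safeguard is left out for brevity):

tsvector = "'www.discourse.org':1"
additional_lexemes = []

# Find every dotted lexeme and its position list, then peel off one leading
# segment at a time, emitting both halves at the same positions.
tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/) do |lexeme, _, positions|
  loop do
    term, _, remaining = lexeme.partition(".")
    break if remaining.empty?
    additional_lexemes << "'#{term}':#{positions} '#{remaining}':#{positions}"
    lexeme = remaining
  end
end

puts "#{tsvector} #{additional_lexemes.join(' ')}"
# => 'www.discourse.org':1 'www':1 'discourse.org':1 'discourse':1 'org':1

Casting the combined string back to tsvector (as the patch does with (:tsvector)::tsvector) lets Postgres deduplicate and sort the lexemes, which is why the stored value above comes out in alphabetical order.
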
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 09193ff..aae80d4 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -16,26 +16,9 @@ class SearchIndexer
     HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
   end
 
-  def self.inject_extra_terms(raw)
-    return raw if !SiteSetting.search_inject_extra_terms
-
-    # insert some extra words for I.am.a.word so "word" is tokenized
-    # I.am.a.word becomes I.am.a.word am a word
-    raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
-
-      split = with_dot.split(/https?:\/\/|[?:;,.\/]/)
-
-      if split.length > 1
-        with_dot + ((+" ") << split[1..-1].reject { |x| x.blank? }.join(" "))
-      else
-        with_dot
-      end
-    end
-  end
-
   def self.update_index(table: , id: , raw_data:)
     search_data = raw_data.map do |data|
-      inject_extra_terms(Search.prepare_data(data || "", :index))
+      Search.prepare_data(data || "", :index)
     end
 
     table_name = "#{table}_search_data"
@@ -53,15 +36,39 @@ class SearchIndexer
 
     indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
 
-    params = {
+    ranked_params = {
       a: search_data[0],
       b: search_data[1],
       c: search_data[2],
       d: search_data[3],
+    }
+
+    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
+    additional_lexemes = []
+
+    tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
+      count = 0
+
+      loop do
+        count += 1
+        break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
+        term, _, remaining = lexeme.partition(".")
+        break if remaining.blank?
+        array << "'#{term}':#{positions} '#{remaining}':#{positions}"
+        lexeme = remaining
+      end
+
+      array
+    end
+
+    tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
+
+    params = {
       raw_data: indexed_data,
       id: id,
       locale: SiteSetting.default_locale,
-      version: INDEX_VERSION
+      version: INDEX_VERSION,
+      tsvector: tsvector,
     }
 
     # Would be nice to use AR here but not sure how to execut Postgres functions
@@ -71,7 +78,7 @@ class SearchIndexer
        SET
           raw_data = :raw_data,
           locale = :locale,
-          search_data = #{ranked_index},
+          search_data = (:tsvector)::tsvector,
           version = :version
        WHERE #{foreign_key} = :id
     SQL
@@ -80,7 +87,7 @@ class SearchIndexer
       DB.exec(<<~SQL, params)
         INSERT INTO #{table_name}
         (#{foreign_key}, search_data, locale, raw_data, version)
-        VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
+        VALUES (:id, (:tsvector)::tsvector, :locale, :raw_data, :version)
       SQL
     end
   rescue
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 5b6938e..ccf27a4 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -379,7 +379,7 @@ login:
   discord_trusted_guilds:
     default: ""
     type: list
-  external_auth_skip_create_confirm: 
+  external_auth_skip_create_confirm:
     default: false
     client: true
   enable_sso:
@@ -1750,9 +1750,6 @@ search:
   search_ranking_normalization:
     default: '1'
     hidden: true
-  search_inject_extra_terms:
-    default: true
-    hidden: true
   min_search_term_length:
     client: true
     default: 3
diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb
index 06e5638..075043d 100644
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@@ -1255,20 +1255,26 @@ describe Search do
       ])
     end
 
-    it 'can tokenize dots' do
+    it 'can search for terms with dots' do
       post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...')
       expect(Search.execute('bill').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('bob').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('2000').posts.map(&:id)).to eq([post.id])
     end
 
     it 'can search URLS correctly' do
       post = Fabricate(:post, raw: 'i like http://wb.camra.org.uk/latest#test so yay')
+
       expect(Search.execute('http://wb.camra.org.uk/latest#test').posts.map(&:id)).to eq([post.id])
       expect(Search.execute('camra').posts.map(&:id)).to eq([post.id])
-
-      complex_url = "https://test.some.site.com/path?some.range_input=74235a"
-      post2 = Fabricate(:post, raw: "this is a complex url #{complex_url} so complex")
-
-      expect(Search.execute(complex_url).posts.map(&:id)).to eq([post2.id])
+      expect(Search.execute('http://wb').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('wb.camra').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('wb.camra.org').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('org.uk').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('camra.org.uk').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('wb.camra.org.uk').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('wb.camra.org.uk/latest').posts.map(&:id)).to eq([post.id])
+      expect(Search.execute('/latest#test').posts.map(&:id)).to eq([post.id])
     end
 
     it 'supports category slug and tags' do
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index f8823a9..027b660 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -17,16 +17,6 @@ describe SearchIndexer do
     SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
   end
 
-  it 'can correctly inject if http or https links exist' do
-
-    val = "a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1"
-    result = SearchIndexer.inject_extra_terms(val)
-
-    expected = "a https://cnn.com?bob=1, cnn com bob=1 http://stuff.com.au?bill=1 stuff com au bill=1 b abc.net/xyz=1 net xyz=1"
-
-    expect(result).to eq(expected)
-  end
-
   it 'correctly indexes chinese' do
     SiteSetting.default_locale = 'zh_CN'
     data = "你好世界"
@@ -151,7 +141,28 @@ describe SearchIndexer do
       topic = post.topic
 
       expect(post.post_search_data.raw_data).to eq(
-        "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org some png"
+        "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png"
+      )
+    end
+
+    it 'should tokenize host of a URL' do
+      category = Fabricate(:category, name: 'awesome category')
+      topic = Fabricate(:topic, category: category, title: 'this is a test topic')
+
+      post = Fabricate(:post, topic: topic, raw: <<~RAW)
+      a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1
+      RAW
+
+      post.rebake!
+      post.reload
+      topic = post.topic
+
+      expect(post.post_search_data.raw_data).to eq(
+        "#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1"
+      )
+
+      expect(post.post_search_data.search_data).to eq(
+        "'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A"
       )
     end
 

GitHub sha: 5c230266

This commit appears in #10198, which was merged by tgxworld.