FIX: remove superfluous spaces from CJK blurbs (#12629)

FIX: remove superfluous spaces from CJK blurbs (#12629)

Previously, we generated blurbs from the raw indexed search data even when the text was Chinese, Japanese, or Korean (CJK).

This caused superfluous spaces to show up in excerpts.

diff --git a/lib/search.rb b/lib/search.rb
index 2370877..aa7b1ca 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -64,6 +64,11 @@ class Search
     end
   end
 
+  def self.segment_cjk?
+    ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) ||
+      SiteSetting.search_tokenize_chinese_japanese_korean
+  end
+
   def self.prepare_data(search_data, purpose = :query)
     purpose ||= :query
 
@@ -73,7 +78,7 @@ class Search
       # TODO cppjieba_rb is designed for chinese, we need something else for Japanese
       # Korean appears to be safe cause words are already space seperated
       # For Japanese we should investigate using kakasi
-      if ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
+      if segment_cjk?
         require 'cppjieba_rb' unless defined? CppjiebaRb
         mode = (purpose == :query ? :query : :mix)
         data = CppjiebaRb.segment(search_data, mode: mode)
diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb
index 470a732..8182fe9 100644
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@@ -87,7 +87,7 @@ class Search
         blurb_length: @blurb_length
       }
 
-      if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
+      if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_cjk?
         if SiteSetting.use_pg_headlines_for_excerpt
           scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
           prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb
index be6e051..c6a52dc 100644
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@@ -1791,6 +1791,27 @@ describe Search do
     end
   end
 
+  context 'CJK segmentation' do
+    before do
+      SiteSetting.search_tokenize_chinese_japanese_korean = true
+      SiteSetting.min_search_term_length = 1
+    end
+
+    let!(:post1) do
+      Fabricate(:post, raw: '場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠')
+    end
+
+    it('does not include superflous spaces in blurbs') do
+
+      results = Search.execute('ういかせ竹域', type_filter: 'topic')
+      expect(results.posts.length).to eq(1)
+
+      expect(results.blurb(results.posts.first)).to include('ういかせ竹域')
+
+    end
+
+  end
+
   context 'include_diacritics' do
     before { SiteSetting.search_ignore_accents = false }
     let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }

GitHub sha: 5b342ae5

This commit appears in #12629, which was approved by CvX. It was merged by SamSaffron.