FIX: Make HTML scrubber work with deep HTML (#12619)

FIX: Make HTML scrubber work with deep HTML (#12619)

SearchIndexer and ReindexSearch used to raise an ArgumentError for posts with very deep or invalid HTML content. The scrubber now rescues that error and returns an empty string for such posts.

diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 12bdcbc..45d4239 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -284,7 +284,11 @@ class SearchIndexer
     def self.scrub(html, strip_diacritics: false)
       return +"" if html.blank?
 
-      document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
+      begin
+        document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
+      rescue ArgumentError
+        return +""
+      end
 
       nodes = document.css(
         "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index 936dca0..659ef8c 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -144,6 +144,19 @@ describe SearchIndexer do
         .to change { post.reload.post_search_data.search_data }
     end
 
+    it 'should work with invalid HTML' do
+      post.update!(cooked: "<FD>" * Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
+
+      SearchIndexer.update_posts_index(
+        post_id: post.id,
+        topic_title: post.topic.title,
+        category_name: post.topic.category&.name,
+        topic_tags: post.topic.tags.map(&:name).join(' '),
+        cooked: post.cooked,
+        private_message: post.topic.private_message?
+      )
+    end
+
     it 'should not index posts with empty raw' do
       expect do
         post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])

GitHub sha: c10df4b5

1 Like

This commit appears in #12619, which was approved by pmusaraj. It was merged by SamSaffron.