FEATURE: Nokogumbo (#9577)

FEATURE: Nokogumbo (#9577)

  • FEATURE: Nokogumbo

Use the Nokogumbo HTML5 parser (`Nokogiri::HTML5`) in place of the `Nokogiri::HTML` parser.

diff --git a/app/helpers/user_notifications_helper.rb b/app/helpers/user_notifications_helper.rb
index eb02931..ffbf735 100644
--- a/app/helpers/user_notifications_helper.rb
+++ b/app/helpers/user_notifications_helper.rb
@@ -13,7 +13,7 @@ module UserNotificationsHelper
   end
 
   def correct_top_margin(html, desired)
-    fragment = Nokogiri::HTML.fragment(html)
+    fragment = Nokogiri::HTML5.fragment(html)
     if para = fragment.css("p:first").first
       para["style"] = "margin-top: #{desired};"
     end
@@ -32,7 +32,7 @@ module UserNotificationsHelper
   end
 
   def first_paragraphs_from(html)
-    doc = Nokogiri::HTML(html)
+    doc = Nokogiri::HTML5(html)
 
     result = +""
     length = 0
diff --git a/app/jobs/onceoff/grant_emoji.rb b/app/jobs/onceoff/grant_emoji.rb
index 5f85b43..5abdb34 100644
--- a/app/jobs/onceoff/grant_emoji.rb
+++ b/app/jobs/onceoff/grant_emoji.rb
@@ -14,7 +14,7 @@ module Jobs
         .where("cooked LIKE '%emoji%'")
         .find_in_batches do |group|
         group.each do |p|
-          doc = Nokogiri::HTML::fragment(p.cooked)
+          doc = Nokogiri::HTML5::fragment(p.cooked)
           if (doc.css("img.emoji") - doc.css(".quote img")).size > 0
             to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at }
           end
diff --git a/app/jobs/onceoff/grant_onebox.rb b/app/jobs/onceoff/grant_onebox.rb
index 59cf443..66d2cf2 100644
--- a/app/jobs/onceoff/grant_onebox.rb
+++ b/app/jobs/onceoff/grant_onebox.rb
@@ -19,7 +19,7 @@ module Jobs
           begin
             # Note we can't use `p.cooked` here because oneboxes have been cooked out
             cooked = PrettyText.cook(p.raw)
-            doc = Nokogiri::HTML::fragment(cooked)
+            doc = Nokogiri::HTML5::fragment(cooked)
             if doc.search('a.onebox').size > 0
               to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at }
             end
diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb
index 2570365..e7644e8 100644
--- a/app/jobs/regular/pull_hotlinked_images.rb
+++ b/app/jobs/regular/pull_hotlinked_images.rb
@@ -157,7 +157,7 @@ module Jobs
     end
 
     def extract_images_from(html)
-      doc = Nokogiri::HTML::fragment(html)
+      doc = Nokogiri::HTML5::fragment(html)
 
       doc.css("img[src], a.lightbox[href]") -
         doc.css("img.avatar") -
diff --git a/app/jobs/regular/update_username.rb b/app/jobs/regular/update_username.rb
index d43c119..7c9fced 100644
--- a/app/jobs/regular/update_username.rb
+++ b/app/jobs/regular/update_username.rb
@@ -154,11 +154,11 @@ module Jobs
     # and there is no reason to invalidate oneboxes, run the post analyzer etc.
     # when only the username changes.
     def update_cooked(cooked)
-      doc = Nokogiri::HTML.fragment(cooked)
+      doc = Nokogiri::HTML5.fragment(cooked)
 
       doc.css("a.mention").each do |a|
         a.content = a.content.gsub(@cooked_mention_username_regex, "@#{@new_username}")
-        a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{@new_username}") if a["href"]
+        a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{URI.escape(@new_username)}") if a["href"]
       end
 
       doc.css("aside.quote").each do |aside|
diff --git a/app/models/category.rb b/app/models/category.rb
index bbf1e9b..ddab403 100644
--- a/app/models/category.rb
+++ b/app/models/category.rb
@@ -306,7 +306,7 @@ class Category < ActiveRecord::Base
 
     @@cache_text ||= LruRedux::ThreadSafeCache.new(1000)
     @@cache_text.getset(self.description) do
-      text = Nokogiri::HTML.fragment(self.description).text.strip
+      text = Nokogiri::HTML5.fragment(self.description).text.strip
       Rack::Utils.escape_html(text).html_safe
     end
   end
diff --git a/app/models/post.rb b/app/models/post.rb
index 5da5d16..f2ee105 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -953,7 +953,7 @@ class Post < ActiveRecord::Base
       /\/uploads\/short-url\/[a-zA-Z0-9]+(\.[a-z0-9]+)?/
     ]
 
-    fragments ||= Nokogiri::HTML::fragment(self.cooked)
+    fragments ||= Nokogiri::HTML5::fragment(self.cooked)
     selectors = fragments.css("a/@href", "img/@src", "source/@src", "track/@src", "video/@poster")
 
     links = selectors.map do |media|
diff --git a/app/models/post_analyzer.rb b/app/models/post_analyzer.rb
index 63fe972..bae36c3 100644
--- a/app/models/post_analyzer.rb
+++ b/app/models/post_analyzer.rb
@@ -131,7 +131,7 @@ class PostAnalyzer
 
   def cooked_stripped
     @cooked_stripped ||= begin
-      doc = Nokogiri::HTML.fragment(cook(@raw, topic_id: @topic_id))
+      doc = Nokogiri::HTML5.fragment(cook(@raw, topic_id: @topic_id))
       doc.css("pre .mention, aside.quote > .title, aside.quote .mention, aside.quote .mention-group, .onebox, .elided").remove
       doc
     end
diff --git a/app/models/quoted_post.rb b/app/models/quoted_post.rb
index 03b981e..9a6a96e 100644
--- a/app/models/quoted_post.rb
+++ b/app/models/quoted_post.rb
@@ -9,7 +9,7 @@ class QuotedPost < ActiveRecord::Base
   #  we are double parsing this fragment, this may be worth optimising later
   def self.extract_from(post)
 
-    doc = Nokogiri::HTML.fragment(post.cooked)
+    doc = Nokogiri::HTML5.fragment(post.cooked)
 
     uniq = {}
 
diff --git a/app/models/theme_field.rb b/app/models/theme_field.rb
index 351c7b9..a9f98ce 100644
--- a/app/models/theme_field.rb
+++ b/app/models/theme_field.rb
@@ -78,7 +78,7 @@ class ThemeField < ActiveRecord::Base
 
     js_compiler = ThemeJavascriptCompiler.new(theme_id, self.theme.name)
 
-    doc = Nokogiri::HTML.fragment(html)
+    doc = Nokogiri::HTML5.fragment(html)
 
     doc.css('script[type="text/x-handlebars"]').each do |node|
       name = node["name"] || node["data-template-name"] || "broken"
diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb
index 43de234..a18e5b8 100644
--- a/app/models/topic_embed.rb
+++ b/app/models/topic_embed.rb
@@ -126,7 +126,7 @@ class TopicEmbed < ActiveRecord::Base
       return
     end
 
-    raw_doc = Nokogiri::HTML(html)
+    raw_doc = Nokogiri::HTML5(html)
     auth_element = raw_doc.at('meta[@name="author"]')
     if auth_element.present?
       response.author = User.where(username_lower: auth_element[:content].strip).first
@@ -142,7 +142,7 @@ class TopicEmbed < ActiveRecord::Base
       title.strip!
     end
     response.title = title
-    doc = Nokogiri::HTML(read_doc.content)
+    doc = Nokogiri::HTML5(read_doc.content)
 
     tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
     doc.search(tags.keys.join(',')).each do |node|
@@ -198,7 +198,7 @@ class TopicEmbed < ActiveRecord::Base
     prefix = "#{uri.scheme}://#{uri.host}"
     prefix += ":#{uri.port}" if uri.port != 80 && uri.port != 443
 
-    fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")
+    fragment = Nokogiri::HTML5.fragment("<div>#{contents}</div>")
     fragment.css('a').each do |a|
       href = a['href']
       if href.present? && href.start_with?('/')
@@ -220,7 +220,7 @@ class TopicEmbed < ActiveRecord::Base
   end
 
   def self.first_paragraph_from(html)
-    doc = Nokogiri::HTML(html)
+    doc = Nokogiri::HTML5(html)
 
     result = +""
     doc.css('p').each do |p|
diff --git a/app/services/inline_uploads.rb b/app/services/inline_uploads.rb
index e75348b..0facc32 100644
--- a/app/services/inline_uploads.rb
+++ b/app/services/inline_uploads.rb
@@ -16,7 +16,7 @@ class InlineUploads
       end
     end
 
-    cooked_fragment = Nokogiri::HTML::fragment(PrettyText.cook(markdown, disable_emojis: true))
+    cooked_fragment = Nokogiri::HTML5::fragment(PrettyText.cook(markdown, disable_emojis: true))
     link_occurences = []
 
     cooked_fragment.traverse do |node|
@@ -183,7 +183,7 @@ class InlineUploads
 
   def self.match_anchor(markdown, external_href: false)
     markdown.scan(/((<a[^<]+>)([^<\a>]*?)<\/a>)/i) do |match|
-      node = Nokogiri::HTML::fragment(match[0]).children[0]

[... diff too long, it was truncated ...]

GitHub sha: 9bff0882

This commit appears in #9577, which was merged by lis2.

@lis2, do you remember why this changed?

1 Like