FIX: Follow the canonical URL when importing a remote topic. (#14489)

FIX: Follow the canonical URL when importing a remote topic. (#14489)

FinalDestination now supports the follow_canonical option, which will perform an initial GET request, parse the canonical link if present, and perform a HEAD request to it.

We use this mode during embeds to avoid treating URLs with different query parameters as different topics.

diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb
index 61554f6..1e9f06a 100644
--- a/app/models/topic_embed.rb
+++ b/app/models/topic_embed.rb
@@ -113,7 +113,8 @@ class TopicEmbed < ActiveRecord::Base
     fd = FinalDestination.new(
       url,
       validate_uri: true,
-      max_redirects: 5
+      max_redirects: 5,
+      follow_canonical: true,
     )
 
     url = fd.resolve
diff --git a/lib/final_destination.rb b/lib/final_destination.rb
index 3831abf..e380547 100644
--- a/lib/final_destination.rb
+++ b/lib/final_destination.rb
@@ -63,7 +63,8 @@ class FinalDestination
     end
 
     @status = :ready
-    @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
+    @follow_canonical = @opts[:follow_canonical]
+    @http_verb = http_verb(@force_get_hosts, @follow_canonical)
     @cookie = nil
     @limited_ips = []
     @verbose = @opts[:verbose] || false
@@ -77,6 +78,14 @@ class FinalDestination
     20
   end
 
+  def http_verb(force_get_hosts, follow_canonical)
+    if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
+      :get
+    else
+      :head
+    end
+  end
+
   def timeout
     @timeout || FinalDestination.connection_timeout
   end
@@ -215,6 +224,18 @@ class FinalDestination
         end
       end
 
+      if @follow_canonical
+        next_url = uri(fetch_canonical_url(response.body))
+
+        if next_url.to_s.present? && next_url != @uri
+          @follow_canonical = false
+          @uri = next_url
+          @http_verb = http_verb(@force_get_hosts, @follow_canonical)
+
+          return resolve
+        end
+      end
+
       @content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type')
       @status = :resolved
       return @uri
@@ -458,4 +479,12 @@ class FinalDestination
     end
   end
 
+  def fetch_canonical_url(body)
+    return if body.blank?
+    canonical_link = Nokogiri::HTML5(body).at("link[rel='canonical']")
+
+    return if canonical_link.nil?
+
+    canonical_link['href']
+  end
 end
diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb
index 8c626eb..8b78132 100644
--- a/spec/components/final_destination_spec.rb
+++ b/spec/components/final_destination_spec.rb
@@ -49,6 +49,13 @@ describe FinalDestination do
     }
   end
 
+  def canonical_follow(from, dest)
+    stub_request(:get, from).to_return(
+      status: 200,
+      body: "<head><link rel=\"canonical\" href=\"#{dest}\"></head>"
+    )
+  end
+
   def redirect_response(from, dest)
     stub_request(:head, from).to_return(
       status: 302,
@@ -175,6 +182,39 @@ describe FinalDestination do
       end
     end
 
+    context 'follows canonical links' do
+      it 'resolves the canonical link as the final destination' do
+        canonical_follow("https://eviltrout.com", "https://codinghorror.com/blog")
+        stub_request(:head, "https://codinghorror.com/blog").to_return(doc_response)
+
+        final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
+
+        expect(final.resolve.to_s).to eq("https://codinghorror.com/blog")
+        expect(final.redirected?).to eq(false)
+        expect(final.status).to eq(:resolved)
+      end
+
+      it "does not follow the canonical link if it's the same as the current URL" do
+        canonical_follow("https://eviltrout.com", "https://eviltrout.com")
+
+        final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
+
+        expect(final.resolve.to_s).to eq("https://eviltrout.com")
+        expect(final.redirected?).to eq(false)
+        expect(final.status).to eq(:resolved)
+      end
+
+      it "does not follow the canonical link if it's invalid" do
+        canonical_follow("https://eviltrout.com", "")
+
+        final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
+
+        expect(final.resolve.to_s).to eq("https://eviltrout.com")
+        expect(final.redirected?).to eq(false)
+        expect(final.status).to eq(:resolved)
+      end
+    end
+
     context "GET can be forced" do
       before do
         stub_request(:head, 'https://force.get.com/posts?page=4')
diff --git a/spec/models/topic_embed_spec.rb b/spec/models/topic_embed_spec.rb
index 694ad88..8659eb7 100644
--- a/spec/models/topic_embed_spec.rb
+++ b/spec/models/topic_embed_spec.rb
@@ -190,18 +190,20 @@ describe TopicEmbed do
   end
 
   describe '.find_remote' do
+    fab!(:embeddable_host) { Fabricate(:embeddable_host) }
+    let(:file) { StringIO.new }
 
-    context ".title_scrub" do
+    before do
+      TopicEmbed.stubs(:open).returns file
+    end
 
+    context ".title_scrub" do
       let(:url) { 'http://eviltrout.com/123' }
       let(:contents) { "<title>Through the Looking Glass - Classic Books</title><body>some content here</body>" }
-      fab!(:embeddable_host) { Fabricate(:embeddable_host) }
-      let!(:file) { StringIO.new }
 
       before do
         file.stubs(:read).returns contents
-        TopicEmbed.stubs(:open).returns file
-        stub_request(:head, url)
+        stub_request(:get, url)
       end
 
       it "doesn't scrub the title by default" do
@@ -214,44 +216,38 @@ describe TopicEmbed do
         response = TopicEmbed.find_remote(url)
         expect(response.title).to eq("Through the Looking Glass")
       end
-
     end
 
     context 'post with allowed classes "foo" and "emoji"' do
       fab!(:user) { Fabricate(:user) }
       let(:url) { 'http://eviltrout.com/123' }
       let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
-      fab!(:embeddable_host) { Fabricate(:embeddable_host) }
-      let!(:file) { StringIO.new }
-
-      response = nil
 
       before do
         SiteSetting.allowed_embed_classnames = 'emoji, foo'
         file.stubs(:read).returns contents
-        TopicEmbed.stubs(:open).returns file
-        stub_request(:head, url)
-        response = TopicEmbed.find_remote(url)
+        stub_request(:get, url)
+        @response = TopicEmbed.find_remote(url)
       end
 
       it "has no author tag" do
-        expect(response.author).to be_blank
+        expect(@response.author).to be_blank
       end
 
       it 'img node has emoji class' do
-        expect(response.body).to have_tag('img', with: { class: 'emoji' })
+        expect(@response.body).to have_tag('img', with: { class: 'emoji' })
       end
 
       it 'img node has foo class' do
-        expect(response.body).to have_tag('img', with: { class: 'foo' })
+        expect(@response.body).to have_tag('img', with: { class: 'foo' })
       end
 
       it 'p node has foo class' do
-        expect(response.body).to have_tag('p', with: { class: 'foo' })
+        expect(@response.body).to have_tag('p', with: { class: 'foo' })
       end
 
       it 'nodes removes classes other than emoji' do
-        expect(response.body).to have_tag('img', without: { class: 'other' })
+        expect(@response.body).to have_tag('img', without: { class: 'other' })
       end
     end
 
@@ -259,67 +255,56 @@ describe TopicEmbed do
       fab!(:user) { Fabricate(:user, username: 'eviltrout') }
       let(:url) { 'http://eviltrout.com/321' }
       let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' }
-      fab!(:embeddable_host) { Fabricate(:embeddable_host) }
-      let!(:file) { StringIO.new }
-
-      response = nil
 
       before(:each) do
         file.stubs(:read).returns contents
         TopicEmbed.stubs(:open).returns file
-        stub_request(:head, url)
-        response = TopicEmbed.find_remote(url)
+        stub_request(:get, url)
       end
 
       it "has no author tag" do
+        response = TopicEmbed.find_remote(url)
+
         expect(response.author).to eq(user)
       end
     end
 
     context 'post with no allowed classes' do
-

[... diff too long, it was truncated ...]

GitHub sha: 4c2d5158c58f04c2043ca90f1761a70717792ffc

This commit appears in #14489 which was approved by eviltrout. It was merged by romanrizzi.