FEATURE: Send a 'noindex' header in non-canonical responses (#15026)

FEATURE: Send a ‘noindex’ header in non-canonical responses (#15026)

  • FEATURE: Optionally send a ‘noindex’ header in non-canonical responses

This will be used in a SEO experiment.

Co-authored-by: David Taylor david@taylorhq.com

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 0561be4..8d14941 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -41,13 +41,14 @@ class ApplicationController < ActionController::Base
   before_action :redirect_to_login_if_required
   before_action :block_if_requires_login
   before_action :preload_json
-  before_action :add_noindex_header, if: -> { is_feed_request? || !SiteSetting.allow_index_in_robots_txt }
   before_action :check_xhr
   after_action  :add_readonly_header
   after_action  :perform_refresh_session
   after_action  :dont_cache_page
   after_action  :conditionally_allow_site_embedding
   after_action  :ensure_vary_header
+  after_action  :add_noindex_header, if: -> { is_feed_request? || !SiteSetting.allow_index_in_robots_txt }
+  after_action  :add_noindex_header_to_non_canonical, if: -> { request.get? && !(request.format && request.format.json?) && !request.xhr? }
 
   HONEYPOT_KEY ||= 'HONEYPOT_KEY'
   CHALLENGE_KEY ||= 'CHALLENGE_KEY'
@@ -912,6 +913,13 @@ class ApplicationController < ActionController::Base
     end
   end
 
+  def add_noindex_header_to_non_canonical
+    canonical = (@canonical_url || @default_canonical)
+    if canonical.present? && canonical != request.url && !SiteSetting.allow_indexing_non_canonical_urls
+      response.headers['X-Robots-Tag'] ||= 'noindex'
+    end
+  end
+
   protected
 
   def honeypot_value
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 5089815..d82c831 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1602,6 +1602,9 @@ security:
     regex: "^(Lax|Strict|Disabled|None)$"
   enable_escaped_fragments: true
   allow_index_in_robots_txt: true
+  allow_indexing_non_canonical_urls:
+    default: true
+    hidden: true
   moderators_manage_categories_and_groups: false
   moderators_change_post_ownership:
     client: true
diff --git a/lib/canonical_url.rb b/lib/canonical_url.rb
index d318bd5..df66c37 100644
--- a/lib/canonical_url.rb
+++ b/lib/canonical_url.rb
@@ -2,6 +2,8 @@
 
 module CanonicalURL
   module ControllerExtensions
+    ALLOWED_CANONICAL_PARAMS = %w(page)
+
     def canonical_url(url_for_options = {})
       case url_for_options
       when Hash
@@ -10,22 +12,27 @@ module CanonicalURL
         @canonical_url = url_for_options
       end
     end
+
+    def default_canonical
+      @default_canonical ||= begin
+        canonical = +"#{Discourse.base_url_no_prefix}#{request.path}"
+        allowed_params = params.select { |key| ALLOWED_CANONICAL_PARAMS.include?(key) }
+        if allowed_params.present?
+          canonical << "?#{allowed_params.keys.zip(allowed_params.values).map { |key, value| "#{key}=#{value}" }.join("&")}"
+        end
+        canonical
+      end
+    end
+
+    def self.included(base)
+      base.helper_method :default_canonical
+    end
   end
 
   module Helpers
-    ALLOWED_CANONICAL_PARAMS = %w(page)
     def canonical_link_tag(url = nil)
       tag('link', rel: 'canonical', href: url || @canonical_url || default_canonical)
     end
-
-    def default_canonical
-      canonical = +"#{Discourse.base_url_no_prefix}#{request.path}"
-      allowed_params = params.select { |key| ALLOWED_CANONICAL_PARAMS.include?(key) }
-      if allowed_params.present?
-        canonical << "?#{allowed_params.keys.zip(allowed_params.values).map { |key, value| "#{key}=#{value}" }.join("&")}"
-      end
-      canonical
-    end
   end
 end
 
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 603d2f3..34d9ecb 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -668,6 +668,33 @@ RSpec.describe ApplicationController do
     expect(response.body).to have_tag("link", with: { rel: "canonical", href: "http://test.localhost/t/#{topic.slug}/#{topic.id}" })
   end
 
+  it "adds a noindex header if non-canonical indexing is disabled" do
+    SiteSetting.allow_indexing_non_canonical_urls = false
+    get '/'
+    expect(response.headers['X-Robots-Tag']).to be_nil
+
+    get '/latest'
+    expect(response.headers['X-Robots-Tag']).to be_nil
+
+    get '/categories'
+    expect(response.headers['X-Robots-Tag']).to be_nil
+
+    topic = create_post.topic
+    get "/t/#{topic.slug}/#{topic.id}"
+    expect(response.headers['X-Robots-Tag']).to be_nil
+    post = create_post(topic_id: topic.id)
+    get "/t/#{topic.slug}/#{topic.id}/2"
+    expect(response.headers['X-Robots-Tag']).to eq('noindex')
+
+    20.times do
+      create_post(topic_id: topic.id)
+    end
+    get "/t/#{topic.slug}/#{topic.id}/21"
+    expect(response.headers['X-Robots-Tag']).to eq('noindex')
+    get "/t/#{topic.slug}/#{topic.id}?page=2"
+    expect(response.headers['X-Robots-Tag']).to be_nil
+  end
+
   context "default locale" do
     before do
       SiteSetting.default_locale = :fr
diff --git a/spec/requests/groups_controller_spec.rb b/spec/requests/groups_controller_spec.rb
index d918db8..ff7b00a 100644
--- a/spec/requests/groups_controller_spec.rb
+++ b/spec/requests/groups_controller_spec.rb
@@ -176,12 +176,6 @@ describe GroupsController do
       )
     end
 
-    it 'should return correct X-Robots-Tag header when allow_index_in_robots_txt is set to false' do
-      SiteSetting.allow_index_in_robots_txt = false
-      get "/groups"
-      expect(response.headers['X-Robots-Tag']).to eq('noindex, nofollow')
-    end
-
     context 'viewing groups of another user' do
       describe 'when an invalid username is given' do
         it 'should return the right response' do

GitHub sha: 5647819de40bd7900bc4e9556ef3a673075dab4d

This commit appears in #15026 which was approved by pmusaraj and davidtaylorhq. It was merged by Falco.