FIX: use crawler layout when saving url in Wayback Machine (#7667)

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 1075233..7e30169 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -79,7 +79,9 @@ class ApplicationController < ActionController::Base
       request.user_agent &&
       (request.content_type.blank? || request.content_type.include?('html')) &&
       !['json', 'rss'].include?(params[:format]) &&
-      (has_escaped_fragment? || CrawlerDetection.crawler?(request.user_agent) || params.key?("print"))
+      (has_escaped_fragment? || params.key?("print") ||
+      CrawlerDetection.crawler?(request.user_agent, request.headers["HTTP_VIA"])
+      )
   end
 
   def perform_refresh_session
diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb
index 040c339..6cd1eba 100644
--- a/lib/crawler_detection.rb
+++ b/lib/crawler_detection.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 module CrawlerDetection
+  WAYBACK_MACHINE_URL = "web.archive.org"
 
   def self.to_matcher(string, type: nil)
     escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
@@ -13,8 +14,8 @@ module CrawlerDetection
     Regexp.new(escaped, Regexp::IGNORECASE)
   end
 
-  def self.crawler?(user_agent)
-    return true if user_agent.nil?
+  def self.crawler?(user_agent, via_header = nil)
+    return true if user_agent.nil? || via_header&.include?(WAYBACK_MACHINE_URL)
 
     # this is done to avoid regenerating regexes
     @non_crawler_matchers ||= {}
diff --git a/lib/middleware/anonymous_cache.rb b/lib/middleware/anonymous_cache.rb
index 48a7648..51d0e19 100644
--- a/lib/middleware/anonymous_cache.rb
+++ b/lib/middleware/anonymous_cache.rb
@@ -62,7 +62,7 @@ module Middleware
         @is_crawler ||=
           begin
             user_agent = @env[USER_AGENT]
-            if CrawlerDetection.crawler?(user_agent)
+            if CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
               :true
             else
               user_agent.downcase.include?("discourse") ? :true : :false
diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb
index 907b968..7d53933 100644
--- a/spec/components/crawler_detection_spec.rb
+++ b/spec/components/crawler_detection_spec.rb
@@ -5,9 +5,9 @@ require_dependency 'crawler_detection'
 
 describe CrawlerDetection do
 
-  def crawler!(s)
-    if (!CrawlerDetection.crawler?(s))
-      raise "#{s} should be a crawler!"
+  def crawler!(user_agent, via = nil)
+    if (!CrawlerDetection.crawler?(user_agent, via))
+      raise "#{user_agent} should be a crawler!"
     end
   end
 
@@ -50,6 +50,10 @@ describe CrawlerDetection do
       crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
     end
 
+    it "returns true when VIA header contains 'web.archive.org'" do
+      crawler!("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)")
+    end
+
     it "returns false for non-crawler user agents" do
       not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
       not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb
index 2964a91..7825047 100644
--- a/spec/requests/topics_controller_spec.rb
+++ b/spec/requests/topics_controller_spec.rb
@@ -2690,15 +2690,14 @@ RSpec.describe TopicsController do
     end
 
     context "when a crawler" do
-      it "renders with the crawler layout, and handles proper pagination" do
-
-        page1_time = 3.months.ago
-        page2_time = 2.months.ago
-        page3_time = 1.month.ago
+      let(:topic) { Fabricate(:topic) }
+      let(:page1_time) { 3.months.ago }
+      let(:page2_time) { 2.months.ago }
+      let(:page3_time) { 1.month.ago }
 
+      before do
         freeze_time page1_time
 
-        topic = Fabricate(:topic)
         Fabricate(:post, topic: topic)
         Fabricate(:post, topic: topic)
 
@@ -2712,33 +2711,38 @@ RSpec.describe TopicsController do
         # ugly, but no interface to set this and we don't want to create
         # 100 posts to test this thing
         TopicView.stubs(:chunk_size).returns(2)
+      end
 
-        user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-        get topic.url, env: { "HTTP_USER_AGENT" => user_agent }
+      shared_examples "crawler layout" do |user_agent, via|
+        it "renders with the crawler layout, and handles proper pagination" do
+          get topic.url, env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
 
-        body = response.body
+          body = response.body
 
-        expect(body).to have_tag(:body, with: { class: 'crawler' })
-        expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
-        expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
+          expect(body).to have_tag(:body, with: { class: 'crawler' })
+          expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
+          expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
 
-        expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
+          expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
 
-        get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent }
-        body = response.body
+          get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
+          body = response.body
 
-        expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
+          expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
 
-        expect(body).to include('<link rel="prev" href="' + topic.relative_url)
-        expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
+          expect(body).to include('<link rel="prev" href="' + topic.relative_url)
+          expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
 
-        get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent }
-        body = response.body
+          get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
+          body = response.body
 
-        expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
-        expect(body).to include('<link rel="prev" href="' + topic.relative_url + "?page=2")
+          expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
+          expect(body).to include('<link rel="prev" href="' + topic.relative_url + "?page=2")
+        end
       end
+
+      include_examples "crawler layout", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", nil
+      include_examples "crawler layout", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)"
     end
 
   end
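
For reference, a minimal sketch of the new detection path, assuming a Discourse Rails console and default crawler site settings (the user agent below is an ordinary desktop Chrome string, so only the Via header triggers detection):

browser_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

# A regular browser user agent is not detected as a crawler on its own
CrawlerDetection.crawler?(browser_ua)
# => false

# The Via header added by the Wayback Machine's "Save Page Now" proxy
# now forces crawler detection, so the archived page gets the crawler layout
CrawlerDetection.crawler?(browser_ua, "HTTP/1.0 web.archive.org (Wayback Save Page)")
# => true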

GitHub sha: 42809f4d

Minor followup here: I am not sure we need to repeat the entirety of the integration test.

I would just do one get route for the Wayback Machine as an integration test to confirm the detection is all good.
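
Something like this, perhaps (a rough sketch only, reusing the fabricated topic and the user agent from the spec above; not necessarily what the followup commit actually does):

context "when a crawler via the Wayback Machine" do
  it "renders with the crawler layout" do
    # topic is assumed to be fabricated with posts, as in the existing spec
    get topic.url, env: {
      "HTTP_USER_AGENT" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
      "HTTP_VIA" => "HTTP/1.0 web.archive.org (Wayback Save Page)"
    }

    expect(response.body).to have_tag(:body, with: { class: 'crawler' })
  end
end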

DEV: simpler spec for wayback machine crawler layout (#7696)