DEV: Oneboxer wildcard subdomains (#13015)

DEV: Oneboxer wildcard subdomains (#13015)

  • DEV: Allow wildcards in Oneboxer optional domain Site Settings

Allows a wildcard to be used as a subdomain on Oneboxer-related SiteSettings, e.g.:

  • force_get_hosts
  • cache_onebox_response_body_domains
  • force_custom_user_agent_hosts
  • DEV: fix typos

  • FIX: Try doing a GET after receiving a 500 error from a HEAD

By default we try to do a HEAD requests. If this results in a 500 error response, we should try to do a GET

  • DEV: force_get_hosts should be a hidden setting

  • DEV: Oneboxer Strategies

Have an alternative oneboxing ‘strategy’ (i.e., set of options) to use when an attempt to generate a Onebox fails. Keep track of any non-default strategies that were used on a particular host, and use that strategy for that host in the future.

Initially, the alternate strategy (force_get_and_ua) forces the FinalDestination step of Oneboxing to do a GET rather than HEAD, and forces a custom user agent.

  • DEV: change stubbed return code

The stubbed status code needs to be a value not recognized by FinalDestination

diff --git a/config/site_settings.yml b/config/site_settings.yml
index 80658f6..8da736a 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1637,6 +1637,10 @@ onebox:
   force_custom_user_agent_hosts:
     default: "http://codepen.io"
     type: list
+  force_get_hosts:
+    default: "us.battle.net|news.yahoo.com|*.medium.com"
+    type: list
+    hidden: true
   facebook_app_access_token:
     default: ""
     secret: true
diff --git a/lib/final_destination.rb b/lib/final_destination.rb
index 73f7ce3..348a44f 100644
--- a/lib/final_destination.rb
+++ b/lib/final_destination.rb
@@ -215,7 +215,7 @@ class FinalDestination
 
       @status = :resolved
       return @uri
-    when 400, 405, 406, 409, 501
+    when 400, 405, 406, 409, 500, 501
       response_status, small_headers = small_get(request_headers)
 
       if response_status == 200
@@ -300,7 +300,17 @@ class FinalDestination
 
   def hostname_matches?(url)
     url = uri(url)
-    @uri && url.present? && @uri.hostname == url&.hostname
+
+    if @uri&.hostname.present? && url&.hostname.present?
+      hostname_parts = url.hostname.split('.')
+      has_wildcard = hostname_parts.first == '*'
+
+      if has_wildcard
+        @uri.hostname.end_with?(hostname_parts[1..-1].join('.'))
+      else
+        @uri.hostname == url.hostname
+      end
+    end
   end
 
   def is_dest_valid?
diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb
index 4e0b96e..954e810 100644
--- a/lib/oneboxer.rb
+++ b/lib/oneboxer.rb
@@ -32,7 +32,8 @@ module Oneboxer
   end
 
   def self.force_get_hosts
-    hosts = ['http://us.battle.net', 'https://news.yahoo.com']
+    hosts = []
+    hosts += SiteSetting.force_get_hosts.split('|').collect { |domain| "https://#{domain}" }
     hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" }
     hosts += amazon_domains
 
@@ -394,8 +395,13 @@ module Oneboxer
     allowed += SiteSetting.allowed_iframes.split("|")
   end
 
-  def self.external_onebox(url)
+  def self.external_onebox(url, available_strategies = nil)
     Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do
+
+      uri = URI(url)
+      available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
+      strategy = available_strategies.shift
+
       fd_options = {
         ignore_redirects: ignore_redirects,
         ignore_hostnames: blocked_domains,
@@ -404,6 +410,13 @@ module Oneboxer
         preserve_fragment_url_hosts: preserve_fragment_url_hosts
       }
 
+      if strategy && Oneboxer.strategies[strategy][:force_get_host]
+        fd_options[:force_get_hosts] = ["https://#{uri.hostname}"]
+      end
+      if strategy && Oneboxer.strategies[strategy][:force_custom_user_agent_host]
+        fd_options[:force_custom_user_agent_hosts] = ["https://#{uri.hostname}"]
+      end
+
       user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present?
       fd_options[:default_user_agent] = user_agent_override if user_agent_override
 
@@ -415,6 +428,11 @@ module Oneboxer
         if fd.status == :invalid_address
           args[:error_message] = I18n.t("errors.onebox.invalid_address", hostname: fd.hostname)
         elsif fd.status_code
+          # Try a different oneboxing strategy, if we have any options left:
+          if available_strategies.present?
+            return external_onebox(url, available_strategies)
+          end
+
           args[:error_message] = I18n.t("errors.onebox.error_response", status_code: fd.status_code)
         end
 
@@ -466,6 +484,8 @@ module Oneboxer
         end
       end
 
+      Oneboxer.cache_preferred_strategy(uri.hostname, strategy)
+
       result
     end
   end
@@ -490,4 +510,44 @@ module Oneboxer
     end
   end
 
+  def self.ordered_strategies(hostname)
+    all = strategies.keys
+    preferred = Oneboxer.preferred_strategy(hostname)
+
+    all.insert(0, all.delete(preferred)) if all.include?(preferred)
+
+    all
+  end
+
+  def self.strategies
+    {
+      default: {}, # don't override anything by default
+      force_get_and_ua: {
+        force_get_host: true,
+        force_custom_user_agent_host: true,
+      },
+    }
+  end
+
+  def self.cache_preferred_strategy(hostname, strategy)
+    return if strategy == :default
+
+    key = redis_oneboxer_strategy_key(hostname)
+    Discourse.redis.without_namespace.setex(key, 2.weeks.to_i, strategy.to_s)
+  end
+
+  def self.clear_preferred_strategy!(hostname)
+    key = redis_oneboxer_strategy_key(hostname)
+    Discourse.redis.without_namespace.del(key)
+  end
+
+  def self.preferred_strategy(hostname)
+    key = redis_oneboxer_strategy_key(hostname)
+    Discourse.redis.without_namespace.get(key)&.to_sym
+  end
+
+  def self.redis_oneboxer_strategy_key(hostname)
+    "ONEBOXER_STRATEGY_#{hostname}"
+  end
+
 end
diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb
index 585ca61..678423e 100644
--- a/spec/components/final_destination_spec.rb
+++ b/spec/components/final_destination_spec.rb
@@ -9,7 +9,7 @@ describe FinalDestination do
     {
       ignore_redirects: ['https://ignore-me.com'],
 
-      force_get_hosts: ['https://force.get.com'],
+      force_get_hosts: ['https://force.get.com', 'https://*.ihaveawildcard.com/'],
 
       preserve_fragment_url_hosts: ['https://eviltrout.com'],
 
@@ -17,6 +17,7 @@ describe FinalDestination do
       lookup_ip: lambda do |host|
         case host
         when 'eviltrout.com' then '52.84.143.152'
+        when 'particularly.eviltrout.com' then '52.84.143.152'
         when 'codinghorror.com' then '91.146.108.148'
         when 'discourse.org' then '104.25.152.10'
         when 'some_thing.example.com' then '104.25.152.10'
@@ -24,6 +25,7 @@ describe FinalDestination do
         when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
         when 'ignore-me.com' then '53.84.143.152'
         when 'force.get.com' then '22.102.29.40'
+        when 'any-subdomain.ihaveawildcard.com' then '104.25.152.11'
         when 'wikipedia.com' then '1.2.3.4'
         else
           as_ip = IPAddr.new(host)
@@ -170,8 +172,11 @@ describe FinalDestination do
       before do
         stub_request(:head, 'https://force.get.com/posts?page=4')
         stub_request(:get, 'https://force.get.com/posts?page=4')
+        stub_request(:get, 'https://any-subdomain.ihaveawildcard.com/some/other/content')
         stub_request(:head, 'https://eviltrout.com/posts?page=2')
         stub_request(:get, 'https://eviltrout.com/posts?page=2')
+        stub_request(:head, 'https://particularly.eviltrout.com/has/a/secret/plan')
+        stub_request(:get, 'https://particularly.eviltrout.com/has/a/secret/plan')
       end
 
       it "will do a GET when forced" do
@@ -189,6 +194,23 @@ describe FinalDestination do
         expect(WebMock).to_not have_requested(:get, 'https://eviltrout.com/posts?page=2')
         expect(WebMock).to have_requested(:head, 'https://eviltrout.com/posts?page=2')
       end
+
+      it "will do a GET when forced on a wildcard subdomain" do
+        final = FinalDestination.new('https://any-subdomain.ihaveawildcard.com/some/other/content', opts)
+        expect(final.resolve.to_s).to eq('https://any-subdomain.ihaveawildcard.com/some/other/content')
+        expect(final.status).to eq(:resolved)
+        expect(WebMock).to have_requested(:get, 'https://any-subdomain.ihaveawildcard.com/some/other/content')
+        expect(WebMock).to_not have_requested(:head, 'https://any-subdomain.ihaveawildcard.com/some/other/content')
+      end
+
+      it "will do a HEAD if on a subdomain of a forced get domain without a wildcard" do
+        final = FinalDestination.new('https://particularly.eviltrout.com/has/a/secret/plan', opts)
+        expect(final.resolve.to_s).to eq('https://particularly.eviltrout.com/has/a/secret/plan')
+        expect(final.status).to eq(:resolved)

[... diff too long, it was truncated ...]

GitHub sha: 19182b13

This commit appears in #13015 which was approved by Falco. It was merged by jbrw.