FEATURE: Oneboxer cache response body (#12562)
- FEATURE: Cache successful HTTP GET requests during Oneboxing
Some oneboxes may fail if when making excessive and/or odd requests against the target domains. This change provides a simple mechanism to cache the results of succesful GET requests as part of the oneboxing process, with the goal of reducing repeated requests and ultimately improving the rate of successful oneboxing.
To enable:
Set SiteSetting.cache_onebox_response_body
to true
Add the domains you’re interesting in caching to SiteSetting. cache_onebox_response_body_domains
e.g. example.com|example.org|example.net
Optionally set SiteSetting.cache_onebox_user_agent
to a user agent string of your choice to use when making requests against domains in the above list.
- FIX: Swap order of duration and value in redis call
The correct order for setex
arguments is key
, duration
, and value
.
Duration and value had been flipped, however the code would not have thrown an error because we were caching the value of 1.day.to_i
for a period of 1 seconds… The intention appears to be to set a value of 1 (purely as a flag) for a period of 1 day.
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 33bd132..90cd192 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1638,6 +1638,16 @@ onebox:
facebook_app_access_token:
default: ""
secret: true
+ cache_onebox_response_body:
+ default: false
+ hidden: true
+ cache_onebox_response_body_domains:
+ default: ""
+ type: list
+ hidden: true
+ cache_onebox_user_agent:
+ default: ""
+ hidden: true
spam:
add_rel_nofollow_to_user_content: true
hide_post_sensitivity:
diff --git a/lib/final_destination.rb b/lib/final_destination.rb
index 8bf06ee..0c0db2e 100644
--- a/lib/final_destination.rb
+++ b/lib/final_destination.rb
@@ -16,7 +16,7 @@ class FinalDestination
def self.cache_https_domain(domain)
key = redis_https_key(domain)
- Discourse.redis.without_namespace.setex(key, "1", 1.day.to_i).present?
+ Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1")
end
def self.is_https_domain?(domain)
@@ -28,6 +28,8 @@ class FinalDestination
"HTTPS_DOMAIN_#{domain}"
end
+ DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
+
attr_reader :status, :cookie, :status_code, :ignored
def initialize(url, opts = nil)
@@ -38,6 +40,7 @@ class FinalDestination
@force_get_hosts = @opts[:force_get_hosts] || []
@preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
@force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
+ @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
@opts[:max_redirects] ||= 5
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
@@ -67,7 +70,7 @@ class FinalDestination
@timeout = @opts[:timeout] || nil
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
@validate_uri = @opts.fetch(:validate_uri) { true }
- @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
+ @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
end
def self.connection_timeout
@@ -182,20 +185,35 @@ class FinalDestination
end
end
+ if Oneboxer.cached_response_body_exists?(@uri.to_s)
+ @status = :resolved
+ return @uri
+ end
+
headers = request_headers
+ middlewares = Excon.defaults[:middlewares]
+ middlewares << Excon::Middleware::Decompress if @http_verb == :get
+
response = Excon.public_send(@http_verb,
@uri.to_s,
read_timeout: timeout,
- headers: headers
+ headers: headers,
+ middlewares: middlewares
)
location = nil
response_headers = nil
-
response_status = response.status.to_i
case response.status
when 200
+ # Cache body of successful `get` requests
+ if @http_verb == :get
+ if Oneboxer.cache_response_body?(@uri)
+ Oneboxer.cache_response_body(@uri.to_s, response.body)
+ end
+ end
+
@status = :resolved
return @uri
when 400, 405, 406, 409, 501
diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb
index 79f5070..c30c4e9 100644
--- a/lib/oneboxer.rb
+++ b/lib/oneboxer.rb
@@ -26,13 +26,17 @@ module Oneboxer
@ignore_redirects ||= ['http://www.dropbox.com', 'http://store.steampowered.com', 'http://vimeo.com', Discourse.base_url]
end
+ def self.amazon_domains
+ amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk)
+ amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" }
+ end
+
def self.force_get_hosts
- @force_get_hosts ||= begin
- hosts = ['http://us.battle.net', 'https://news.yahoo.com']
- amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk)
+ hosts = ['http://us.battle.net', 'https://news.yahoo.com']
+ hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" }
+ hosts += amazon_domains
- hosts + amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" }
- end
+ hosts.uniq
end
def self.force_custom_user_agent_hosts
@@ -80,6 +84,33 @@ module Oneboxer
Discourse.cache.delete(onebox_failed_cache_key(url))
end
+ def self.cache_response_body?(uri)
+ uri = URI.parse(uri) if uri.is_a?(String)
+
+ if SiteSetting.cache_onebox_response_body?
+ SiteSetting.cache_onebox_response_body_domains.split("|").any? { |domain| uri.hostname.ends_with?(domain) }
+ end
+ end
+
+ def self.cache_response_body(uri, response)
+ key = redis_cached_response_body_key(uri)
+ Discourse.redis.without_namespace.setex(key, 1.minutes.to_i, response)
+ end
+
+ def self.cached_response_body_exists?(uri)
+ key = redis_cached_response_body_key(uri)
+ Discourse.redis.without_namespace.exists(key).to_i > 0
+ end
+
+ def self.fetch_cached_response_body(uri)
+ key = redis_cached_response_body_key(uri)
+ Discourse.redis.without_namespace.get(key)
+ end
+
+ def self.redis_cached_response_body_key(uri)
+ "CACHED_RESPONSE_#{uri}"
+ end
+
# Parse URLs out of HTML, returning the document when finished.
def self.each_onebox_link(string_or_doc, extra_paths: [])
doc = string_or_doc
@@ -368,12 +399,18 @@ module Oneboxer
def self.external_onebox(url)
Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do
- fd = FinalDestination.new(url,
- ignore_redirects: ignore_redirects,
- ignore_hostnames: blocked_domains,
- force_get_hosts: force_get_hosts,
- force_custom_user_agent_hosts: force_custom_user_agent_hosts,
- preserve_fragment_url_hosts: preserve_fragment_url_hosts)
+ fd_options = {
+ ignore_redirects: ignore_redirects,
+ ignore_hostnames: blocked_domains,
+ force_get_hosts: force_get_hosts,
+ force_custom_user_agent_hosts: force_custom_user_agent_hosts,
+ preserve_fragment_url_hosts: preserve_fragment_url_hosts
+ }
+
+ user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present?
+ fd_options[:default_user_agent] = user_agent_override if user_agent_override
+
+ fd = FinalDestination.new(url, fd_options)
uri = fd.resolve
if fd.status != :resolved
@@ -389,20 +426,22 @@ module Oneboxer
return error_box
end
- return blank_onebox if uri.blank? || blocked_domains.map { |hostname| uri.hostname.match?(hostname) }.any?
+ return blank_onebox if uri.blank? || blocked_domains.any? { |hostname| uri.hostname.match?(hostname) }
- options = {
+ onebox_options = {
max_width: 695,
sanitize_config: Onebox::DiscourseOneboxSanitizeConfig::Config::DISCOURSE_ONEBOX,
allowed_iframe_origins: allowed_iframe_origins,
hostname: GlobalSetting.hostname,
facebook_app_access_token: SiteSetting.facebook_app_access_token,
- disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls
+ disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls,
+ body_cacher: self
}
- options[:cookie] = fd.cookie if fd.cookie
+ onebox_options[:cookie] = fd.cookie if fd.cookie
+ onebox_options[:user_agent] = user_agent_override if user_agent_override
- r = Onebox.preview(uri.to_s, options)
+ r = Onebox.preview(uri.to_s, onebox_options)
[... diff too long, it was truncated ...]
GitHub sha: 68d0916e