DEV: updates to Amazon Oneboxing (#461)

DEV: updates to Amazon Oneboxing (#461)

  • Add more Amazon domains to support additional countries

  • If available, grab the canonical URL for a given Amazon URL from the cached response body, rather than making a separate HTTP GET request.

  • If the canonical URL cannot be determined, we use a new, shorter /dp/ link, which seems to be preferred by Amazon at the moment.

  • Allow passing in a custom User-Agent to use for Amazon requests.

diff --git a/lib/onebox/engine/amazon_onebox.rb b/lib/onebox/engine/amazon_onebox.rb
index 22a2eea..9b23c93 100644
--- a/lib/onebox/engine/amazon_onebox.rb
+++ b/lib/onebox/engine/amazon_onebox.rb
@@ -11,11 +11,22 @@ module Onebox
       include HTML
 
       always_https
-      matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx)\//)
+      matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)
 
       def url
+        # Have we cached the HTML body of the requested URL?
+        # If so, try to grab the canonical URL from that document,
+        # rather than guess at the best URL structure to use
+        if @body_cacher && @body_cacher.respond_to?('cache_response_body?')
+          if @body_cacher.cached_response_body_exists?(uri.to_s)
+            @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, @body_cacher)
+            canonical_link = @raw.at('//link[@rel="canonical"]/@href')
+            return canonical_link.to_s if canonical_link
+          end
+        end
+
         if match && match[:id]
-          return "https://www.amazon.#{tld}/gp/aw/d/#{Onebox::Helpers.uri_encode(match[:id])}"
+          return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
         end
 
         @url
@@ -26,10 +37,9 @@ module Onebox
       end
 
       def http_params
-        {
-          'User-Agent' =>
-          'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
-        }
+        if @options && @options[:user_agent]
+          { 'User-Agent' => @options[:user_agent] }
+        end
       end
 
       private
diff --git a/spec/lib/onebox/engine/amazon_onebox_spec.rb b/spec/lib/onebox/engine/amazon_onebox_spec.rb
index 809acdf..85a75d4 100644
--- a/spec/lib/onebox/engine/amazon_onebox_spec.rb
+++ b/spec/lib/onebox/engine/amazon_onebox_spec.rb
@@ -7,7 +7,7 @@ describe Onebox::Engine::AmazonOnebox do
   context "regular amazon page" do
     before do
       @link = "https://www.amazon.com/Knit-Noro-Accessories-Colorful-Little/dp/193609620X"
-      @uri = "https://www.amazon.com/gp/aw/d/193609620X"
+      @uri = "https://www.amazon.com/dp/193609620X"
     end
     include_context "engines"
     it_behaves_like "an engine"
@@ -68,10 +68,10 @@ describe Onebox::Engine::AmazonOnebox do
     describe "#url" do
       it "maintains the same http/https scheme as the requested URL" do
         expect(described_class.new("https://www.amazon.fr/gp/product/B01BYD0TZM").url)
-          .to eq("https://www.amazon.fr/gp/aw/d/B01BYD0TZM")
+          .to eq("https://www.amazon.fr/dp/B01BYD0TZM")
 
         expect(described_class.new("http://www.amazon.fr/gp/product/B01BYD0TZM").url)
-          .to eq("https://www.amazon.fr/gp/aw/d/B01BYD0TZM")
+          .to eq("https://www.amazon.fr/dp/B01BYD0TZM")
       end
     end
 
@@ -99,7 +99,7 @@ describe Onebox::Engine::AmazonOnebox do
     let(:html) { described_class.new(link).to_html }
 
     before do
-      fake("https://www.amazon.com/gp/aw/d/B01MFXN4Y2", response("amazon-og"))
+      fake("https://www.amazon.com/dp/B01MFXN4Y2", response("amazon-og"))
     end
 
     describe "#to_html" do
@@ -122,7 +122,7 @@ describe Onebox::Engine::AmazonOnebox do
     let(:html) { described_class.new(link).to_html }
 
     before do
-      fake("https://www.amazon.com/gp/aw/d/B00AYQNR46", response("amazon"))
+      fake("https://www.amazon.com/dp/B00AYQNR46", response("amazon"))
     end
 
     describe "#to_html" do
@@ -146,7 +146,7 @@ describe Onebox::Engine::AmazonOnebox do
     let(:html) { described_class.new(link).to_html }
 
     before do
-      fake("https://www.amazon.com/gp/aw/d/193435659X", response("amazon-ebook"))
+      fake("https://www.amazon.com/dp/193435659X", response("amazon-ebook"))
     end
 
     describe "#to_html" do

GitHub sha: 2447db2b

This commit appears in #461 which was approved by ZogStriP. It was merged by jbrw.