DEV: Small Amazon fixes (#463)

DEV: Small Amazon fixes (#463)

  • DEV: Small Amazon fixes

  • Use the canonical URL as the link within the generated onebox

  • Use the data-old-hires value for the image URL, if available

  • Version bump to 2.2.10

  • Truncate description, regardless of where it is sourced from

Co-authored-by: Jarek Radosz jradosz@gmail.com

diff --git a/lib/onebox/engine/amazon_onebox.rb b/lib/onebox/engine/amazon_onebox.rb
index 9b23c93..ae15da0 100644
--- a/lib/onebox/engine/amazon_onebox.rb
+++ b/lib/onebox/engine/amazon_onebox.rb
@@ -14,17 +14,20 @@ module Onebox
       matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)
 
       def url
-        # Have we cached the HTML body of the requested URL?
-        # If so, try to grab the canonical URL from that document,
+        # If possible, fetch the cached HTML body immediately so we can
+        # try to grab the canonical URL from that document,
         # rather than guess at the best URL structure to use
-        if @body_cacher && @body_cacher.respond_to?('cache_response_body?')
-          if @body_cacher.cached_response_body_exists?(uri.to_s)
-            @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, @body_cacher)
-            canonical_link = @raw.at('//link[@rel="canonical"]/@href')
-            return canonical_link.to_s if canonical_link
+        if body_cacher&.respond_to?('cache_response_body?')
+          if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
+            @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
           end
         end
 
+        if @raw
+          canonical_link = @raw.at('//link[@rel="canonical"]/@href')
+          return canonical_link.to_s if canonical_link
+        end
+
         if match && match[:id]
           return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
         end
@@ -45,7 +48,7 @@ module Onebox
       private
 
       def match
-        @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[^\/]+)(?:\/|$)/mi)
+        @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
       end
 
       def image
@@ -60,6 +63,10 @@ module Onebox
         end
 
         if (landing_image = raw.css("#landingImage")) && landing_image.any?
+          attributes = landing_image.first.attributes
+
+          return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
+
           landing_image.first["src"].to_s
         end
 
@@ -110,7 +117,7 @@ module Onebox
           end
 
           result = {
-            link: link,
+            link: url,
             title: title,
             by_info: authors,
             image: og.image || image,
@@ -141,7 +148,7 @@ module Onebox
           end
 
           result = {
-            link: link,
+            link: url,
             title: title,
             by_info: authors,
             image: og.image || image,
@@ -157,7 +164,7 @@ module Onebox
         else
           title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
           result = {
-            link: link,
+            link: url,
             title: title,
             image: og.image || image,
             price: price
@@ -167,7 +174,10 @@ module Onebox
           result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]
 
           summary = raw.at("#productDescription")
-          result[:description] = og.description || (summary && summary.inner_text) || CGI.unescapeHTML(Onebox::Helpers.truncate(raw.css("meta[name=description]").first["content"], 250))
+
+          description = og.description || summary&.inner_text
+          description ||= raw.css("meta[name=description]").first&.[]("content")
+          result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
         end
 
         result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
diff --git a/lib/onebox/engine/html.rb b/lib/onebox/engine/html.rb
index f11c178..b0dfba2 100644
--- a/lib/onebox/engine/html.rb
+++ b/lib/onebox/engine/html.rb
@@ -11,10 +11,13 @@ module Onebox
       end
 
       def raw
-        body_cacher = self.options[:body_cacher] if self.options
         @raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher)
       end
 
+      def body_cacher
+        self.options&.[](:body_cacher)
+      end
+
       def html?
         raw.respond_to(:css)
       end
diff --git a/lib/onebox/version.rb b/lib/onebox/version.rb
index cf43cd2..9370cec 100644
--- a/lib/onebox/version.rb
+++ b/lib/onebox/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Onebox
-  VERSION = "2.2.9"
+  VERSION = "2.2.10"
 end
diff --git a/spec/lib/onebox/engine/amazon_onebox_spec.rb b/spec/lib/onebox/engine/amazon_onebox_spec.rb
index 85a75d4..a405335 100644
--- a/spec/lib/onebox/engine/amazon_onebox_spec.rb
+++ b/spec/lib/onebox/engine/amazon_onebox_spec.rb
@@ -73,6 +73,13 @@ describe Onebox::Engine::AmazonOnebox do
         expect(described_class.new("http://www.amazon.fr/gp/product/B01BYD0TZM").url)
           .to eq("https://www.amazon.fr/dp/B01BYD0TZM")
       end
+
+      let(:long_url) { "https://www.amazon.ca/gp/product/B087Z3N428?pf_rd_r=SXABADD0ZZ3NF9Q5F8TW&pf_rd_p=05378fd5-c43e-4948-99b1-a65b129fdd73&pd_rd_r=0237fb28-7f47-49f4-986a-be0c78e52863&pd_rd_w=FfIoI&pd_rd_wg=Hw4qq&ref_=pd_gw_unk" }
+
+      it "removes parameters from the URL" do
+        expect(described_class.new(long_url).url)
+          .not_to include("?pf_rd_r")
+      end
     end
 
     describe "#to_html" do

GitHub sha: d84e820c

1 Like

This commit appears in #463 which was approved by CvX. It was merged by jbrw.