DEV: General refactoring and cleanup (#474)

DEV: General refactoring and cleanup (#474)

Included:

  • DEV: Remove some defunct sites
  • DEV: Refactor GoogleDocsOnebox
  • DEV: Refactor PastebinOnebox
  • DEV: Refactor GfycatOnebox
  • DEV: Refactor PdfOnebox
  • DEV: Refactor PubmedOnebox
  • DEV: Refactor AmazonOnebox
  • DEV: Minor InstagramOnebox clean up
  • DEV: Small tweaks to TrelloOnebox

Other stuff:

  • DEV: Don’t use link and url interchangeably
  • DEV: Roll Layout#link into Layout#uri
  • DEV: Properly memoize Layout#uri
  • DEV: Re-use Layout#uri
  • DEV: Rename get_secure_image to secure_image_url
  • DEV: Normalize matches_regexp calls
  • DEV: Use path-specific encoding rules
  • DEV: Layout#checksum is no longer used
  • DEV: Remove double protected
  • DEV: Refactor Matcher
  • DEV: Use the existing Engine#uri method
  • DEV: The first Engine#initialize arg is url
  • DEV: Use match?
  • DEV: Simplify allowlistedgeneric template
  • DEV: Initialize @options in json spec
  • DEV: Simplify conditionals
  • DEV: Merge attr_readers
  • DEV: Whitespace, after() before tests
diff --git a/lib/onebox/engine.rb b/lib/onebox/engine.rb
index 4986b32..9f64180 100644
--- a/lib/onebox/engine.rb
+++ b/lib/onebox/engine.rb
@@ -28,23 +28,19 @@ module Onebox
       end
     end
 
-    attr_reader :url, :uri
-    attr_reader :timeout
+    attr_reader :url, :uri, :options, :timeout
     attr :errors
 
     DEFAULT = {}
-    def options
-      @options
-    end
 
     def options=(opt)
-      return @options if opt.nil? #make sure options provided
-      opt = opt.to_h  if opt.instance_of?(OpenStruct)
+      return @options if opt.nil? # make sure options provided
+      opt = opt.to_h if opt.instance_of?(OpenStruct)
       @options.merge!(opt)
       @options
     end
 
-    def initialize(link, timeout = nil)
+    def initialize(url, timeout = nil)
       @errors = {}
       @options = DEFAULT
       class_name = self.class.name.split("::").last.to_s
@@ -52,8 +48,8 @@ module Onebox
       # Set the engine options extracted from global options.
       self.options = Onebox.options[class_name] || {}
 
-      @url = link
-      @uri = URI(link)
+      @url = url
+      @uri = URI(url)
       if always_https?
         @uri.scheme = 'https'
         @url = @uri.to_s
diff --git a/lib/onebox/engine/allowlisted_generic_onebox.rb b/lib/onebox/engine/allowlisted_generic_onebox.rb
index 0397296..f04e064 100644
--- a/lib/onebox/engine/allowlisted_generic_onebox.rb
+++ b/lib/onebox/engine/allowlisted_generic_onebox.rb
@@ -27,7 +27,6 @@ module Onebox
           500px.com
           8tracks.com
           abc.net.au
-          about.com
           answers.com
           arstechnica.com
           ask.com
@@ -36,11 +35,9 @@ module Onebox
           bbs.boingboing.net
           bestbuy.ca
           bestbuy.com
-          blip.tv
           bloomberg.com
           businessinsider.com
           change.org
-          clikthrough.com
           cnet.com
           cnn.com
           codepen.io
@@ -90,7 +87,6 @@ module Onebox
           meetup.com
           mixcloud.com
           mlb.com
-          myshopify.com
           myspace.com
           nba.com
           npr.org
@@ -98,16 +94,13 @@ module Onebox
           photobucket.com
           pinterest.com
           reference.com
-          revision3.com
           rottentomatoes.com
           samsung.com
-          screenr.com
           scribd.com
           slideshare.net
           sourceforge.net
           speakerdeck.com
           spotify.com
-          squidoo.com
           streamable.com
           techcrunch.com
           ted.com
@@ -124,7 +117,6 @@ module Onebox
           twitpic.com
           usatoday.com
           viddler.com
-          videojug.com
           vine.co
           walmart.com
           washingtonpost.com
@@ -275,7 +267,6 @@ module Onebox
 
       def rewrite_https(html)
         return unless html
-        uri = URI(@url)
         if AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.rewrites)
           html = html.gsub("http://", "https://")
         end
diff --git a/lib/onebox/engine/amazon_onebox.rb b/lib/onebox/engine/amazon_onebox.rb
index eb5463c..4ea26aa 100644
--- a/lib/onebox/engine/amazon_onebox.rb
+++ b/lib/onebox/engine/amazon_onebox.rb
@@ -19,10 +19,8 @@ module Onebox
         # If possible, fetch the cached HTML body immediately so we can
         # try to grab the canonical URL from that document,
         # rather than guess at the best URL structure to use
-        if body_cacher&.respond_to?('cache_response_body?')
-          if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
-            @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
-          end
+        if !@raw && has_cached_body
+          @raw = Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
         end
 
         if @raw
@@ -31,7 +29,8 @@ module Onebox
         end
 
         if match && match[:id]
-          return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
+          id = Addressable::URI.encode_component(match[:id], Addressable::URI::CharacterClasses::PATH)
+          return "https://www.amazon.#{tld}/dp/#{id}"
         end
 
         @url
@@ -49,6 +48,12 @@ module Onebox
 
       private
 
+      def has_cached_body
+        body_cacher&.respond_to?('cache_response_body?') &&
+          body_cacher.cache_response_body?(uri.to_s) &&
+          body_cacher.cached_response_body_exists?(uri.to_s)
+      end
+
       def match
         @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
       end
@@ -57,9 +62,9 @@ module Onebox
         if (main_image = raw.css("#main-image")) && main_image.any?
           attributes = main_image.first.attributes
 
-          return attributes["data-a-hires"].to_s if attributes["data-a-hires"]
-
-          if attributes["data-a-dynamic-image"]
+          if attributes["data-a-hires"]
+            return attributes["data-a-hires"].to_s
+          elsif attributes["data-a-dynamic-image"]
             return ::JSON.parse(attributes["data-a-dynamic-image"].value).keys.first
           end
         end
@@ -67,9 +72,11 @@ module Onebox
         if (landing_image = raw.css("#landingImage")) && landing_image.any?
           attributes = landing_image.first.attributes
 
-          return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
-
-          landing_image.first["src"].to_s
+          if attributes["data-old-hires"]
+            return attributes["data-old-hires"].to_s
+          else
+            return landing_image.first["src"].to_s
+          end
         end
 
         if (ebook_image = raw.css("#ebooksImgBlkFront")) && ebook_image.any?
@@ -91,16 +98,16 @@ module Onebox
       end
 
       def multiple_authors(authors_xpath)
-        author_list = raw.xpath(authors_xpath)
-        authors = []
-        author_list.each { |a| authors << a.inner_text.strip }
-        authors.join(", ")
+        raw
+          .xpath(authors_xpath)
+          .map { |a| a.inner_text.strip }
+          .join(", ")
       end
 
       def data
         og = ::Onebox::OpenGraph.new(raw)
 
-        if raw.at_css('#dp.book_mobile') #printed books
+        if raw.at_css('#dp.book_mobile') # printed books
           title = raw.at("h1#title")&.inner_text
           authors = raw.at_css('#byline_secondary_view_div') ? multiple_authors("//div[@id='byline_secondary_view_div']//span[@class='a-text-bold']") : raw.at("#byline")&.inner_text
           rating = raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || raw.at("#cmrsArcLink .a-icon")&.inner_text
diff --git a/lib/onebox/engine/flickr_onebox.rb b/lib/onebox/engine/flickr_onebox.rb
index 8819d6b..3ed2668 100644
--- a/lib/onebox/engine/flickr_onebox.rb
+++ b/lib/onebox/engine/flickr_onebox.rb
@@ -32,7 +32,7 @@ module Onebox
                   <span class='album-title'>#{album_title}</span>
                 </span>
               </span>
-              <img src='#{og.get_secure_image}' #{og.title_attr} height='#{og.image_height}' width='#{og.image_width}'>
+              <img src='#{og.secure_image_url}' #{og.title_attr} height='#{og.image_height}' width='#{og.image_width}'>
             </a>
           </div>
         HTML
@@ -43,7 +43,7 @@ module Onebox
 
         <<-HTML
           <a href='#{escaped_url}' target='_blank' rel='noopener' class="onebox">
-            <img src='#{og.get_secure_image}' #{og.title_attr} alt='Imgur' height='#{og.image_height}' width='#{og.image_width}'>
+            <img src='#{og.secure_image_url}' #{og.title_attr} alt='Imgur' height='#{og.image_height}' width='#{og.image_width}'>
           </a>
         HTML
       end
diff --git a/lib/onebox/engine/gfycat_onebox.rb b/lib/onebox/engine/gfycat_onebox.rb
index 10973c3..27fd4bf 100644
--- a/lib/onebox/engine/gfycat_onebox.rb
+++ b/lib/onebox/engine/gfycat_onebox.rb
@@ -9,8 +9,8 @@ module Onebox

[... diff too long, it was truncated ...]

GitHub sha: b816ada0

This commit appears in #474 which was approved by eviltrout. It was merged by CvX.