FEATURE: Add more control of iframe embeds (#434)

FEATURE: Add more control of iframe embeds (#434)

All onebox engines now declare the iframe origins they need using requires_iframe_origins.

Consumers can choose to pass an array of allowed_iframe_origins in options. Engines that require iframe origins not included in the list will be skipped. The sanitizer will also strip all iframes whose origins are not in the allowed list. If allowed_iframe_origins is unspecified, all origins declared via requires_iframe_origins will be allowed.

diff --git a/lib/onebox/engine.rb b/lib/onebox/engine.rb
index 33f7807..6863a9d 100644
--- a/lib/onebox/engine.rb
+++ b/lib/onebox/engine.rb
@@ -12,6 +12,22 @@ module Onebox
       end.map(&method(:const_get))
     end
 
+    def self.all_iframe_origins
+      engines.flat_map { |e| e.iframe_origins }.uniq.compact
+    end
+
+    def self.origins_to_regexes(origins)
+      return /.*/ if origins.include?("*")
+      origins.map do |origin|
+        escaped_origin = Regexp.escape(origin)
+        if origin.start_with?("*.", "https://*.", "http://*.")
+          escaped_origin = escaped_origin.sub("\\*", '\S*')
+        end
+
+        Regexp.new("\\A#{escaped_origin}", 'i')
+      end
+    end
+
     attr_reader :url, :uri
     attr_reader :timeout
 
@@ -100,6 +116,14 @@ module Onebox
         class_variable_set :@@matcher, r
       end
 
+      def requires_iframe_origins(*origins)
+        class_variable_set :@@iframe_origins, origins
+      end
+
+      def iframe_origins
+        class_variable_defined?(:@@iframe_origins) ? class_variable_get(:@@iframe_origins) : []
+      end
+
       # calculates a name for onebox using the class name of engine
       def onebox_name
         name.split("::").last.downcase.gsub(/onebox/, "")
diff --git a/lib/onebox/engine/allowlisted_generic_onebox.rb b/lib/onebox/engine/allowlisted_generic_onebox.rb
index da71583..53f6dd0 100644
--- a/lib/onebox/engine/allowlisted_generic_onebox.rb
+++ b/lib/onebox/engine/allowlisted_generic_onebox.rb
@@ -281,7 +281,9 @@ module Onebox
       end
 
       def is_card?
-        data[:card] == 'player' && data[:player] =~ URI::regexp
+        data[:card] == 'player' &&
+          data[:player] =~ URI::regexp &&
+          options[:allowed_iframe_regexes]&.any? { |r| data[:player] =~ r }
       end
 
       def is_article?
@@ -305,16 +307,19 @@ module Onebox
       end
 
       def is_video?
-        data[:type] =~ /^video[\/\.]/ && !Onebox::Helpers.blank?(data[:video])
+        data[:type] =~ /^video[\/\.]/ &&
+          data[:video_type] == "video/mp4" && # Many sites include 'videos' with text/html types (i.e. iframes)
+          !Onebox::Helpers.blank?(data[:video])
       end
 
       def is_embedded?
-        data[:html] &&
-        data[:height] &&
-        (
-          data[:html]["iframe"] ||
-          AllowlistedGenericOnebox.html_providers.include?(data[:provider_name])
-        )
+        return false unless data[:html] && data[:height]
+        return true if AllowlistedGenericOnebox.html_providers.include?(data[:provider_name])
+        return false unless data[:html]["iframe"]
+
+        fragment = Nokogiri::HTML::fragment(data[:html])
+        src = fragment.at_css('iframe')&.[]("src")
+        options[:allowed_iframe_regexes]&.any? { |r| src =~ r }
       end
 
       def card_html
diff --git a/lib/onebox/engine/bandcamp_onebox.rb b/lib/onebox/engine/bandcamp_onebox.rb
index 7118b28..83ff790 100644
--- a/lib/onebox/engine/bandcamp_onebox.rb
+++ b/lib/onebox/engine/bandcamp_onebox.rb
@@ -8,6 +8,7 @@ module Onebox
 
       matches_regexp(/^https?:\/\/.*\.bandcamp\.com\/(album|track)\//)
       always_https
+      requires_iframe_origins "https://bandcamp.com"
 
       def placeholder_html
         og = get_opengraph
diff --git a/lib/onebox/engine/facebook_media_onebox.rb b/lib/onebox/engine/facebook_media_onebox.rb
index d6afa22..d676be3 100644
--- a/lib/onebox/engine/facebook_media_onebox.rb
+++ b/lib/onebox/engine/facebook_media_onebox.rb
@@ -8,6 +8,7 @@ module Onebox
 
       matches_regexp(/^https?:\/\/.*\.facebook\.com\/(\w+)\/(videos|\?).*/)
       always_https
+      requires_iframe_origins "https://www.facebook.com"
 
       def to_html
         metadata = get_twitter
diff --git a/lib/onebox/engine/google_calendar_onebox.rb b/lib/onebox/engine/google_calendar_onebox.rb
index 2500c92..8407164 100644
--- a/lib/onebox/engine/google_calendar_onebox.rb
+++ b/lib/onebox/engine/google_calendar_onebox.rb
@@ -7,6 +7,7 @@ module Onebox
 
       matches_regexp /^(https?:)?\/\/((www|calendar)\.google\.[\w.]{2,}|goo\.gl)\/calendar\/.+$/
       always_https
+      requires_iframe_origins "https://calendar.google.com"
 
       def to_html
         url = @url.split('&').first
diff --git a/lib/onebox/engine/google_maps_onebox.rb b/lib/onebox/engine/google_maps_onebox.rb
index c4a1d08..64adc49 100644
--- a/lib/onebox/engine/google_maps_onebox.rb
+++ b/lib/onebox/engine/google_maps_onebox.rb
@@ -23,6 +23,8 @@ module Onebox
 
       always_https
 
+      requires_iframe_origins("https://maps.google.com", "https://google.com")
+
       # Matches shortened Google Maps URLs
       matches_regexp :short,      %r"^(https?:)?//goo\.gl/maps/"
 
diff --git a/lib/onebox/engine/kaltura_onebox.rb b/lib/onebox/engine/kaltura_onebox.rb
index db20f90..99f5375 100644
--- a/lib/onebox/engine/kaltura_onebox.rb
+++ b/lib/onebox/engine/kaltura_onebox.rb
@@ -8,6 +8,7 @@ module Onebox
 
       always_https
       matches_regexp(/^https?:\/\/[a-z0-9]+\.kaltura\.com\/id\/[a-zA-Z0-9]+/)
+      requires_iframe_origins "https://*.kaltura.com"
 
       def preview_html
         og = get_opengraph
diff --git a/lib/onebox/engine/sketchfab_onebox.rb b/lib/onebox/engine/sketchfab_onebox.rb
index 29ff4cf..348d98b 100644
--- a/lib/onebox/engine/sketchfab_onebox.rb
+++ b/lib/onebox/engine/sketchfab_onebox.rb
@@ -8,6 +8,7 @@ module Onebox
 
       matches_regexp(/^https?:\/\/sketchfab\.com\/(?:models\/|3d-models\/(?:[^\/\s]+-)?)([a-z0-9]{32})/)
       always_https
+      requires_iframe_origins("https://sketchfab.com")
 
       def to_html
         og = get_opengraph
diff --git a/lib/onebox/engine/slides_onebox.rb b/lib/onebox/engine/slides_onebox.rb
index f3c00d8..0979a94 100644
--- a/lib/onebox/engine/slides_onebox.rb
+++ b/lib/onebox/engine/slides_onebox.rb
@@ -7,10 +7,11 @@ module Onebox
       include StandardEmbed
 
       matches_regexp(/^https?:\/\/slides\.com\/[\p{Alnum}_\-]+\/[\p{Alnum}_\-]+$/)
+      requires_iframe_origins "https://slides.com"
 
       def to_html
         <<-HTML
-          <iframe src="//slides.com#{uri.path}/embed?style=light"
+          <iframe src="https://slides.com#{uri.path}/embed?style=light"
                   width="576"
                   height="420"
                   scrolling="no"
diff --git a/lib/onebox/engine/steam_store_onebox.rb b/lib/onebox/engine/steam_store_onebox.rb
index c913659..20c5dd1 100644
--- a/lib/onebox/engine/steam_store_onebox.rb
+++ b/lib/onebox/engine/steam_store_onebox.rb
@@ -8,6 +8,7 @@ module Onebox
 
       always_https
       matches_regexp(/^https?:\/\/store\.steampowered\.com\/app\/\d+/)
+      requires_iframe_origins "https://store.steampowered.com"
 
       def placeholder_html
         og = get_opengraph
diff --git a/lib/onebox/engine/trello_onebox.rb b/lib/onebox/engine/trello_onebox.rb
index 0fc16ff..f9d4397 100644
--- a/lib/onebox/engine/trello_onebox.rb
+++ b/lib/onebox/engine/trello_onebox.rb
@@ -7,6 +7,7 @@ module Onebox
       include StandardEmbed
 
       matches_regexp(/^https:\/\/trello\.com\/[bc]\/\W*/)
+      requires_iframe_origins "https://trello.com"
       always_https
 
       def to_html
diff --git a/lib/onebox/engine/twitch_clips_onebox.rb b/lib/onebox/engine/twitch_clips_onebox.rb
index 343c0fd..62b90e6 100644
--- a/lib/onebox/engine/twitch_clips_onebox.rb
+++ b/lib/onebox/engine/twitch_clips_onebox.rb
@@ -9,6 +9,8 @@ class Onebox::Engine::TwitchClipsOnebox
   end
   include Onebox::Mixins::TwitchOnebox
 
+  requires_iframe_origins "https://clips.twitch.tv"
+
   def query_params
     "clip=#{twitch_id}"
   end
diff --git a/lib/onebox/engine/typeform_onebox.rb b/lib/onebox/engine/typeform_onebox.rb
index da53a84..fe2f4b6 100644
--- a/lib/onebox/engine/typeform_onebox.rb
+++ b/lib/onebox/engine/typeform_onebox.rb
@@ -6,6 +6,7 @@ module Onebox
       include Engine
 
       matches_regexp(/^https?:\/\/[a-z0-9\-_]+\.typeform\.com\/to\/[a-zA-Z0-9]+/)
+      requires_iframe_origins "https://*.typeform.com"

[... diff too long, it was truncated ...]

GitHub sha: d0eed779

This commit appears in #434 which was approved by ZogStriP. It was merged by davidtaylorhq.