DEV: Attempt to fetch/parse YouTube ‘embed’ responses (#464)

DEV: Attempt to fetch/parse YouTube ‘embed’ responses (#464)

  • DEV: Attempt to fetch/parse YouTube ‘embed’ responses

YouTube embed responses appear to give us enough data to construct a Onebox, without requiring EU-based requests to consent to various T&Cs before we can get the result.

Note: This requires a change in the lazy-yt plugin in Discourse core for the solution to fully work within the Discourse app.

  • Apply suggestions from code review

Co-authored-by: Jarek Radosz <jradosz@gmail.com>

diff --git a/lib/onebox/engine/youtube_onebox.rb b/lib/onebox/engine/youtube_onebox.rb
index a02c688..2de3f00 100644
--- a/lib/onebox/engine/youtube_onebox.rb
+++ b/lib/onebox/engine/youtube_onebox.rb
@@ -13,11 +13,36 @@ module Onebox
       WIDTH  ||= 480
       HEIGHT ||= 360
 
-      def placeholder_html
-        og = get_opengraph.data
+      def parse_embed_response
+        return unless video_id
+        return @parse_embed_response if defined?(@parse_embed_response)
+
+        embed_url = "https://www.youtube.com/embed/#{video_id}"
+        @embed_doc ||= Onebox::Helpers.fetch_html_doc(embed_url)
+
+        begin
+          script_tag = @embed_doc.xpath('//script').find { |tag| tag.to_s.include?('ytcfg.set') }.to_s
+          match = script_tag.to_s.match(/ytcfg\.set\((?<json>.*)\)/)
+
+          yt_json = ::JSON.parse(match[:json])
+          renderer = ::JSON.parse(yt_json['PLAYER_VARS']['embedded_player_response'])['embedPreview']['thumbnailPreviewRenderer']
+
+          title = renderer['title']['runs'].first['text']
+
+          image = "https://img.youtube.com/vi/#{video_id}/hqdefault.jpg"
+        rescue
+          return
+        end
+
+        @parse_embed_response = { image: image, title: title }
+      end
 
+      def placeholder_html
         if video_id || list_id
-          "<img src='#{og[:image]}' width='#{WIDTH}' height='#{HEIGHT}' title='#{og[:title]}'>"
+          result = parse_embed_response
+          result ||= get_opengraph.data
+
+          "<img src='#{result[:image]}' width='#{WIDTH}' height='#{HEIGHT}' title='#{result[:title]}'>"
         else
           to_html
         end
@@ -52,7 +77,10 @@ module Onebox
       end
 
       def video_title
-        @video_title ||= get_opengraph.data[:title]
+        @video_title ||= begin
+          result = parse_embed_response || get_opengraph.data
+          result[:title]
+        end
       end
 
       private
diff --git a/lib/onebox/helpers.rb b/lib/onebox/helpers.rb
index 10db24c..1746106 100644
--- a/lib/onebox/helpers.rb
+++ b/lib/onebox/helpers.rb
@@ -7,7 +7,7 @@ module Onebox
 
     class DownloadTooLarge < StandardError; end
 
-    IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com']
+    IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com', 'youtube.com']
 
     def self.symbolize_keys(hash)
       return {} if hash.nil?
diff --git a/spec/fixtures/youtube-embed.response b/spec/fixtures/youtube-embed.response
new file mode 100644
index 0000000..7395472
--- /dev/null
+++ b/spec/fixtures/youtube-embed.response
@@ -0,0 +1,7 @@
+<!DOCTYPE html><html lang="en-GB" dir="ltr" data-cast-api-enabled="true"><head><meta name="viewport" content="width=device-width, initial-scale=1"><style name="www-roboto" nonce="WzCdUe7QzlbKDHiDk4xX0w">@font-face{font-family:'Roboto';font-style:normal;font-weight:400;src:url(//fonts.gstatic.com/s/roboto/v18/KFOmCnqEu92Fr1Mu4mxP.ttf)format('truetype');}</style><script name="www-roboto" nonce="WzCdUe7QzlbKDHiDk4xX0w">if (document.fonts && document.fonts.load) {document.fonts.load("400 10pt Roboto", "E"); document.fonts.load("500 10pt Roboto", "E");}</script><link rel="stylesheet" href="/s/player/1c20fac3/www-player.css" name="www-player" nonce="WzCdUe7QzlbKDHiDk4xX0w"><style nonce="WzCdUe7QzlbKDHiDk4xX0w">html {overflow: hidden;}body {font: 12px Roboto, Arial, sans-serif; background-color: #000; color: #fff; height: 100%; width: 100%; overflow: hidden; position: absolute; margin: 0; padding: 0;}#player {width: 100%; height: 100%;}h1 {text-align: center; color: #fff;}h3 {margin-top: 6px; margin-bottom: 3px;}.player-unavailable {position: absolute; top: 0; left: 0; right: 0; bottom: 0; padding: 25px; font-size: 13px; background: url(/img/meh7.png) 50% 65% no-repeat;}.player-unavailable .message {text-align: left; margin: 0 -5px 15px; padding: 0 5px 14px; border-bottom: 1px solid #888; font-size: 19px; font-weight: normal;}.player-unavailable a {color: #167ac6; text-decoration: none;}</style><script nonce="WzCdUe7QzlbKDHiDk4xX0w">var ytcsi={gt:function(n){n=(n||"")+"data_";return ytcsi[n]||(ytcsi[n]={tick:{},info:{}})},now:window.performance&&window.performance.timing&&window.performance.now&&window.performance.timing.navigationStart?function(){return window.performance.timing.navigationStart+window.performance.now()}:function(){return(new Date).getTime()},tick:function(l,t,n){var ticks=ytcsi.gt(n).tick;var v=t||ytcsi.now();if(ticks[l]){ticks["_"+l]=ticks["_"+l]||[ticks[l]];ticks["_"+l].push(v)}ticks[l]=v},info:function(k,
+v,n){ytcsi.gt(n).info[k]=v},setStart:function(s,t,n){ytcsi.info("yt_sts",s,n);ytcsi.tick("_start",t,n)}};
+(function(w,d){ytcsi.setStart("dhs",w.performance?w.performance.timing.responseStart:null);var isPrerender=(d.visibilityState||d.webkitVisibilityState)=="prerender";var vName=!d.visibilityState&&d.webkitVisibilityState?"webkitvisibilitychange":"visibilitychange";if(isPrerender){ytcsi.info("prerender",1);var startTick=function(){ytcsi.setStart("dhs");d.removeEventListener(vName,startTick)};d.addEventListener(vName,startTick,false)}if(d.addEventListener)d.addEventListener(vName,function(){ytcsi.tick("vc")},
+false);function isGecko(){if(!w.navigator)return false;try{if(w.navigator.userAgentData&&w.navigator.userAgentData.brands&&w.navigator.userAgentData.brands.length){var brands=w.navigator.userAgentData.brands;for(var i=0;i<brands.length;i++)if(brands[i]&&brands[i].brand==="Firefox")return true;return false}}catch(e){setTimeout(function(){throw e;})}if(!w.navigator.userAgent)return false;var ua=w.navigator.userAgent;return ua.indexOf("Gecko")>0&&ua.toLowerCase().indexOf("webkit")<0&&ua.indexOf("Edge")<
+0&&ua.indexOf("Trident")<0&&ua.indexOf("MSIE")<0}if(isGecko()){var isHidden=(d.visibilityState||d.webkitVisibilityState)=="hidden";if(isHidden)ytcsi.tick("vc")}var slt=function(el,t){setTimeout(function(){var n=ytcsi.now();el.loadTime=n;if(el.slt)el.slt()},t)};w.__ytRIL=function(el){if(!el.getAttribute("data-thumb"))if(w.requestAnimationFrame)w.requestAnimationFrame(function(){slt(el,0)});else slt(el,16)}})(window,document);
+</script><script nonce="WzCdUe7QzlbKDHiDk4xX0w">var ytcfg={d:function(){return window.yt&&yt.config_||ytcfg.data_||(ytcfg.data_={})},get:function(k,o){return k in ytcfg.d()?ytcfg.d()[k]:o},set:function(){var a=arguments;if(a.length>1)ytcfg.d()[a[0]]=a[1];else for(var k in a[0])ytcfg.d()[k]=a[0][k]}};

[... diff too long, it was truncated ...]

GitHub sha: 4d669d2b

1 Like

This commit appears in #464 which was approved by CvX. It was merged by jbrw.