PERF: avoid race conditions when creating topic links

PERF: avoid race conditions when creating topic links

Previously the code was very race condition prone leading to odd failures in production

It was re-written in raw SQL to avoid conditions where rows conflict on inserts

There is no clean way in ActiveRecord to do:

Insert, on conflict do nothing and return existing id.

This also increases test coverage, we were previously not testing the code responsible for crawling external sites directly

diff --git a/app/models/topic_link.rb b/app/models/topic_link.rb
index afc6fae..3f992d9 100644
--- a/app/models/topic_link.rb
+++ b/app/models/topic_link.rb
@@ -140,9 +140,12 @@ class TopicLink < ActiveRecord::Base
 
   end
 
-  # Crawl a link's title after it's saved
+  def self.crawl_link_title(topic_link_id)
+    Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id)
+  end
+
   def crawl_link_title
-    Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
+    TopicLink.crawl_link_title(id)
   end
 
   def self.duplicate_lookup(topic)
@@ -167,6 +170,97 @@ class TopicLink < ActiveRecord::Base
 
   private
 
+  # This pattern is used to create topic links very efficiently with minimal
+  # errors under heavy concurrent use
+  #
+  # It avoids a SELECT to find out if the record is there and minimizes all
+  # the work it needs to do in case a record is missing
+  #
+  # It handles calling the required callback and has parity with Rails implementation
+  #
+  # Usually we would rely on ActiveRecord but in this case we have had lots of churn
+  # around creation of topic links leading to hard to debug log messages in production
+  #
+  def self.safe_create_topic_link(
+    post_id:,
+    user_id:,
+    topic_id:,
+    url:,
+    domain: nil,
+    internal: false,
+    link_topic_id: nil,
+    link_post_id: nil,
+    quote: false,
+    extension: nil,
+    reflection: false
+  )
+
+    domain ||= Discourse.current_hostname
+
+    sql = <<~SQL
+      WITH new_row AS(
+        INSERT INTO topic_links(
+          post_id,
+          user_id,
+          topic_id,
+          url,
+          domain,
+          internal,
+          link_topic_id,
+          link_post_id,
+          quote,
+          extension,
+          reflection,
+          created_at,
+          updated_at
+        ) VALUES (
+          :post_id,
+          :user_id,
+          :topic_id,
+          :url,
+          :domain,
+          :internal,
+          :link_topic_id,
+          :link_post_id,
+          :quote,
+          :extension,
+          :reflection,
+          :now,
+          :now
+        )
+        ON CONFLICT DO NOTHING
+        RETURNING id
+      )
+      SELECT COALESCE(
+        (SELECT id FROM new_row),
+        (SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url)
+      ), (SELECT id FROM new_row) IS NOT NULL
+    SQL
+
+    topic_link_id, new_record = DB.query_single(sql,
+      post_id: post_id,
+      user_id: user_id,
+      topic_id: topic_id,
+      url: url,
+      domain: domain,
+      internal: internal,
+      link_topic_id: link_topic_id,
+      link_post_id: link_post_id,
+      quote: quote,
+      extension: extension,
+      reflection: reflection,
+      now: Time.now
+    )
+
+    if new_record
+      DB.after_commit do
+        crawl_link_title(topic_link_id)
+      end
+    end
+
+    topic_link_id
+  end
+
   def self.ensure_entry_for(post, link, parsed)
     url = link.url
     internal = false
@@ -210,25 +304,20 @@ class TopicLink < ActiveRecord::Base
     url = url[0...TopicLink.max_url_length]
     return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length
 
-    unless TopicLink.exists?(topic_id: post.topic_id, post_id: post.id, url: url)
-      file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty?
-      begin
-        TopicLink.create(
-          post_id: post.id,
-          user_id: post.user_id,
-          topic_id: post.topic_id,
-          url: url,
-          domain: parsed.host || Discourse.current_hostname,
-          internal: internal,
-          link_topic_id: topic&.id,
-          link_post_id: reflected_post.try(:id),
-          quote: link.is_quote,
-          extension: file_extension
-        )
-      rescue ActiveRecord::RecordNotUnique
-        # it's fine
-      end
-    end
+    file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty?
+
+    safe_create_topic_link(
+      post_id: post.id,
+      user_id: post.user_id,
+      topic_id: post.topic_id,
+      url: url,
+      domain: parsed.host,
+      internal: internal,
+      link_topic_id: topic&.id,
+      link_post_id: reflected_post.try(:id),
+      quote: link.is_quote,
+      extension: file_extension,
+    )
 
     reflected_id = nil
 
@@ -236,24 +325,19 @@ class TopicLink < ActiveRecord::Base
     if topic && post.topic && topic.archetype != 'private_message' && post.topic.archetype != 'private_message' && post.topic.visible?
       prefix = Discourse.base_url_no_prefix
       reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}"
-      tl = TopicLink.find_by(topic_id: topic&.id,
-                             post_id: reflected_post&.id,
-                             url: reflected_url)
-
-      unless tl
-        tl = TopicLink.create(user_id: post.user_id,
-                              topic_id: topic&.id,
-                              post_id: reflected_post&.id,
-                              url: reflected_url,
-                              domain: Discourse.current_hostname,
-                              reflection: true,
-                              internal: true,
-                              link_topic_id: post.topic_id,
-                              link_post_id: post.id)
 
-      end
+      reflected_id = safe_create_topic_link(
+        user_id: post.user_id,
+        topic_id: topic&.id,
+        post_id: reflected_post&.id,
+        url: reflected_url,
+        domain: Discourse.current_hostname,
+        reflection: true,
+        internal: true,
+        link_topic_id: post.topic_id,
+        link_post_id: post.id
+      )
 
-      reflected_id = tl.id if tl.persisted?
     end
 
     [url, reflected_id]
diff --git a/spec/models/topic_link_spec.rb b/spec/models/topic_link_spec.rb
index 2f8a0be..c1a1171 100644
--- a/spec/models/topic_link_spec.rb
+++ b/spec/models/topic_link_spec.rb
@@ -28,31 +28,49 @@ describe TopicLink do
   end
 
   describe 'external links' do
-    fab!(:post2) do
-      Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
+    it 'correctly handles links' do
+
+      non_png = "https://b.com/#{SecureRandom.hex}"
+
+      # prepare a title for one of the links
+      stub_request(:get, non_png).
+        with(headers: {
+          'Accept' => '*/*',
+          'Accept-Encoding' => 'gzip',
+          'Host' => 'b.com',
+        }).
+        to_return(status: 200, body: "<html><head><title>amazing</title></head></html>", headers: {})
+
+      # so we run crawl_topic_links
+      Jobs.run_immediately!
+
+      png_title = "#{SecureRandom.hex}.png"
+      png = "https://awesome.com/#{png_title}"
+
+      post = Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
         http://a.com/
-        https://b.com/b
+        #{non_png}
         http://#{'a' * 200}.com/invalid
         //b.com/#{'a' * 500}
+        #{png}
       RAW
-    end
 
-    before do
-      TopicLink.extract_from(post2)
-    end
+      TopicLink.extract_from(post)
+
+      # we have a special rule for images title where we pull them out of the filename
+      expect(topic.topic_links.where(url: png).pluck(:title).first).to eq(png_title)
+      expect(topic.topic_links.where(url: non_png).pluck(:title).first).to eq("amazing")
 
-    it 'works' do
       expect(topic.topic_links.pluck(:url)).to contain_exactly(
+        png,
+        non_png,
         "http://a.com/",
-        "https://b.com/b",
         "//b.com/#{'a' * 500}"[0...TopicLink.max_url_length]
       )
-    end
 
-    it "doesn't reset them when rebaking" do
       old_ids = topic.topic_links.pluck(:id)
 
-      TopicLink.extract_from(post2)
+      TopicLink.extract_from(post)
 
       new_ids = topic.topic_links.pluck(:id)
 
@@ -107,15 +125,17 @@ describe TopicLink do
         # this is subtle, but we had a bug were second time

[... diff too long, it was truncated ...]

GitHub sha: 7f841dc2