FIX: improve Vanilla importing (#10478)

FIX: improve Vanilla importing (#10478)

  • ensure emails don’t have spaces
  • import banned users as suspended for 1,000 years
  • upgrade users to TL2 if they have comments
  • topic: import views, closed and pinned info
  • import messages
  • encode Vanilla usernames for permalinks, because Vanilla usernames can contain spaces and special characters
  • parse Vanilla’s new rich body format
diff --git a/script/import_scripts/vanilla_body_parser.rb b/script/import_scripts/vanilla_body_parser.rb
new file mode 100644
index 0000000..4a4e7f1
--- /dev/null
+++ b/script/import_scripts/vanilla_body_parser.rb
@@ -0,0 +1,211 @@
+# frozen_string_literal: true
+
+class VanillaBodyParser
+  def self.configure(lookup:, uploader:, host:, uploads_path:)
+    @@lookup = lookup
+    @@uploader = uploader
+    @@host = host
+    @@uploads_path = uploads_path
+  end
+
+  def initialize(row, user_id)
+    @row = row
+    @user_id = user_id
+  end
+
+  def parse
+    return clean_up(@row['Body']) unless rich?
+
+    full_text = json.each_with_index.map(&method(:parse_fragment)).join('')
+
+    normalize full_text
+  end
+
+  private
+
+  def clean_up(text)
+    text.gsub(/<\/?font[^>]*>/, '').gsub(/<\/?span[^>]*>/, '').gsub(/<\/?div[^>]*>/, '').gsub(/^ +/, '').gsub(/ +/, ' ')
+  end
+
+  def rich?
+    @row['Format'] == 'Rich'
+  end
+
+  def json
+    return nil unless rich?
+    @json ||= JSON.parse(@row['Body']).map(&:deep_symbolize_keys)
+  end
+
+  def parse_fragment(fragment, index)
+    text = fragment.keys.one? && fragment[:insert].is_a?(String) ? fragment[:insert] : rich_parse(fragment)
+
+    text = parse_code(text, fragment, index)
+    text = parse_list(text, fragment, index)
+
+    text
+  end
+
+  def rich_parse(fragment)
+    insert = fragment[:insert]
+
+    return parse_mention(insert[:mention]) if insert.respond_to?(:dig) && insert.dig(:mention, :userID)
+
+    return parse_formatting(fragment) if fragment[:attributes]
+
+    embed_type = insert.dig(:'embed-external', :data, :embedType)
+
+    quoting = embed_type == 'quote'
+    return parse_quote(insert) if quoting
+
+    embed = embed_type.in? ['image', 'link', 'file']
+    parse_embed(insert) if embed
+  end
+
+  def parse_mention(mention)
+    user = user_from_imported_id(mention[:userID])
+    username = user&.username || mention[:name]
+    "@#{username}"
+  end
+
+  def user_from_imported_id(imported_id)
+    user_id = @@lookup.user_id_from_imported_user_id(imported_id)
+    User.find(user_id) if user_id
+  end
+
+  def parse_formatting(fragment)
+    insert = fragment[:insert]
+    attributes = fragment[:attributes]
+
+    text = fragment[:insert]
+
+    text = "<a href=\"#{attributes[:link]}\">#{text}</a>" if attributes[:link]
+    text = "<i>#{text}</i>" if attributes[:italic]
+    text = "<b>#{text}</b>" if attributes[:bold]
+
+    text
+  end
+
+  # In the Quill format used by Vanilla Forums, a line is rendered as `code`
+  # when it's followed by a fragment with attributes: {'code-block': true}.
+  # So we open our `‍`` block when the next fragment has a 'code-block'
+  # attribute and the previous one didn't and we close the `‍`` block when
+  # the second next fragment does not contain the 'code-block' attribute
+  def parse_code(text, fragment, index)
+    next_fragment = next_fragment(index)
+
+    next_code = next_fragment.dig(:attributes, :'code-block')
+    if next_code
+      previous_fragment = previous_fragment(index)
+      previous_code = previous_fragment.dig(:attributes, :'code-block')
+
+      # if next is code and previous is not, prepend `‍``
+      text = "\n`‍``#{text}" unless previous_code
+    end
+
+    current_code = fragment.dig(:attributes, :'code-block')
+
+    if current_code
+      second_next_fragment = second_next_fragment(index)
+      second_next_code = second_next_fragment.dig(:attributes, :'code-block')
+
+      # if current is code and 2 after is not, prepend `‍``
+      text = "\n`‍``#{text}" unless second_next_code
+    end
+
+    text
+  end
+
+  def parse_list(text, fragment, index)
+    next_fragment = next_fragment(index)
+
+    next_list = next_fragment.dig(:attributes, :list, :type)
+    if next_list
+      # if next is list, prepend <li>
+      text = '<li>' + text
+
+      previous_fragment = previous_fragment(index)
+      previous_list = previous_fragment.dig(:attributes, :list, :type)
+
+      # if next is list and previous is not, prepend <ol> or <ul>
+      list_tag = next_list == 'ordered' ? '<ol>' : '<ul>'
+      text = "\n#{list_tag}\n#{text}" unless previous_list
+    end
+
+    current_list = fragment.dig(:attributes, :list, :type)
+
+    if current_list
+      # if current is list prepend </li>
+      tag_closings = '</li>'
+
+      second_next_fragment = second_next_fragment(index)
+      second_next_list = second_next_fragment.dig(:attributes, :list, :type)
+
+      # if current is list and 2 after is not, prepend </ol>
+      list_tag = current_list == 'ordered' ? '</ol>' : '</ul>'
+      tag_closings = "#{tag_closings}\n#{list_tag}" unless second_next_list
+
+      text = tag_closings + text
+    end
+
+    text
+  end
+
+  def next_fragment(index)
+    json[index + 1] || {}
+  end
+
+  def previous_fragment(index)
+    json[index - 1] || {}
+  end
+
+  def second_next_fragment(index)
+    json[index + 2] || {}
+  end
+
+  def parse_quote(insert)
+    embed = insert.dig(:'embed-external', :data)
+
+    import_post_id = "#{embed[:recordType]}##{embed[:recordID]}"
+    topic = @@lookup.topic_lookup_from_imported_post_id(import_post_id)
+    user = user_from_imported_id(embed.dig(:insertUser, :userID))
+
+    quote_info = topic && user ? "=\"#{user.username}, post: #{topic[:post_number]}, topic: #{topic[:topic_id]}\"" : ''
+
+    "[quote#{quote_info}]\n#{embed[:body]}\n[/quote]\n\n"""
+  end
+
+  def parse_embed(insert)
+    embed = insert.dig(:'embed-external', :data)
+
+    url = embed[:url]
+
+    if /https?\:\/\/#{@@host}\/uploads\/.*/.match?(url)
+      remote_path = url.scan(/uploads\/(.*)/)
+      path = File.join(@@uploads_path, remote_path)
+
+      upload = @@uploader.create_upload(@user_id, path, embed[:name])
+
+      if upload&.persisted?
+        return "\n" + @@uploader.html_for_upload(upload, embed[:name]) + "\n"
+      else
+        puts "Failed to upload #{path}"
+        puts upload.errors.full_messages.join(', ') if upload
+      end
+    end
+
+    "\n[#{embed[:name]}](#{url})\n"
+  end
+
+  def normalize(full_text)
+    code_matcher = /`‍``(.*\n)+`‍``/
+    code_block = full_text[code_matcher]
+    full_text[code_matcher] = '{{{CODE_BLOCK}}}' if code_block
+    full_text = double_new_lines(full_text)
+    full_text['{{{CODE_BLOCK}}}'] = code_block if code_block
+    full_text
+  end
+
+  def double_new_lines(text)
+    text.split("\n").map(&:strip).map(&:presence).compact.join("\n\n")
+  end
+end
diff --git a/script/import_scripts/vanilla_mysql.rb b/script/import_scripts/vanilla_mysql.rb
index 10c5bdc..f2b4e6e 100644
--- a/script/import_scripts/vanilla_mysql.rb
+++ b/script/import_scripts/vanilla_mysql.rb
@@ -3,6 +3,7 @@
 require "mysql2"
 require File.expand_path(File.dirname(__FILE__) + "/base.rb")
 require 'htmlentities'
+require_relative 'vanilla_body_parser'
 
 class ImportScripts::VanillaSQL < ImportScripts::Base
 
@@ -22,6 +23,13 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
       database: VANILLA_DB
     )
 
+    VanillaBodyParser.configure(
+      lookup: @lookup,
+      uploader: @uploader,
+      host: 'vanilla.yourforum.com', # your Vanilla forum domain
+      uploads_path: 'uploads' # relative path to your vanilla uploads folder
+    )
+
     @import_tags = false
     begin
       r = @client.query("select count(*) count from #{TABLE_PREFIX}Tag where countdiscussions > 0")
@@ -42,6 +50,7 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
     import_categories
     import_topics
     import_posts
+    import_messages
 
     update_tl0
 
@@ -59,8 +68,8 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
 
     batches(BATCH_SIZE) do |offset|
       results = mysql_query(
-        "SELECT UserID, Name, Title, Location, About, Email,
-                DateInserted, DateLastActive, InsertIPAddress, Admin
+        "SELECT UserID, Name, Title, Location, About, Email, Admin, Banned, CountComments,
+                DateInserted, DateLastActive, InsertIPAddress

[... diff too long, it was truncated ...]

GitHub sha: 812e0d6b

This commit appears in #10478 which was approved by nlalonde. It was merged by nlalonde.