Improve Vanilla import script. (#11701)

Improve Vanilla import script. (#11701)

  • import groups and group users
  • import uploads/attachments
  • improved code tag parsing
  • improved text formatting
  • mark topics as solved
diff --git a/script/import_scripts/vanilla_body_parser.rb b/script/import_scripts/vanilla_body_parser.rb
index af9d2c4..ba4608e 100644
--- a/script/import_scripts/vanilla_body_parser.rb
+++ b/script/import_scripts/vanilla_body_parser.rb
@@ -24,6 +24,17 @@ class VanillaBodyParser
   private
 
   def clean_up(text)
+    # <pre class="CodeBlock">...</pre>
+    text = text.gsub(/\<pre class="CodeBlock"\>(.*?)\<\/pre\>/im) { "\n`‍``\n#{$1}\n`‍``\n" }
+    # <pre>...</pre>
+    text = text.gsub(/\<pre\>(.*?)\<\/pre\>/im) { "\n`‍``\n#{$1}\n`‍``\n" }
+    # <code></code>
+    text = text.gsub("\<code\>\</code\>", "").gsub(/\<code\>(.*?)\<\/code\>/im) { "#{$1}" }
+    # <div class="Quote">...</div>
+    text = text.gsub(/\<div class="Quote"\>(.*?)\<\/div\>/im) { "\n[quote]\n#{$1}\n[/quote]\n" }
+    # [code], [quote]
+    text = text.gsub(/\[\/?code\]/i, "\n`‍``\n").gsub(/\[quote.*?\]/i, "\n" + '\0' + "\n").gsub(/\[\/quote\]/i, "\n" + '\0' + "\n")
+
     text.gsub(/<\/?font[^>]*>/, '').gsub(/<\/?span[^>]*>/, '').gsub(/<\/?div[^>]*>/, '').gsub(/^ +/, '').gsub(/ +/, ' ')
   end
 
@@ -58,7 +69,7 @@ class VanillaBodyParser
     return parse_quote(insert) if quoting
 
     embed = embed_type.in? ['image', 'link', 'file']
-    parse_embed(insert) if embed
+    parse_embed(insert, embed_type) if embed
   end
 
   def parse_mention(mention)
@@ -87,9 +98,6 @@ class VanillaBodyParser
 
   # In the Quill format used by Vanilla Forums, a line is rendered as `code`
   # when it's followed by a fragment with attributes: {'code-block': true}.
-  # So we open our `‍`` block when the next fragment has a 'code-block'
-  # attribute and the previous one didn't and we close the `‍`` block when
-  # the second next fragment does not contain the 'code-block' attribute
   def parse_code(text, fragment, index)
     next_fragment = next_fragment(index)
 
@@ -98,18 +106,27 @@ class VanillaBodyParser
       previous_fragment = previous_fragment(index)
       previous_code = previous_fragment.dig(:attributes, :'code-block')
 
-      # if next is code and previous is not, prepend `‍``
-      text = "\n`‍``#{text}" unless previous_code
+      if previous_code
+        text = text.gsub(/\\n(.*?)\\n/) { "\n`‍``\n#{$1}\n`‍``\n" }
+      else
+        last_pos = text.rindex(/\n/)
+
+        if last_pos
+          array = [text[0..last_pos].strip, text[last_pos + 1 .. text.length].strip]
+          text = array.join("\n`‍``\n")
+        else
+          text = "\n`‍``\n#{text}"
+        end
+      end
     end
 
     current_code = fragment.dig(:attributes, :'code-block')
-
     if current_code
       second_next_fragment = second_next_fragment(index)
       second_next_code = second_next_fragment.dig(:attributes, :'code-block')
 
       # if current is code and 2 after is not, prepend `‍``
-      text = "\n`‍``#{text}" unless second_next_code
+      text = "\n`‍``\n#{text}" unless second_next_code
     end
 
     text
@@ -174,7 +191,7 @@ class VanillaBodyParser
     "[quote#{quote_info}]\n#{embed[:body]}\n[/quote]\n\n"""
   end
 
-  def parse_embed(insert)
+  def parse_embed(insert, embed_type)
     embed = insert.dig(:'embed-external', :data)
 
     url = embed[:url]
@@ -193,7 +210,13 @@ class VanillaBodyParser
       end
     end
 
-    "\n[#{embed[:name]}](#{url})\n"
+    if embed_type == "link"
+      "\n[#{embed[:name]}](#{url})\n"
+    elsif embed_type == "image"
+      "\n<img src=\"#{url}\" alt=\"#{embed[:name]}\">\n"
+    else
+      "\n<a href=\"#{url}\">#{embed[:name]}</a>\n"
+    end
   end
 
   def normalize(full_text)
diff --git a/script/import_scripts/vanilla_mysql.rb b/script/import_scripts/vanilla_mysql.rb
index f2b4e6e..1a290ec 100644
--- a/script/import_scripts/vanilla_mysql.rb
+++ b/script/import_scripts/vanilla_mysql.rb
@@ -45,16 +45,37 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
       SiteSetting.max_tags_per_topic = 10
     end
 
+    import_groups
     import_users
     import_avatars
+    import_group_users
     import_categories
     import_topics
     import_posts
     import_messages
 
     update_tl0
+    mark_topics_as_solved
 
     create_permalinks
+    import_attachments
+  end
+
+  def import_groups
+    puts "", "importing groups..."
+
+    groups = mysql_query <<-SQL
+        SELECT RoleID, Name
+          FROM #{TABLE_PREFIX}Role
+      ORDER BY RoleID
+    SQL
+
+    create_groups(groups) do |group|
+      {
+        id: group["RoleID"],
+        name: @htmlentities.decode(group["Name"]).strip
+      }
+    end
   end
 
   def import_users
@@ -147,7 +168,7 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
 
         photo_real_filename = nil
         parts = photo.squeeze("/").split("/")
-        if parts[0] == "cf:"
+        if parts[0] =~ /^[a-z0-9]{2}:/
           photo_path = "#{ATTACHMENTS_BASE_DIR}/#{parts[2..-2].join('/')}".squeeze("/")
         elsif parts[0] == "~cf"
           photo_path = "#{ATTACHMENTS_BASE_DIR}/#{parts[1..-2].join('/')}".squeeze("/")
@@ -200,6 +221,24 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
     nil
   end
 
+  def import_group_users
+    puts "", "importing group users..."
+
+    group_users = mysql_query("
+      SELECT RoleID, UserID
+        FROM #{TABLE_PREFIX}UserRole
+    ").to_a
+
+    group_users.each do |row|
+      user_id = user_id_from_imported_user_id(row["UserID"])
+      group_id = group_id_from_imported_group_id(row["RoleID"])
+
+      if user_id && group_id
+        GroupUser.find_or_create_by(user_id: user_id, group_id: group_id)
+      end
+    end
+  end
+
   def import_categories
     puts "", "importing categories..."
 
@@ -272,7 +311,7 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
     batches(BATCH_SIZE) do |offset|
       comments = mysql_query(
         "SELECT CommentID, DiscussionID, Body, Format,
-                DateInserted, InsertUserID
+                DateInserted, InsertUserID, QnA
          FROM #{TABLE_PREFIX}Comment
          WHERE CommentID > #{@last_post_id}
          ORDER BY CommentID ASC
@@ -286,13 +325,20 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
         next unless t = topic_lookup_from_imported_post_id("discussion#" + comment['DiscussionID'].to_s)
         next if comment['Body'].blank?
         user_id = user_id_from_imported_user_id(comment['InsertUserID']) || Discourse::SYSTEM_USER_ID
-        {
+
+        mapped = {
           id: "comment#" + comment['CommentID'].to_s,
           user_id: user_id,
           topic_id: t[:topic_id],
           raw: VanillaBodyParser.new(comment, user_id).parse,
           created_at: Time.zone.at(comment['DateInserted'])
         }
+
+        if comment['QnA'] == "Accepted"
+          mapped[:custom_fields] = { is_accepted_answer: "true" }
+        end
+
+        mapped
       end
     end
   end
@@ -395,6 +441,104 @@ class ImportScripts::VanillaSQL < ImportScripts::Base
     end
   end
 
+  def import_attachments
+    if ATTACHMENTS_BASE_DIR && File.exists?(ATTACHMENTS_BASE_DIR)
+      puts "", "importing attachments"
+
+      start = Time.now
+      count = 0
+
+      # https://us.v-cdn.net/1234567/uploads/editor/xyz/image.jpg
+      cdn_regex = /https:\/\/us.v-cdn.net\/1234567\/uploads\/(\S+\/(\w|-)+.\w+)/i
+      # [attachment=10109:Screen Shot 2012-04-01 at 3.47.35 AM.png]
+      attachment_regex = /\[attachment=(\d+):(.*?)\]/i
+
+      Post.where("raw LIKE '%/us.v-cdn.net/%' OR raw LIKE '%[attachment%'").find_each do |post|
+        count += 1
+        print "\r%7d - %6d/sec" % [count, count.to_f / (Time.now - start)]
+        new_raw = post.raw.dup
+
+        new_raw.gsub!(attachment_regex) do |s|
+          matches = attachment_regex.match(s)
+          attachment_id = matches[1]
+          file_name = matches[2]
+          next unless attachment_id
+
+          r = mysql_query("SELECT Path, Name FROM #{TABLE_PREFIX}Media WHERE MediaID = #{attachment_id};").first
+          next if r.nil?
+          path = r["Path"]
+          name = r["Name"]
+          next unless path.present?
+

[... diff too long, it was truncated ...]

GitHub sha: bd7cbcd8

This commit appears in #11701, which was approved by eviltrout. It was merged by techAPJ.