FIX: improvements for vanilla bulk import (#10212)

FIX: improvements for vanilla bulk import (#10212)

Adjustments to the base:

  1. PG connection doesn’t require host - it was broken on import droplet
  2. Drop topic_reply_count - it was removed here - https://github.com/discourse/discourse/blob/master/db/post_migrate/20200513185052_drop_topic_reply_count.rb
  3. Fix error reporting crash: backtrace.join("\n") -> e.backtrace.join("\n")
  4. Correctly link the user and avatar to the quote block

Adjustments to vanilla:

  1. Top-level Vanilla categories are valid categories
  2. Posts have format column which should be used to decide if the format is HTML or Markdown
  3. Remove non-UTF-8 characters
  4. Remove unsupported HTML elements such as font, span, sub, u
diff --git a/script/bulk_import/base.rb b/script/bulk_import/base.rb
index 1253c2d..0d7ec65 100644
--- a/script/bulk_import/base.rb
+++ b/script/bulk_import/base.rb
@@ -76,7 +76,7 @@ class BulkImport::Base
     charset = ENV["DB_CHARSET"] || "utf8"
     db = ActiveRecord::Base.connection_config
     @encoder = PG::TextEncoder::CopyRow.new
-    @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
+    @raw_connection = PG.connect(dbname: db[:database], port: db[:port])
     @uploader = ImportScripts::Uploader.new
     @html_entities = HTMLEntities.new
     @encoding = CHARSET_MAP[charset]
@@ -283,7 +283,7 @@ class BulkImport::Base
 
   USER_STAT_COLUMNS ||= %i{
     user_id topics_entered time_read days_visited posts_read_count
-    likes_given likes_received topic_reply_count new_since read_faq
+    likes_given likes_received new_since read_faq
     first_post_created_at post_count topic_count bounce_score
     reset_bounce_score_after
   }
@@ -441,14 +441,12 @@ class BulkImport::Base
 
   def process_user_stat(user_stat)
     user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i]
-    user_stat[:topic_reply_count] = user_stat[:post_count] - user_stat[:topic_count]
     user_stat[:topics_entered] ||= 0
     user_stat[:time_read] ||= 0
     user_stat[:days_visited] ||= 0
     user_stat[:posts_read_count] ||= 0
     user_stat[:likes_given] ||= 0
     user_stat[:likes_received] ||= 0
-    user_stat[:topic_reply_count] ||= 0
     user_stat[:new_since] ||= NOW
     user_stat[:post_count] ||= 0
     user_stat[:topic_count] ||= 0
@@ -546,7 +544,8 @@ class BulkImport::Base
     topic_tag
   end
 
-  def process_raw(raw)
+  def process_raw(original_raw)
+    raw = original_raw.dup
     # fix whitespaces
     raw.gsub!(/(\\r)?\\n/, "\n")
     raw.gsub!("\\t", "\t")
@@ -699,7 +698,7 @@ class BulkImport::Base
         rescue => e
           puts "\n"
           puts "ERROR: #{e.message}"
-          puts backtrace.join("\n")
+          puts e.backtrace.join("\n")
         end
       end
     end
@@ -782,17 +781,25 @@ class BulkImport::Base
       quote.gsub!(/^(<br>\n?)+/, "")
       quote.gsub!(/(<br>\n?)+$/, "")
 
+      user = User.find_by(username: username)
+
       if post_id.present? && topic_id.present?
         <<-HTML
           <aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}">
-            <div class="title">#{username}:</div>
+            <div class="title">
+              <div class="quote-controls"></div>
+              #{user ? user_avatar(user) : username}:
+            </div>
             <blockquote>#{quote}</blockquote>
           </aside>
         HTML
       else
         <<-HTML
-          <aside class="quote">
-            <div class="title">#{username}:</div>
+          <aside class="quote no-group" data-username="#{username}">
+            <div class="title">
+              <div class="quote-controls"></div>
+              #{user ? user_avatar(user) : username}:
+            </div>
             <blockquote>#{quote}</blockquote>
           </aside>
         HTML
@@ -802,6 +809,11 @@ class BulkImport::Base
     cooked.scrub.strip
   end
 
+  def user_avatar(user)
+    url = user.avatar_template.gsub("{size}", "45")
+    "<img alt=\"\" width=\"20\" height=\"20\" src=\"#{url}\" class=\"avatar\"> #{user.username}"
+  end
+
   def pre_fancy(title)
     Redcarpet::Render::SmartyPants.render(ERB::Util.html_escape(title)).scrub.strip
   end
diff --git a/script/bulk_import/vanilla.rb b/script/bulk_import/vanilla.rb
index 71307db..92ddf3f 100644
--- a/script/bulk_import/vanilla.rb
+++ b/script/bulk_import/vanilla.rb
@@ -188,7 +188,7 @@ class BulkImport::Vanilla < BulkImport::Base
     now = Time.zone.now
 
     create_user_stats(users) do |row|
-      next unless @users[row['UserID'].to_s] # shouldn't need this but it can be NULL :<
+      next unless @users[row['UserID'].to_i] # shouldn't need this but it can be NULL :<
 
       {
         imported_id: row['UserID'],
@@ -371,9 +371,8 @@ class BulkImport::Vanilla < BulkImport::Base
 
     # Throw the -1 level categories away since they contain no topics.
     # Use the next level as root categories.
-    root_category_ids = Set.new(categories.select { |c| c["ParentCategoryID"] == -1 }.map { |c| c["CategoryID"] })
 
-    top_level_categories = categories.select { |c| root_category_ids.include?(c["ParentCategoryID"]) }
+    top_level_categories = categories.select { |c| c["ParentCategoryID"].blank? || c['ParentCategoryID'] == -1 }
 
     # Depth = 2
     create_categories(top_level_categories) do |category|
@@ -432,13 +431,13 @@ class BulkImport::Vanilla < BulkImport::Base
   def import_topics
     puts "", "Importing topics..."
 
-    topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce
+    topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce, Format
       FROM #{TABLE_PREFIX}Discussion
       WHERE DiscussionID > #{@last_imported_topic_id}
       ORDER BY DiscussionID ASC"
 
     create_topics(mysql_stream(topics_sql)) do |row|
-      {
+      data = {
         imported_id: row["DiscussionID"],
         title: normalize_text(row["Name"]),
         category_id: category_id_from_imported_id(row["CategoryID"]) ||
@@ -447,18 +446,20 @@ class BulkImport::Vanilla < BulkImport::Base
         created_at: Time.zone.at(row['DateInserted']),
         pinned_at: row['Announce'] == 0 ? nil : Time.zone.at(row['DateInserted'])
       }
+      (data[:user_id].present? && data[:title].present?) ? data : false
     end
 
     puts "", "importing first posts..."
 
     create_posts(mysql_stream(topics_sql)) do |row|
-      {
+      data = {
         imported_id: "d-" + row['DiscussionID'].to_s,
-        topic_id: topic_id_from_imported_id(row["DiscussionID"]),
+        topic_id: topic_id_from_imported_id(row['DiscussionID']),
         user_id: user_id_from_imported_id(row["InsertUserID"]),
         created_at: Time.zone.at(row['DateInserted']),
-        raw: clean_up(row["Body"])
+        raw: clean_up(row['Body'], row['Format'])
       }
+      data[:topic_id].present? ? data : false
     end
 
     puts '', 'converting deep categories to tags...'
@@ -477,7 +478,7 @@ class BulkImport::Vanilla < BulkImport::Base
     puts "", "Importing posts..."
 
     posts = mysql_stream(
-      "SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID
+      "SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID, Format
          FROM #{TABLE_PREFIX}Comment
          WHERE CommentID > #{@last_imported_post_id}
          ORDER BY CommentID ASC")
@@ -489,9 +490,9 @@ class BulkImport::Vanilla < BulkImport::Base
       {
         imported_id: row['CommentID'],
         topic_id: topic_id,
-        user_id: user_id_from_imported_id(row["InsertUserID"]),
+        user_id: user_id_from_imported_id(row['InsertUserID']),
         created_at: Time.zone.at(row['DateInserted']),
-        raw: clean_up(row["Body"])
+        raw: clean_up(row['Body'], row['Format'])
       }
     end
   end
@@ -572,7 +573,7 @@ class BulkImport::Vanilla < BulkImport::Base
     puts "", "importing private replies..."
 
     private_posts_sql = "
-      SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted
+      SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted, Format
         FROM GDN_ConversationMessage
        WHERE ConversationID > #{@last_imported_private_topic_id - PRIVATE_OFFSET}
        ORDER BY ConversationID ASC, MessageID ASC"
@@ -585,7 +586,7 @@ class BulkImport::Vanilla < BulkImport::Base
         topic_id: topic_id,
         user_id: user_id_from_imported_id(row['InsertUserID']),
         created_at: Time.zone.at(row['DateInserted']),
-        raw: clean_up(row['Body'])
+        raw: clean_up(row['Body'], row['Format'])
       }
     end
   end
@@ -650,13 +651,48 @@ class BulkImport::Vanilla < BulkImport::Base
     end
   end
 

[... diff too long, it was truncated ...]

GitHub sha: 93ff54e1

1 Like

This commit appears in #10212 which was approved by eviltrout. It was merged by SamSaffron.