FEATURE: Import script for AnswerBase

FEATURE: Import script for AnswerBase

Improves the generic database used by some import scripts:

  • Adds additional columns for users
  • Adds support for attachments
  • Allows setting the data type for keys (numeric or string) to ensure correct sorting
diff --git a/Gemfile b/Gemfile
index 4030e93..54797b5 100644
--- a/Gemfile
+++ b/Gemfile
@@ -202,10 +202,11 @@ gem 'rchardet', require: false
 if ENV["IMPORT"] == "1"
   gem 'mysql2'
   gem 'redcarpet'
-  gem 'sqlite3', '~> 1.3.13'
+  gem 'sqlite3', '~> 1.3', '>= 1.3.13'
   gem 'ruby-bbcode-to-md', git: 'https://github.com/nlalonde/ruby-bbcode-to-md'
   gem 'reverse_markdown'
   gem 'tiny_tds'
+  gem 'csv', '~> 3.0'
 end
 
 gem 'webpush', require: false
diff --git a/script/import_scripts/answerbase.rb b/script/import_scripts/answerbase.rb
new file mode 100644
index 0000000..88fa191
--- /dev/null
+++ b/script/import_scripts/answerbase.rb
@@ -0,0 +1,341 @@
+require 'csv'
+require 'reverse_markdown'
+require_relative 'base'
+require_relative 'base/generic_database'
+
+# Call it like this:
+#   RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
+class ImportScripts::Answerbase < ImportScripts::Base
+  OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
+  NEW_DOMAIN = "https://discourse.example.com"
+  AVATAR_DIRECTORY = "User Images"
+  ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
+  ANSWER_IMAGE_DIRECTORY = "Answer Images"
+  QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
+  QUESTION_IMAGE_DIRECTORY = "Question Images"
+  EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i
+  QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i
+  TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'
+  BATCH_SIZE = 1000
+
+  def initialize(path)
+    super()
+
+    @path = path
+    @db = ImportScripts::GenericDatabase.new(
+      @path,
+      batch_size: BATCH_SIZE,
+      recreate: true,
+      numeric_keys: true
+    )
+  end
+
+  # Runs the whole import: first loads the CSV files into the intermediate database,
+  # then adds the permalink normalizations and imports categories, users, topics
+  # and posts -- in that order.
+  def execute
+    read_csv_files
+
+    add_permalink_normalizations
+    import_categories
+    import_users
+    import_topics
+    import_posts
+  end
+
+  # Reads the "categories", "users" and "questions-answers-comments" CSV files via
+  # csv_parse and stores their rows in the intermediate database (@db).
+  def read_csv_files
+    puts "", "reading CSV files..."
+
+    # Categories are numbered in the order they appear in the file, starting at 1.
+    category_position = 0
+    csv_parse("categories") do |row|
+      @db.insert_category(
+        id: row[:id],
+        name: row[:name],
+        position: category_position += 1
+      )
+    end
+
+    # Every user is stored as active.
+    csv_parse("users") do |row|
+      @db.insert_user(
+        id: row[:id],
+        email: row[:email],
+        username: row[:username],
+        bio: row[:description],
+        avatar_path: row[:profile_image],
+        created_at: parse_date(row[:createtime]),
+        active: true
+      )
+    end
+
+    # Rows that are not questions are attached to the most recently read question.
+    # NOTE(review): this presumably relies on the file listing each question before
+    # its answers and comments -- confirm against the export format.
+    last_topic_id = nil
+    csv_parse("questions-answers-comments") do |row|
+      # Unpublished rows are not imported.
+      next if row[:published] == "No"
+      user_id = @db.get_user_id(row[:username])
+      created_at = parse_datetime(row[:createtime])
+
+      begin
+        if row[:type] == "Question"
+          attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
+            parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)
+
+          @db.insert_topic(
+            id: row[:id],
+            title: row[:title],
+            raw: row[:text],
+            category_id: row[:categorylist],
+            user_id: user_id,
+            created_at: created_at,
+            attachments: attachments
+          )
+          last_topic_id = row[:id]
+        else
+          attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
+            parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)
+
+          @db.insert_post(
+            id: row[:id],
+            raw: row[:text],
+            topic_id: last_topic_id,
+            user_id: user_id,
+            created_at: created_at,
+            attachments: attachments
+          )
+        end
+      rescue
+        # Print the offending row to make debugging easier, then re-raise the error.
+        p row
+        raise
+      end
+    end
+  end
+
+  def parse_filenames(text, directory)
+    return [] if text.blank?
+
+    text
+      .split(';')
+      .map { |filename| File.join(@path, directory, filename.strip) }
+  end
+
+  def parse_date(text)
+    return nil if text.blank?
+    DateTime.strptime(text, "%m/%d/%y")
+  end
+
+  def parse_datetime(text)
+    return nil if text.blank?
+    # DateTime.strptime(text, "%m/%d/%Y %H:%M")
+    DateTime.parse(text).utc.to_datetime
+  end
+
+  def import_categories
+    puts "", "creating categories"
+    rows = @db.fetch_categories
+
+    create_categories(rows) do |row|
+      {
+        id: row['id'],
+        name: row['name'],
+        description: row['description'],
+        position: row['position']
+      }
+    end
+  end
+
+  # Delegates to the superclass implementation, always using BATCH_SIZE as batch size.
+  def batches
+    super(BATCH_SIZE)
+  end
+
+  def import_users
+    puts "", "creating users"
+    total_count = @db.count_users
+    last_id = 0
+
+    batches do |offset|
+      rows, last_id = @db.fetch_users(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:users, rows.map { |row| row['id'] })
+
+      create_users(rows, total: total_count, offset: offset) do |row|
+        {
+          id: row['id'],
+          email: row['email'],
+          username: row['username'],
+          bio_raw: row['bio'],
+          created_at: row['created_at'],
+          active: row['active'] == 1,
+          post_create_action: proc do |user|
+            create_avatar(user, row['avatar_path'])
+          end
+        }
+      end
+    end
+  end
+
+  def create_avatar(user, avatar_path)
+    return if avatar_path.blank?
+    avatar_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)
+
+    if File.exist?(avatar_path)
+      @uploader.create_avatar(user, avatar_path)
+    else
+      STDERR.puts "Could not find avatar: #{avatar_path}"
+    end
+  end
+
+  def import_topics
+    puts "", "creating topics"
+    total_count = @db.count_topics
+    last_id = 0
+
+    batches do |offset|
+      rows, last_id = @db.fetch_topics(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row['id'] })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        attachments = @db.fetch_topic_attachments(row['id']) if row['upload_count'] > 0
+        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
+
+        {
+          id: row['id'],
+          title: row['title'],
+          raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id),
+          category: category_id_from_imported_category_id(row['category_id']),
+          user_id: user_id,
+          created_at: row['created_at'],
+          closed: row['closed'] == 1,
+          post_create_action: proc do |post|
+            url = "q#{row['id']}"
+            Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
+          end
+        }
+      end
+    end
+  end
+
+  def import_posts
+    puts "", "creating posts"
+    total_count = @db.count_posts
+    last_row_id = 0
+
+    batches do |offset|
+      rows, last_row_id = @db.fetch_posts(last_row_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row['id'] })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        topic = topic_lookup_from_imported_post_id(row['topic_id'])
+        attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
+        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
+
+        {
+          id: row['id'],
+          raw: raw_with_attachments(row['raw'], attachments, user_id),
+          user_id: user_id,
+          topic_id: topic[:topic_id],
+          created_at: row['created_at']
+        }
+      end
+    end
+  end
+
+  def raw_with_attachments(raw, attachments, user_id)
+    raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id)
+    raw = replace_question_links(raw)
+    raw = ReverseMarkdown.convert(raw) || ""
+
+    attachments&.each do |attachment|
+      path = attachment['path']
+      next if embedded_paths.include?(path)
+
+      if File.exist?(path)

[... diff too long, it was truncated ...]

GitHub sha: c36c9c2e

3 Likes