DEV: Improvements to Drupal script (#10016)

DEV: Improvements to Drupal script (#10016)

Refactors the script to follow the conventions of the other importers and adds several features, including importing likes, processing the raw text of posts, and, if needed, importing SSO records.

diff --git a/script/import_scripts/drupal.rb b/script/import_scripts/drupal.rb
index c97c2e6..a10615f 100644
--- a/script/import_scripts/drupal.rb
+++ b/script/import_scripts/drupal.rb
@@ -1,16 +1,21 @@
 # frozen_string_literal: true
 
 require "mysql2"
+require "htmlentities"
 require File.expand_path(File.dirname(__FILE__) + "/base.rb")
 
 class ImportScripts::Drupal < ImportScripts::Base
 
-  DRUPAL_DB = ENV['DRUPAL_DB'] || "newsite3"
+  DRUPAL_DB = ENV['DRUPAL_DB'] || "drupal"
   VID = ENV['DRUPAL_VID'] || 1
+  BATCH_SIZE = 1000
+  ATTACHMENT_DIR = "/root/files/upload"
 
   def initialize
     super
 
+    @htmlentities = HTMLEntities.new
+
     @client = Mysql2::Client.new(
       host: "localhost",
       username: "root",
@@ -19,142 +24,210 @@ class ImportScripts::Drupal < ImportScripts::Base
     )
   end
 
-  def categories_query
-    @client.query("SELECT tid, name, description FROM taxonomy_term_data WHERE vid = #{VID}")
-  end
-
   def execute
-    create_users(@client.query("SELECT uid id, name, mail email, created FROM users;")) do |row|
-      { id: row['id'], username: row['name'], email: row['email'], created_at: Time.zone.at(row['created']) }
-    end
 
-    # You'll need to edit the following query for your Drupal install:
-    #
-    #   * Drupal allows duplicate category names, so you may need to exclude some categories or rename them here.
-    #   * Table name may be term_data.
-    #   * May need to select a vid other than 1.
-    create_categories(categories_query) do |c|
-      { id: c['tid'], name: c['name'], description: c['description'] }
-    end
+    import_users
+    import_categories
 
     # "Nodes" in Drupal are divided into types. Here we import two types,
     # and will later import all the comments/replies for each node.
     # You will need to figure out what the type names are on your install and edit the queries to match.
     if ENV['DRUPAL_IMPORT_BLOG']
-      create_blog_topics
+      import_blog_topics
     end
 
-    create_forum_topics
+    import_forum_topics
+
+    import_replies
+    import_likes
+    mark_topics_as_solved
+    import_sso_records
+    import_attachments
+    postprocess_posts
+    create_permalinks
+    import_gravatars
+  end
+
+  def import_users
+    puts "", "importing users"
+
+    user_count = mysql_query("SELECT count(uid) count FROM users").first["count"]
+
+    last_user_id = -1
+
+    batches(BATCH_SIZE) do |offset|
+      users = mysql_query(<<-SQL
+          SELECT uid,
+                 name username,
+                 mail email,
+                 created
+            FROM users
+           WHERE uid > #{last_user_id}
+        ORDER BY uid
+           LIMIT #{BATCH_SIZE}
+      SQL
+      ).to_a
 
-    create_replies
+      break if users.empty?
 
-    begin
-      create_admin(email: 'neil.lalonde@discourse.org', username: UserNameSuggester.suggest('neil'))
-    rescue => e
-      puts '', "Failed to create admin user"
-      puts e.message
+      last_user_id = users[-1]["uid"]
+
+      users.reject! { |u| @lookup.user_already_imported?(u["uid"]) }
+
+      create_users(users, total: user_count, offset: offset) do |user|
+        email = user["email"].presence || fake_email
+        email = fake_email unless email[EmailValidator.email_regex]
+
+        username = @htmlentities.decode(user["username"]).strip
+
+        {
+          id: user["uid"],
+          name: username,
+          email: email,
+          created_at: Time.zone.at(user["created"])
+        }
+      end
     end
   end
 
-  def create_blog_topics
-    puts '', "creating blog topics"
+  def import_categories
+    # You'll need to edit the following query for your Drupal install:
+    #
+    #   * Drupal allows duplicate category names, so you may need to exclude some categories or rename them here.
+    #   * Table name may be term_data.
+    #   * May need to select a vid other than 1
 
-    create_category({
-      name: 'Blog',
-      user_id: -1,
-      description: "Articles from the blog"
-    }, nil) unless Category.find_by_name('Blog')
+    puts "", "importing categories"
 
-    results = @client.query("
+    categories = mysql_query(<<-SQL
+        SELECT tid,
+               name,
+               description
+          FROM taxonomy_term_data
+         WHERE vid = #{VID}
+    SQL
+    ).to_a
+
+    create_categories(categories) do |category|
+      {
+        id: category['tid'],
+        name: @htmlentities.decode(category['name']).strip,
+        description: @htmlentities.decode(category['description']).strip
+      }
+    end
+  end
+
+  def import_blog_topics
+    puts '', "importing blog topics"
+
+    create_category(
+      {
+        name: 'Blog',
+        description: "Articles from the blog"
+      },
+    nil) unless Category.find_by_name('Blog')
+
+    blogs = mysql_query(<<-SQL
       SELECT n.nid nid, n.title title, n.uid uid, n.created created, n.sticky sticky,
              f.body_value body
         FROM node n,
              field_data_body f
-       WHERE n.type = 'blog'
+       WHERE n.type = 'article'
          AND n.nid = f.entity_id
          AND n.status = 1
-    ", cache_rows: false)
+    SQL
+    ).to_a
+
+    category_id = Category.find_by_name('Blog').id
 
-    create_posts(results) do |row|
+    create_posts(blogs) do |topic|
       {
-        id: "nid:#{row['nid']}",
-        user_id: user_id_from_imported_user_id(row['uid']) || -1,
-        category: 'Blog',
-        raw: row['body'],
-        created_at: Time.zone.at(row['created']),
-        pinned_at: row['sticky'].to_i == 1 ? Time.zone.at(row['created']) : nil,
-        title: row['title'].try(:strip),
-        custom_fields: { import_id: "nid:#{row['nid']}" }
+        id: "nid:#{topic['nid']}",
+        user_id: user_id_from_imported_user_id(topic['uid']) || -1,
+        category: category_id,
+        raw: topic['body'],
+        created_at: Time.zone.at(topic['created']),
+        pinned_at: topic['sticky'].to_i == 1 ? Time.zone.at(topic['created']) : nil,
+        title: topic['title'].try(:strip),
+        custom_fields: { import_id: "nid:#{topic['nid']}" }
       }
     end
   end
 
-  def create_forum_topics
-    puts '', "creating forum topics"
+  def import_forum_topics
+    puts '', "importing forum topics"
 
-    total_count = @client.query("
+    total_count = mysql_query(<<-SQL
         SELECT COUNT(*) count
           FROM forum_index fi, node n
          WHERE n.type = 'forum'
            AND fi.nid = n.nid
-           AND n.status = 1;").first['count']
-
-    batch_size = 1000
+           AND n.status = 1
+    SQL
+    ).first['count']
 
-    batches(batch_size) do |offset|
-      results = @client.query("
+    batches(BATCH_SIZE) do |offset|
+      results = mysql_query(<<-SQL
         SELECT fi.nid nid,
                fi.title title,
                fi.tid tid,
                n.uid uid,
                fi.created created,
                fi.sticky sticky,
-               f.body_value body
-          FROM forum_index fi,
-               node n,
-               field_data_body f
+               f.body_value body,
+	       nc.totalcount views,
+	       fl.timestamp solved
+          FROM forum_index fi
+	 LEFT JOIN node n ON fi.nid = n.nid
+	 LEFT JOIN field_data_body f ON f.entity_id = n.nid
+	 LEFT JOIN flagging fl ON fl.entity_id = n.nid
+	     AND fl.fid = 7
+	 LEFT JOIN node_counter nc ON nc.nid = n.nid
          WHERE n.type = 'forum'
-           AND fi.nid = n.nid
-           AND n.nid = f.entity_id
            AND n.status = 1
-         LIMIT #{batch_size}
+         LIMIT #{BATCH_SIZE}
         OFFSET #{offset};
-      ", cache_rows: false)
+      SQL
+      ).to_a
 
       break if results.size < 1
 
       next if all_records_exist? :posts, results.map { |p| "nid:#{p['nid']}" }
 
       create_posts(results, total: total_count, offset: offset) do |row|
-        {
+        raw = preprocess_raw(row['body'])
+        topic = {
           id: "nid:#{row['nid']}",

[... diff too long, it was truncated ...]

GitHub sha: be28fc73

1 Like

This commit appears in #10016 which was merged by justindirose.