DEV: Bulk imports should find existing users by email (#14468)

Without this change, bulk imports unconditionally create new user records even when a user with the same email address already exists, resulting in duplicate accounts.
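
The fix keys the preloaded emails by address and resolves each incoming user against that map before allocating a new id. Below is a minimal runnable sketch of the lookup idea; the real logic lives in BulkImport::Base#process_user, and the class and method names here are purely illustrative, not the importer's API:

require "set"

# Illustrative sketch of the email-keyed lookup this commit introduces.
class EmailLookupSketch
  def initialize(email_to_user_id)
    @emails = email_to_user_id          # { "alice@example.com" => 42, ... }
    @pre_existing_user_ids = Set.new    # accounts we must not re-create
    @users = {}                         # imported_id => local user id
  end

  def resolve(imported_id, email)
    return nil if email.nil?
    email = email.downcase

    if (existing_user_id = @emails[email])
      # Map the imported id onto the existing account and remember it so
      # dependent records (emails, stats, profiles) can be skipped later.
      @pre_existing_user_ids << existing_user_id
      @users[imported_id.to_i] = existing_user_id
      return existing_user_id
    end
    nil
  end
end

lookup = EmailLookupSketch.new("alice@example.com" => 42)
lookup.resolve("1001", "Alice@Example.com") # => 42, no new record created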

diff --git a/script/bulk_import/base.rb b/script/bulk_import/base.rb
index a04c7de..3b0efbc 100644
--- a/script/bulk_import/base.rb
+++ b/script/bulk_import/base.rb
@@ -153,6 +153,7 @@ class BulkImport::Base
     puts "Loading imported user ids..."
     @users, imported_user_ids = imported_ids("user")
     @last_imported_user_id = imported_user_ids.max || -1
+    @pre_existing_user_ids = Set.new
 
     puts "Loading imported category ids..."
     @categories, imported_category_ids = imported_ids("category")
@@ -197,7 +198,7 @@ class BulkImport::Base
     puts "Loading users indexes..."
     @last_user_id = last_id(User)
     @last_user_email_id = last_id(UserEmail)
-    @emails = User.unscoped.joins(:user_emails).pluck(:"user_emails.email").to_set
+    @emails = User.unscoped.joins(:user_emails).pluck(:"user_emails.email", :"user_emails.user_id").to_h
     @usernames_lower = User.unscoped.pluck(:username_lower).to_set
     @mapped_usernames = UserCustomField.joins(:user).where(name: "import_username").pluck("user_custom_fields.value", "users.username").to_h
 
@@ -393,6 +394,17 @@ class BulkImport::Base
   end
 
   def process_user(user)
+    if user[:email].present?
+      user[:email].downcase!
+
+      if existing_user_id = @emails[user[:email]]
+        @pre_existing_user_ids << existing_user_id
+        @users[user[:imported_id].to_i] = existing_user_id
+        user[:skip] = true
+        return user
+      end
+    end
+
     @users[user[:imported_id].to_i] = user[:id] = @last_user_id += 1
 
     imported_username = user[:username].dup
@@ -412,11 +424,6 @@ class BulkImport::Base
     end
 
     user[:username_lower] = user[:username].downcase
-    user[:email] ||= random_email
-    user[:email].downcase!
-
-    # unique email
-    user[:email] = random_email until user[:email] =~ EmailValidator.email_regex && @emails.add?(user[:email])
     user[:trust_level] ||= TrustLevel[1]
     user[:active] = true unless user.has_key?(:active)
     user[:admin] ||= false
@@ -428,18 +435,28 @@ class BulkImport::Base
   end
 
   def process_user_email(user_email)
+    user_id = @users[user_email[:imported_user_id].to_i]
+    return { skip: true } if @pre_existing_user_ids.include?(user_id)
+
     user_email[:id] = @last_user_email_id += 1
-    user_email[:user_id] = @users[user_email[:imported_user_id].to_i]
+    user_email[:user_id] = user_id
     user_email[:primary] = true
     user_email[:created_at] ||= NOW
     user_email[:updated_at] ||= user_email[:created_at]
+
     user_email[:email] ||= random_email
     user_email[:email].downcase!
+    # unique email
+    user_email[:email] = random_email until user_email[:email] =~ EmailValidator.email_regex && !@emails.has_key?(user_email[:email])
+
     user_email
   end
 
   def process_user_stat(user_stat)
-    user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i]
+    user_id = @users[user_stat[:imported_user_id].to_i]
+    return { skip: true } if @pre_existing_user_ids.include?(user_id)
+
+    user_stat[:user_id] = user_id
     user_stat[:topics_entered] ||= 0
     user_stat[:time_read] ||= 0
     user_stat[:days_visited] ||= 0
@@ -455,6 +472,8 @@ class BulkImport::Base
   end
 
   def process_user_profile(user_profile)
+    return { skip: true } if @pre_existing_user_ids.include?(user_profile[:user_id])
+
     user_profile[:bio_raw] = (user_profile[:bio_raw].presence || "").scrub.strip.presence
     user_profile[:bio_cooked] = pre_cook(user_profile[:bio_raw]) if user_profile[:bio_raw].present?
     user_profile[:views] ||= 0
@@ -697,7 +716,7 @@ class BulkImport::Base
           processed = send(process_method_name, mapped)
           imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
           imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
-          @raw_connection.put_copy_data columns.map { |c| processed[c] }
+          @raw_connection.put_copy_data columns.map { |c| processed[c] } unless processed[:skip]
           print "\r%7d - %6d/sec" % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
         rescue => e
           puts "\n"
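
One behavioral shift worth noting in the hunks above: the "regenerate until valid and unique" email loop moves out of process_user and into process_user_email, and uniqueness is now checked against the @emails hash instead of being added to a set. A condensed, self-contained sketch of that retry shape, where EMAIL_REGEX, random_email, and known_emails are stand-ins for EmailValidator.email_regex, the importer's random_email helper, and @emails:

require "securerandom"

EMAIL_REGEX = /\A[^@\s]+@[^@\s]+\z/ # stand-in for EmailValidator.email_regex

def random_email
  "#{SecureRandom.hex(8)}@email.invalid" # stand-in generator
end

known_emails = { "taken@example.com" => 42 } # @emails, keyed by address
email = "taken@example.com"
# Keep regenerating until the address is well-formed and unclaimed.
email = random_email until email =~ EMAIL_REGEX && !known_emails.key?(email)
puts email # a fresh random address that collides with nothing
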
diff --git a/script/bulk_import/phpbb_postgresql.rb b/script/bulk_import/phpbb_postgresql.rb
index d7d7205..cd5fa62 100644
--- a/script/bulk_import/phpbb_postgresql.rb
+++ b/script/bulk_import/phpbb_postgresql.rb
@@ -83,6 +83,7 @@ class BulkImport::PhpBB < BulkImport::Base
       u = {
         imported_id: row["user_id"],
         username: normalize_text(row["username"]),
+        email: row["user_email"],
         created_at: Time.zone.at(row["user_regdate"].to_i),
         last_seen_at: row["user_lastvisit"] == 0 ? Time.zone.at(row["user_regdate"].to_i) : Time.zone.at(row["user_lastvisit"].to_i),
         trust_level: row["user_posts"] == 0 ? TrustLevel[0] : TrustLevel[1],
diff --git a/script/bulk_import/vbulletin.rb b/script/bulk_import/vbulletin.rb
index c251ef0..8a57522 100644
--- a/script/bulk_import/vbulletin.rb
+++ b/script/bulk_import/vbulletin.rb
@@ -118,6 +118,7 @@ class BulkImport::VBulletin < BulkImport::Base
         imported_id: row[0],
         username: normalize_text(row[1]),
         name: normalize_text(row[1]),
+        email: row[2],
         created_at: Time.zone.at(row[3]),
         date_of_birth: parse_birthday(row[4]),
         primary_group_id: group_id_from_imported_id(row[6]),
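
Downstream, the skip flag threads through the COPY pipeline: each process_* method can return { skip: true } for a row mapped to a pre-existing user, and the write loop in the last base.rb hunk only calls put_copy_data for unflagged rows. A rough, runnable sketch of that control flow under stand-in names (process_user_stat and copy_rows here only mimic the shapes above, not the importer's API):

require "set"

PRE_EXISTING_USER_IDS = Set.new([42]) # ids resolved to existing accounts

# Rows mapped to a pre-existing user are flagged instead of populated.
def process_user_stat(row, users)
  user_id = users[row[:imported_user_id]]
  return { skip: true } if PRE_EXISTING_USER_IDS.include?(user_id)
  row.merge(user_id: user_id)
end

# Flagged rows are never handed to the writer, so no duplicate record
# reaches Postgres (puts stands in for put_copy_data).
def copy_rows(rows, users, columns)
  rows.each do |row|
    processed = process_user_stat(row, users)
    next if processed[:skip]
    puts columns.map { |c| processed[c] }.join("\t")
  end
end

users = { 1001 => 42, 1002 => 7 } # imported_id => local id
copy_rows([{ imported_user_id: 1001 }, { imported_user_id: 1002 }],
          users, [:user_id]) # prints only the row for user 7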

GitHub sha: a4d0d866aaf64052ee0a1dc3f1f83e0aa5a393fc

This commit appears in #14468 which was approved by ZogStriP. It was merged by gschlager.