FIX: Google Groups crawler failed to log in

FIX: Google Groups crawler failed to log in

Automating the login to a Google account is quite hard. This commit makes the crawler use the cookies from a cookies.txt file instead of logging in with an email address and password. It also removes a couple of deprecation warnings and adds some color to the output.

diff --git a/script/import_scripts/google_groups.rb b/script/import_scripts/google_groups.rb
index 64c16a4..15f152a 100755
--- a/script/import_scripts/google_groups.rb
+++ b/script/import_scripts/google_groups.rb
@@ -6,22 +6,22 @@ require "bundler/inline"
 gemfile(true) do
   source "https://rubygems.org"
 
-  gem "nokogiri"
   gem "webdrivers"
+  gem "colored2"
 end
 
 require "fileutils"
-require "nokogiri"
 require "optparse"
-require "webdrivers"
 require "set"
 require "yaml"
 
 DEFAULT_OUTPUT_PATH = "/shared/import/data"
+DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"
 
 def driver
   @driver ||= begin
-    chrome_args = ["headless", "disable-gpu"]
+    chrome_args = ["disable-gpu"]
+    chrome_args << "headless" unless ENV["NOT_HEADLESS"] == '1'
     chrome_args << "no-sandbox" if inside_container?
     options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
     Selenium::WebDriver.for(:chrome, options: options)
@@ -63,7 +63,7 @@ def find(css, parent_element = driver)
   begin
     retries ||= 0
     parent_element.find_element(css: css)
-  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotVisibleError
+  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
     sleep retries
     retry if (retries += 1) < MAX_FIND_RETRIES
   end
@@ -83,7 +83,7 @@ end
 
 def crawl_topic(url)
   if @scraped_topic_urls.include?(url)
-    puts "Skipping #{url}"
+    puts "Skipping".green << " #{url}"
     return
   end
 
@@ -99,7 +99,7 @@ def crawl_topic(url)
 
   @scraped_topic_urls << url
 rescue
-  puts "Failed to scrape topic at #{url}"
+  puts "Failed to scrape topic at #{url}".red
   raise
 end
 
@@ -113,7 +113,7 @@ def crawl_message(url, might_be_deleted)
     @first_message_checked = true
 
     if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
-      exit_with_error(<<~MSG)
+      exit_with_error(<<~MSG.red.bold)
         It looks like you do not have permissions to see email addresses. Aborting.
         Use the --force option to import anyway.
       MSG
@@ -125,67 +125,44 @@ rescue Selenium::WebDriver::Error::NoSuchElementError
   raise unless might_be_deleted
   puts "Message might be deleted. Skipping #{url}"
 rescue
-  puts "Failed to scrape message at #{url}"
+  puts "Failed to scrape message at #{url}".red
   raise
 end
 
 def login
   puts "Logging in..."
-  get("https://www.google.com/accounts/Login")
-
-  sleep(1)
-  email_element = wait_for_element("input[type='email']")
-  exit_with_error("Failed to detect 'email' input on login page") if !email_element
-
-  driver.action.move_to(email_element)
-  email_element.send_keys(@email)
-  email_element.send_keys("\n")
-
-  sleep(1)
-  password_element = wait_for_element("input[type='password']")
-  exit_with_error("Failed to detect 'password' input on login page") if !password_element
-
-  driver.action.move_to(password_element)
-  password_element.send_keys(@password)
-  password_element.send_keys("\n")
-
-  sleep(1)
-
-  if driver.current_url.include?("challenge")
-    puts "", "2-Step Verification is required."
-    puts "Unlock on your phone and press Enter"
-    puts "or enter the code from your authenticator app"
-    puts "or enter the code you received via SMS (without the G- prefix)"
-
-    print "Enter code: "
-
-    code = gets.chomp
-
-    if code.empty?
-      # Verification via phone?
-      begin
-        wait_for_url { |url| !url.include?("challenge") }
-      rescue Selenium::WebDriver::Error::TimeOutError
-        exit_with_error("Failed to login. Did you tap 'Yes' on your phone to allow the login?")
-      end
-    else
-      code_element = wait_for_element("input[type='tel']")
-      exit_with_error("Failed to detect 'code' input on login page") if !code_element
-
-      code_element.send_keys(code)
-      code_element.send_keys("\n")
-
-      begin
-        wait_for_url { |url| !url.include?("challenge") }
-      rescue Selenium::WebDriver::Error::TimeOutError
-        exit_with_error("Failed to login. Wrong code?")
-      end
-    end
+  get("https://google.com/404")
+
+  add_cookies(
+    "accounts.google.com",
+    "myaccount.google.com",
+    "google.com"
+  )
+
+  get("https://accounts.google.com/servicelogin")
+
+  begin
+    wait_for_url { |url| url.start_with?("https://myaccount.google.com") }
+  rescue Selenium::WebDriver::Error::TimeoutError
+    exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)
   end
+end
 
-  sleep(1)
-  user_element = wait_for_element("a[aria-label*='#{@email}']")
-  exit_with_error("Failed to login") if !user_element
+def add_cookies(*domains)
+  File.readlines(@cookies).each do |line|
+    parts = line.chomp.split("\t")
+    next if parts.size != 7 || !domains.any? { |domain| parts[0] =~ /^\.?#{Regexp.escape(domain)}$/ }
+
+    driver.manage.add_cookie(
+      domain: parts[0],
+      httpOnly: "true".casecmp?(parts[1]),
+      path: parts[2],
+      secure: "true".casecmp?(parts[3]),
+      expires: parts[4] == "0" ? nil : DateTime.strptime(parts[4], "%s"),
+      name: parts[5],
+      value: parts[6]
+    )
+  end
 end
 
 def wait_for_url
@@ -193,14 +170,6 @@ def wait_for_url
   wait.until { yield(driver.current_url) }
 end
 
-def wait_for_element(css)
-  wait = Selenium::WebDriver::Wait.new(timeout: 5)
-  wait.until { driver.find_element(css: css).displayed? }
-  find(css)
-rescue Selenium::WebDriver::Error::TimeOutError
-  nil
-end
-
 def exit_with_error(*messages)
   STDERR.puts messages
   exit 1
@@ -231,9 +200,8 @@ def parse_arguments
   parser = OptionParser.new do |opts|
     opts.banner = "Usage: google_groups.rb [options]"
 
-    opts.on("-e", "--email EMAIL", "email address of group admin or manager") { |v| @email = v }
-    opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
     opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
+    opts.on("-c", "--cookies PATH", "path to cookies.txt") { |v| @cookies = v }
     opts.on("--path PATH", "output path for emails") { |v| @path = v }
     opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
     opts.on("-h", "--help") do
@@ -248,14 +216,15 @@ def parse_arguments
     exit_with_error(e.message, "", parser)
   end
 
-  mandatory = [:email, :password, :groupname]
+  @cookies = DEFAULT_COOKIES_TXT if @cookies.nil? && File.exist?(DEFAULT_COOKIES_TXT)
+  @path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
+
+  mandatory = [:groupname, :cookies]
   missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }
 
-  if missing.any?
-    exit_with_error("Missing arguments: #{missing.join(', ')}", "", parser)
-  end
+  exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?
+  exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") if !File.exist?(@cookies)
 
-  @path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
   FileUtils.mkpath(@path)
 end

GitHub sha: ab96239f

1 Like

This commit has been mentioned on Discourse Meta. There might be relevant details there: